author	Felix Blyakher <felixb@sgi.com>	2009-06-10 18:07:47 -0400
committer	Felix Blyakher <felixb@sgi.com>	2009-06-10 18:07:47 -0400
commit	4e73e0eb633f8a1b5cbf20e7f42c6dbfec1d1ca7 (patch)
tree	0cea46e43f0625244c3d06a71d6559e5ec5419ca /fs
parent	4156e735d3abde8e9243b5d22f7999dd3fffab2e (diff)
parent	07a2039b8eb0af4ff464efd3dfd95de5c02648c6 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_super.c	12
-rw-r--r--	fs/Kconfig	34
-rw-r--r--	fs/Makefile	6
-rw-r--r--	fs/adfs/super.c	16
-rw-r--r--	fs/affs/super.c	7
-rw-r--r--	fs/afs/Kconfig	8
-rw-r--r--	fs/afs/Makefile	3
-rw-r--r--	fs/afs/cache.c	503
-rw-r--r--	fs/afs/cache.h	15
-rw-r--r--	fs/afs/cell.c	16
-rw-r--r--	fs/afs/file.c	220
-rw-r--r--	fs/afs/inode.c	31
-rw-r--r--	fs/afs/internal.h	53
-rw-r--r--	fs/afs/main.c	27
-rw-r--r--	fs/afs/mntpt.c	4
-rw-r--r--	fs/afs/netdevices.c	3
-rw-r--r--	fs/afs/super.c	7
-rw-r--r--	fs/afs/vlocation.c	25
-rw-r--r--	fs/afs/volume.c	14
-rw-r--r--	fs/afs/write.c	21
-rw-r--r--	fs/autofs/dirhash.c	34
-rw-r--r--	fs/autofs4/autofs_i.h	2
-rw-r--r--	fs/autofs4/dev-ioctl.c	41
-rw-r--r--	fs/autofs4/expire.c	31
-rw-r--r--	fs/autofs4/root.c	41
-rw-r--r--	fs/autofs4/waitq.c	22
-rw-r--r--	fs/befs/debug.c	1
-rw-r--r--	fs/befs/linuxvfs.c	3
-rw-r--r--	fs/befs/super.c	1
-rw-r--r--	fs/binfmt_elf.c	22
-rw-r--r--	fs/binfmt_elf_fdpic.c	29
-rw-r--r--	fs/binfmt_flat.c	46
-rw-r--r--	fs/binfmt_som.c	7
-rw-r--r--	fs/bio.c	128
-rw-r--r--	fs/block_dev.c	1
-rw-r--r--	fs/btrfs/Makefile	21
-rw-r--r--	fs/btrfs/acl.c	20
-rw-r--r--	fs/btrfs/async-thread.c	67
-rw-r--r--	fs/btrfs/async-thread.h	2
-rw-r--r--	fs/btrfs/btrfs_inode.h	31
-rw-r--r--	fs/btrfs/ctree.c	948
-rw-r--r--	fs/btrfs/ctree.h	161
-rw-r--r--	fs/btrfs/delayed-ref.c	668
-rw-r--r--	fs/btrfs/delayed-ref.h	193
-rw-r--r--	fs/btrfs/dir-item.c	3
-rw-r--r--	fs/btrfs/disk-io.c	193
-rw-r--r--	fs/btrfs/disk-io.h	1
-rw-r--r--	fs/btrfs/extent-tree.c	2154
-rw-r--r--	fs/btrfs/extent_io.c	234
-rw-r--r--	fs/btrfs/extent_io.h	3
-rw-r--r--	fs/btrfs/extent_map.c	18
-rw-r--r--	fs/btrfs/file-item.c	7
-rw-r--r--	fs/btrfs/file.c	145
-rw-r--r--	fs/btrfs/free-space-cache.c	535
-rw-r--r--	fs/btrfs/free-space-cache.h	44
-rw-r--r--	fs/btrfs/inode-item.c	3
-rw-r--r--	fs/btrfs/inode-map.c	2
-rw-r--r--	fs/btrfs/inode.c	396
-rw-r--r--	fs/btrfs/ioctl.c	64
-rw-r--r--	fs/btrfs/locking.c	25
-rw-r--r--	fs/btrfs/ordered-data.c	120
-rw-r--r--	fs/btrfs/ordered-data.h	4
-rw-r--r--	fs/btrfs/super.c	98
-rw-r--r--	fs/btrfs/transaction.c	164
-rw-r--r--	fs/btrfs/transaction.h	8
-rw-r--r--	fs/btrfs/tree-defrag.c	2
-rw-r--r--	fs/btrfs/tree-log.c	458
-rw-r--r--	fs/btrfs/tree-log.h	17
-rw-r--r--	fs/btrfs/volumes.c	201
-rw-r--r--	fs/btrfs/volumes.h	18
-rw-r--r--	fs/buffer.c	163
-rw-r--r--	fs/cachefiles/Kconfig	39
-rw-r--r--	fs/cachefiles/Makefile	18
-rw-r--r--	fs/cachefiles/bind.c	286
-rw-r--r--	fs/cachefiles/daemon.c	755
-rw-r--r--	fs/cachefiles/interface.c	449
-rw-r--r--	fs/cachefiles/internal.h	360
-rw-r--r--	fs/cachefiles/key.c	159
-rw-r--r--	fs/cachefiles/main.c	106
-rw-r--r--	fs/cachefiles/namei.c	771
-rw-r--r--	fs/cachefiles/proc.c	134
-rw-r--r--	fs/cachefiles/rdwr.c	879
-rw-r--r--	fs/cachefiles/security.c	116
-rw-r--r--	fs/cachefiles/xattr.c	291
-rw-r--r--	fs/cifs/CHANGES	16
-rw-r--r--	fs/cifs/README	10
-rw-r--r--	fs/cifs/cifs_dfs_ref.c	32
-rw-r--r--	fs/cifs/cifs_spnego.c	2
-rw-r--r--	fs/cifs/cifs_unicode.c	198
-rw-r--r--	fs/cifs/cifs_unicode.h	23
-rw-r--r--	fs/cifs/cifsfs.c	54
-rw-r--r--	fs/cifs/cifsfs.h	2
-rw-r--r--	fs/cifs/cifsglob.h	19
-rw-r--r--	fs/cifs/cifspdu.h	8
-rw-r--r--	fs/cifs/cifsproto.h	5
-rw-r--r--	fs/cifs/cifssmb.c	221
-rw-r--r--	fs/cifs/connect.c	1357
-rw-r--r--	fs/cifs/dir.c	171
-rw-r--r--	fs/cifs/dns_resolve.c	2
-rw-r--r--	fs/cifs/file.c	161
-rw-r--r--	fs/cifs/inode.c	102
-rw-r--r--	fs/cifs/link.c	162
-rw-r--r--	fs/cifs/misc.c	71
-rw-r--r--	fs/cifs/netmisc.c	2
-rw-r--r--	fs/cifs/nterr.h	9
-rw-r--r--	fs/cifs/ntlmssp.h	68
-rw-r--r--	fs/cifs/readdir.c	78
-rw-r--r--	fs/cifs/sess.c	373
-rw-r--r--	fs/cifs/smberr.h	1
-rw-r--r--	fs/compat.c	141
-rw-r--r--	fs/compat_ioctl.c	9
-rw-r--r--	fs/configfs/symlink.c	2
-rw-r--r--	fs/cramfs/inode.c	39
-rw-r--r--	fs/cramfs/uncompress.c	2
-rw-r--r--	fs/dcache.c	5
-rw-r--r--	fs/debugfs/inode.c	16
-rw-r--r--	fs/devpts/inode.c	23
-rw-r--r--	fs/direct-io.c	4
-rw-r--r--	fs/drop_caches.c	2
-rw-r--r--	fs/ecryptfs/crypto.c	21
-rw-r--r--	fs/ecryptfs/ecryptfs_kernel.h	1
-rw-r--r--	fs/ecryptfs/inode.c	37
-rw-r--r--	fs/ecryptfs/keystore.c	3
-rw-r--r--	fs/ecryptfs/main.c	19
-rw-r--r--	fs/ecryptfs/messaging.c	85
-rw-r--r--	fs/ecryptfs/miscdev.c	43
-rw-r--r--	fs/ecryptfs/mmap.c	11
-rw-r--r--	fs/ecryptfs/read_write.c	32
-rw-r--r--	fs/ecryptfs/super.c	7
-rw-r--r--	fs/efs/super.c	20
-rw-r--r--	fs/eventfd.c	26
-rw-r--r--	fs/eventpoll.c	616
-rw-r--r--	fs/exec.c	126
-rw-r--r--	fs/exofs/BUGS	3
-rw-r--r--	fs/exofs/Kbuild	16
-rw-r--r--	fs/exofs/Kconfig	13
-rw-r--r--	fs/exofs/common.h	184
-rw-r--r--	fs/exofs/dir.c	672
-rw-r--r--	fs/exofs/exofs.h	180
-rw-r--r--	fs/exofs/file.c	87
-rw-r--r--	fs/exofs/inode.c	1303
-rw-r--r--	fs/exofs/namei.c	342
-rw-r--r--	fs/exofs/osd.c	153
-rw-r--r--	fs/exofs/super.c	584
-rw-r--r--	fs/exofs/symlink.c	57
-rw-r--r--	fs/ext2/acl.c	2
-rw-r--r--	fs/ext2/inode.c	44
-rw-r--r--	fs/ext2/super.c	4
-rw-r--r--	fs/ext3/Kconfig	19
-rw-r--r--	fs/ext3/acl.c	2
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/file.c	6
-rw-r--r--	fs/ext3/inode.c	165
-rw-r--r--	fs/ext3/ioctl.c	59
-rw-r--r--	fs/ext3/namei.c	35
-rw-r--r--	fs/ext3/super.c	8
-rw-r--r--	fs/ext4/Kconfig	2
-rw-r--r--	fs/ext4/acl.c	2
-rw-r--r--	fs/ext4/balloc.c	14
-rw-r--r--	fs/ext4/dir.c	16
-rw-r--r--	fs/ext4/ext4.h	93
-rw-r--r--	fs/ext4/ext4_extents.h	1
-rw-r--r--	fs/ext4/ext4_i.h	6
-rw-r--r--	fs/ext4/ext4_sb.h	14
-rw-r--r--	fs/ext4/extents.c	154
-rw-r--r--	fs/ext4/file.c	7
-rw-r--r--	fs/ext4/ialloc.c	279
-rw-r--r--	fs/ext4/inode.c	471
-rw-r--r--	fs/ext4/ioctl.c	17
-rw-r--r--	fs/ext4/mballoc.c	158
-rw-r--r--	fs/ext4/mballoc.h	8
-rw-r--r--	fs/ext4/namei.c	164
-rw-r--r--	fs/ext4/resize.c	8
-rw-r--r--	fs/ext4/super.c	336
-rw-r--r--	fs/fat/Kconfig	3
-rw-r--r--	fs/fat/inode.c	8
-rw-r--r--	fs/fcntl.c	6
-rw-r--r--	fs/file_table.c	1
-rw-r--r--	fs/filesystems.c	2
-rw-r--r--	fs/fs-writeback.c	31
-rw-r--r--	fs/fs_struct.c	177
-rw-r--r--	fs/fscache/Kconfig	56
-rw-r--r--	fs/fscache/Makefile	19
-rw-r--r--	fs/fscache/cache.c	415
-rw-r--r--	fs/fscache/cookie.c	500
-rw-r--r--	fs/fscache/fsdef.c	144
-rw-r--r--	fs/fscache/histogram.c	109
-rw-r--r--	fs/fscache/internal.h	380
-rw-r--r--	fs/fscache/main.c	124
-rw-r--r--	fs/fscache/netfs.c	103
-rw-r--r--	fs/fscache/object.c	810
-rw-r--r--	fs/fscache/operation.c	459
-rw-r--r--	fs/fscache/page.c	816
-rw-r--r--	fs/fscache/proc.c	68
-rw-r--r--	fs/fscache/stats.c	212
-rw-r--r--	fs/fuse/dir.c	1
-rw-r--r--	fs/fuse/file.c	61
-rw-r--r--	fs/fuse/inode.c	4
-rw-r--r--	fs/generic_acl.c	2
-rw-r--r--	fs/gfs2/acl.c	2
-rw-r--r--	fs/gfs2/glock.c	11
-rw-r--r--	fs/gfs2/glops.c	6
-rw-r--r--	fs/gfs2/inode.c	8
-rw-r--r--	fs/gfs2/inode.h	14
-rw-r--r--	fs/gfs2/ops_file.c	15
-rw-r--r--	fs/gfs2/ops_fstype.c	13
-rw-r--r--	fs/gfs2/ops_inode.c	1
-rw-r--r--	fs/gfs2/quota.c	4
-rw-r--r--	fs/gfs2/rgrp.c	13
-rw-r--r--	fs/hfs/inode.c	4
-rw-r--r--	fs/hfs/mdb.c	1
-rw-r--r--	fs/hfs/super.c	3
-rw-r--r--	fs/hfsplus/options.c	2
-rw-r--r--	fs/hfsplus/super.c	3
-rw-r--r--	fs/hpfs/super.c	8
-rw-r--r--	fs/hppfs/hppfs.c	7
-rw-r--r--	fs/hugetlbfs/inode.c	35
-rw-r--r--	fs/inode.c	149
-rw-r--r--	fs/internal.h	8
-rw-r--r--	fs/ioctl.c	75
-rw-r--r--	fs/isofs/inode.c	3
-rw-r--r--	fs/jbd/commit.c	36
-rw-r--r--	fs/jbd/journal.c	34
-rw-r--r--	fs/jbd/revoke.c	44
-rw-r--r--	fs/jbd/transaction.c	2
-rw-r--r--	fs/jbd2/commit.c	19
-rw-r--r--	fs/jbd2/revoke.c	45
-rw-r--r--	fs/jbd2/transaction.c	2
-rw-r--r--	fs/jffs2/acl.c	6
-rw-r--r--	fs/jffs2/erase.c	7
-rw-r--r--	fs/jffs2/malloc.c	6
-rw-r--r--	fs/jfs/acl.c	2
-rw-r--r--	fs/libfs.c	19
-rw-r--r--	fs/lockd/clntlock.c	51
-rw-r--r--	fs/lockd/mon.c	8
-rw-r--r--	fs/lockd/svc.c	57
-rw-r--r--	fs/lockd/svclock.c	13
-rw-r--r--	fs/minix/inode.c	11
-rw-r--r--	fs/mpage.c	13
-rw-r--r--	fs/namei.c	29
-rw-r--r--	fs/namespace.c	91
-rw-r--r--	fs/ncpfs/ioctl.c	21
-rw-r--r--	fs/nfs/Kconfig	8
-rw-r--r--	fs/nfs/Makefile	1
-rw-r--r--	fs/nfs/callback.c	31
-rw-r--r--	fs/nfs/callback.h	1
-rw-r--r--	fs/nfs/client.c	130
-rw-r--r--	fs/nfs/dir.c	12
-rw-r--r--	fs/nfs/file.c	79
-rw-r--r--	fs/nfs/fscache-index.c	337
-rw-r--r--	fs/nfs/fscache.c	523
-rw-r--r--	fs/nfs/fscache.h	220
-rw-r--r--	fs/nfs/getroot.c	4
-rw-r--r--	fs/nfs/inode.c	323
-rw-r--r--	fs/nfs/internal.h	8
-rw-r--r--	fs/nfs/iostat.h	18
-rw-r--r--	fs/nfs/nfs2xdr.c	9
-rw-r--r--	fs/nfs/nfs3proc.c	7
-rw-r--r--	fs/nfs/nfs3xdr.c	40
-rw-r--r--	fs/nfs/nfs4proc.c	58
-rw-r--r--	fs/nfs/nfs4state.c	10
-rw-r--r--	fs/nfs/nfs4xdr.c	213
-rw-r--r--	fs/nfs/nfsroot.c	2
-rw-r--r--	fs/nfs/pagelist.c	11
-rw-r--r--	fs/nfs/proc.c	1
-rw-r--r--	fs/nfs/read.c	27
-rw-r--r--	fs/nfs/super.c	71
-rw-r--r--	fs/nfs/write.c	53
-rw-r--r--	fs/nfsd/Kconfig	1
-rw-r--r--	fs/nfsd/nfs3proc.c	10
-rw-r--r--	fs/nfsd/nfs4callback.c	47
-rw-r--r--	fs/nfsd/nfs4proc.c	246
-rw-r--r--	fs/nfsd/nfs4recover.c	120
-rw-r--r--	fs/nfsd/nfs4state.c	1195
-rw-r--r--	fs/nfsd/nfs4xdr.c	649
-rw-r--r--	fs/nfsd/nfsctl.c	44
-rw-r--r--	fs/nfsd/nfsproc.c	3
-rw-r--r--	fs/nfsd/nfssvc.c	100
-rw-r--r--	fs/nfsd/vfs.c	69
-rw-r--r--	fs/nilfs2/Makefile	5
-rw-r--r--	fs/nilfs2/alloc.c	504
-rw-r--r--	fs/nilfs2/alloc.h	72
-rw-r--r--	fs/nilfs2/bmap.c	788
-rw-r--r--	fs/nilfs2/bmap.h	244
-rw-r--r--	fs/nilfs2/bmap_union.h	42
-rw-r--r--	fs/nilfs2/btnode.c	316
-rw-r--r--	fs/nilfs2/btnode.h	58
-rw-r--r--	fs/nilfs2/btree.c	2269
-rw-r--r--	fs/nilfs2/btree.h	117
-rw-r--r--	fs/nilfs2/cpfile.c	927
-rw-r--r--	fs/nilfs2/cpfile.h	45
-rw-r--r--	fs/nilfs2/dat.c	430
-rw-r--r--	fs/nilfs2/dat.h	52
-rw-r--r--	fs/nilfs2/dir.c	711
-rw-r--r--	fs/nilfs2/direct.c	436
-rw-r--r--	fs/nilfs2/direct.h	78
-rw-r--r--	fs/nilfs2/file.c	160
-rw-r--r--	fs/nilfs2/gcdat.c	84
-rw-r--r--	fs/nilfs2/gcinode.c	288
-rw-r--r--	fs/nilfs2/ifile.c	150
-rw-r--r--	fs/nilfs2/ifile.h	53
-rw-r--r--	fs/nilfs2/inode.c	785
-rw-r--r--	fs/nilfs2/ioctl.c	665
-rw-r--r--	fs/nilfs2/mdt.c	564
-rw-r--r--	fs/nilfs2/mdt.h	125
-rw-r--r--	fs/nilfs2/namei.c	474
-rw-r--r--	fs/nilfs2/nilfs.h	314
-rw-r--r--	fs/nilfs2/page.c	541
-rw-r--r--	fs/nilfs2/page.h	76
-rw-r--r--	fs/nilfs2/recovery.c	919
-rw-r--r--	fs/nilfs2/sb.h	102
-rw-r--r--	fs/nilfs2/segbuf.c	439
-rw-r--r--	fs/nilfs2/segbuf.h	201
-rw-r--r--	fs/nilfs2/seglist.h	85
-rw-r--r--	fs/nilfs2/segment.c	2978
-rw-r--r--	fs/nilfs2/segment.h	244
-rw-r--r--	fs/nilfs2/sufile.c	558
-rw-r--r--	fs/nilfs2/sufile.h	125
-rw-r--r--	fs/nilfs2/super.c	1326
-rw-r--r--	fs/nilfs2/the_nilfs.c	641
-rw-r--r--	fs/nilfs2/the_nilfs.h	298
-rw-r--r--	fs/notify/inotify/inotify_user.c	2
-rw-r--r--	fs/ntfs/dir.c	4
-rw-r--r--	fs/ntfs/inode.c	3
-rw-r--r--	fs/ntfs/layout.h	329
-rw-r--r--	fs/ntfs/logfile.h	6
-rw-r--r--	fs/ntfs/mft.c	2
-rw-r--r--	fs/ntfs/super.c	50
-rw-r--r--	fs/ntfs/usnjrnl.h	48
-rw-r--r--	fs/ocfs2/acl.c	2
-rw-r--r--	fs/ocfs2/alloc.c	57
-rw-r--r--	fs/ocfs2/alloc.h	3
-rw-r--r--	fs/ocfs2/aops.c	23
-rw-r--r--	fs/ocfs2/cluster/heartbeat.c	96
-rw-r--r--	fs/ocfs2/cluster/heartbeat.h	3
-rw-r--r--	fs/ocfs2/cluster/nodemanager.c	9
-rw-r--r--	fs/ocfs2/dcache.c	15
-rw-r--r--	fs/ocfs2/dir.c	2808
-rw-r--r--	fs/ocfs2/dir.h	57
-rw-r--r--	fs/ocfs2/dlm/dlmcommon.h	58
-rw-r--r--	fs/ocfs2/dlm/dlmdebug.c	87
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c	29
-rw-r--r--	fs/ocfs2/dlm/dlmmaster.c	387
-rw-r--r--	fs/ocfs2/dlm/dlmthread.c	20
-rw-r--r--	fs/ocfs2/dlmglue.c	46
-rw-r--r--	fs/ocfs2/dlmglue.h	2
-rw-r--r--	fs/ocfs2/export.c	85
-rw-r--r--	fs/ocfs2/file.c	94
-rw-r--r--	fs/ocfs2/inode.c	48
-rw-r--r--	fs/ocfs2/inode.h	5
-rw-r--r--	fs/ocfs2/journal.c	173
-rw-r--r--	fs/ocfs2/journal.h	78
-rw-r--r--	fs/ocfs2/localalloc.c	86
-rw-r--r--	fs/ocfs2/mmap.c	6
-rw-r--r--	fs/ocfs2/namei.c	254
-rw-r--r--	fs/ocfs2/ocfs2.h	76
-rw-r--r--	fs/ocfs2/ocfs2_fs.h	136
-rw-r--r--	fs/ocfs2/ocfs2_lockid.h	4
-rw-r--r--	fs/ocfs2/suballoc.c	259
-rw-r--r--	fs/ocfs2/suballoc.h	4
-rw-r--r--	fs/ocfs2/super.c	188
-rw-r--r--	fs/ocfs2/symlink.c	77
-rw-r--r--	fs/ocfs2/xattr.c	8
-rw-r--r--	fs/ocfs2/xattr.h	2
-rw-r--r--	fs/omfs/inode.c	7
-rw-r--r--	fs/open.c	3
-rw-r--r--	fs/partitions/check.c	4
-rw-r--r--	fs/pipe.c	42
-rw-r--r--	fs/proc/array.c	13
-rw-r--r--	fs/proc/base.c	12
-rw-r--r--	fs/proc/meminfo.c	4
-rw-r--r--	fs/proc/nommu.c	2
-rw-r--r--	fs/proc/proc_tty.c	12
-rw-r--r--	fs/proc/root.c	3
-rw-r--r--	fs/proc/stat.c	5
-rw-r--r--	fs/proc/task_mmu.c	8
-rw-r--r--	fs/proc/task_nommu.c	9
-rw-r--r--	fs/qnx4/inode.c	3
-rw-r--r--	fs/quota/Makefile	9
-rw-r--r--	fs/quota/dquot.c	2
-rw-r--r--	fs/ramfs/file-nommu.c	15
-rw-r--r--	fs/ramfs/inode.c	113
-rw-r--r--	fs/read_write.c	56
-rw-r--r--	fs/reiserfs/Kconfig	1
-rw-r--r--	fs/reiserfs/dir.c	24
-rw-r--r--	fs/reiserfs/namei.c	17
-rw-r--r--	fs/reiserfs/super.c	16
-rw-r--r--	fs/reiserfs/xattr.c	260
-rw-r--r--	fs/reiserfs/xattr_acl.c	2
-rw-r--r--	fs/reiserfs/xattr_security.c	12
-rw-r--r--	fs/romfs/Kconfig	48
-rw-r--r--	fs/romfs/Makefile	9
-rw-r--r--	fs/romfs/inode.c	665
-rw-r--r--	fs/romfs/internal.h	47
-rw-r--r--	fs/romfs/mmap-nommu.c	75
-rw-r--r--	fs/romfs/storage.c	293
-rw-r--r--	fs/romfs/super.c	654
-rw-r--r--	fs/splice.c	358
-rw-r--r--	fs/squashfs/Makefile	1
-rw-r--r--	fs/squashfs/cache.c	1
-rw-r--r--	fs/squashfs/export.c	1
-rw-r--r--	fs/squashfs/super.c	13
-rw-r--r--	fs/stat.c	137
-rw-r--r--	fs/super.c	85
-rw-r--r--	fs/sysfs/bin.c	21
-rw-r--r--	fs/sysfs/file.c	16
-rw-r--r--	fs/sysv/inode.c	3
-rw-r--r--	fs/ubifs/Kconfig	4
-rw-r--r--	fs/ubifs/budget.c	37
-rw-r--r--	fs/ubifs/debug.c	6
-rw-r--r--	fs/ubifs/file.c	25
-rw-r--r--	fs/ubifs/find.c	12
-rw-r--r--	fs/ubifs/gc.c	428
-rw-r--r--	fs/ubifs/journal.c	7
-rw-r--r--	fs/ubifs/key.h	6
-rw-r--r--	fs/ubifs/log.c	5
-rw-r--r--	fs/ubifs/lpt_commit.c	34
-rw-r--r--	fs/ubifs/recovery.c	70
-rw-r--r--	fs/ubifs/replay.c	2
-rw-r--r--	fs/ubifs/sb.c	36
-rw-r--r--	fs/ubifs/shrinker.c	6
-rw-r--r--	fs/ubifs/super.c	40
-rw-r--r--	fs/ubifs/tnc.c	2
-rw-r--r--	fs/ubifs/ubifs-media.h	30
-rw-r--r--	fs/ubifs/ubifs.h	13
-rw-r--r--	fs/udf/balloc.c	150
-rw-r--r--	fs/udf/dir.c	14
-rw-r--r--	fs/udf/directory.c	38
-rw-r--r--	fs/udf/ecma_167.h	416
-rw-r--r--	fs/udf/ialloc.c	9
-rw-r--r--	fs/udf/inode.c	213
-rw-r--r--	fs/udf/misc.c	29
-rw-r--r--	fs/udf/namei.c	86
-rw-r--r--	fs/udf/osta_udf.h	22
-rw-r--r--	fs/udf/partition.c	2
-rw-r--r--	fs/udf/super.c	605
-rw-r--r--	fs/udf/truncate.c	44
-rw-r--r--	fs/udf/udf_i.h	6
-rw-r--r--	fs/udf/udf_sb.h	9
-rw-r--r--	fs/udf/udfdecl.h	57
-rw-r--r--	fs/udf/udfend.h	28
-rw-r--r--	fs/udf/udftime.c	6
-rw-r--r--	fs/udf/unicode.c	62
-rw-r--r--	fs/ufs/dir.c	2
-rw-r--r--	fs/ufs/file.c	2
-rw-r--r--	fs/ufs/super.c	3
-rw-r--r--	fs/ufs/ufs.h	2
-rw-r--r--	fs/xattr.c	10
-rw-r--r--	fs/xfs/linux-2.6/xfs_file.c	4
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl.c	23
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl32.c	12
-rw-r--r--	fs/xfs/linux-2.6/xfs_iops.c	4
452 files changed, 52825 insertions, 11103 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5f8ab8adb5f5..ab5547ff29a1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -155,6 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	root = d_alloc_root(inode);
 	if (!root) {
+		iput(inode);
 		retval = -ENOMEM;
 		goto release_sb;
 	}
@@ -173,10 +175,7 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	return 0;
 
 release_sb:
-	if (sb) {
-		up_write(&sb->s_umount);
-		deactivate_super(sb);
-	}
+	deactivate_locked_super(sb);
 
 free_stat:
 	kfree(st);
@@ -230,9 +229,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses;
 
+	lock_kernel();
+	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
+	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
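
Two fixes are visible in the v9fs hunks above: the mount error path now drops the inode with iput() when d_alloc_root() fails, and the superblock teardown collapses an open-coded up_write()/deactivate_super() pair into a single deactivate_locked_super() call. A minimal user-space sketch of that consolidation, assuming only what the hunk shows (helper and field names here are illustrative, not the kernel's real implementation):

#include <stdio.h>

struct super_block { int s_umount_held; int s_active; };

static void up_write_s_umount(struct super_block *sb) { sb->s_umount_held = 0; }
static void deactivate_super(struct super_block *sb)  { sb->s_active--; }

/* one call now performs both steps the old error path open-coded */
static void deactivate_locked_super(struct super_block *sb)
{
	up_write_s_umount(sb);  /* release s_umount held since superblock setup */
	deactivate_super(sb);   /* drop the active reference */
}

int main(void)
{
	struct super_block sb = { .s_umount_held = 1, .s_active = 1 };
	deactivate_locked_super(&sb);
	printf("s_umount_held=%d s_active=%d\n", sb.s_umount_held, sb.s_active);
	return 0;
}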
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -168,6 +175,33 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o
+	stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
@@ -113,10 +114,13 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_9P_FS) += 9p/
 obj-$(CONFIG_AFS_FS) += afs/
+obj-$(CONFIG_NILFS2_FS) += nilfs2/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
+obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = dentry->d_sb->s_blocksize;
-	buf->f_blocks  = asb->s_size;
-	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(dentry->d_sb);
+	buf->f_bfree   = adfs_map_free(sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
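
The adfs_statfs() hunk above introduces the f_fsid encoding pattern that several filesystems adopt in this merge: huge_encode_dev() packs the block device's major/minor numbers into a u64, whose low and high 32-bit halves become f_fsid.val[0] and f_fsid.val[1]. A standalone sketch of just that split (the example device id is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000000800000003ULL; /* stand-in for huge_encode_dev() */
	uint32_t val[2];

	val[0] = (uint32_t)id;          /* low 32 bits  */
	val[1] = (uint32_t)(id >> 32);  /* high 32 bits */

	printf("f_fsid = { %#x, %#x }\n", val[0], val[1]);
	return 0;
}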
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..63f5183f263b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -507,8 +507,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	kfree(sb->s_options);
-	sb->s_options = new_opts;
+	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
 	sbi->s_mode  = mode;
@@ -533,6 +532,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int free;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +543,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree   = free;
 	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name = "cell_ix",
-	.data_size = sizeof(struct afs_cache_cell),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_cell_cache_match,
-	.update = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name = "afs",
+	.version = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name = "AFS.cell",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_cell_cache_get_key,
+	.get_aux = afs_cell_cache_get_aux,
+	.check_aux = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name = "AFS.vldb",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_vlocation_cache_get_key,
+	.get_aux = afs_vlocation_cache_get_aux,
+	.check_aux = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name = "AFS.volume",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name = "AFS.vnode",
+	.type = FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key = afs_vnode_cache_get_key,
+	.get_attr = afs_vnode_cache_get_attr,
+	.get_aux = afs_vnode_cache_get_aux,
+	.check_aux = afs_vnode_cache_check_aux,
+	.now_uncached = afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name = "vldb",
-	.data_size = sizeof(struct afs_cache_vlocation),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_vlocation_cache_match,
-	.update = afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
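
The cell.c hunks above show both ends of an FS-Cache cookie's life: acquisition against an index definition at cell creation and relinquishment at destruction. A hedged sketch of the full lifecycle, modelled on the calls in this diff; the fscache_register_netfs()/fscache_unregister_netfs() pair is assumed from <linux/fscache.h> and is not shown in these hunks:

#include <linux/fscache.h>

static int example_cache_init(struct afs_cell *cell)
{
	int ret;

	/* make the "afs" namespace known to FS-Cache (assumed API) */
	ret = fscache_register_netfs(&afs_cache_netfs);
	if (ret < 0)
		return ret;

	/* index cookies hang off the netfs primary index; per the
	 * comment in the hunk above, acquisition never returns an error */
	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
					     &afs_cell_cache_index_def,
					     cell);
	return 0;
}

static void example_cache_exit(struct afs_cell *cell)
{
	/* 0 => keep the cached data; 1 would retire (discard) it */
	fscache_relinquish_cookie(cell->cache, 0);
	fscache_unregister_netfs(&afs_cache_netfs);
}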
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..0149dab365e7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
 	.set_page_dirty	= afs_set_page_dirty,
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
@@ -98,38 +102,21 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
-}
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
 }
 #endif
 
@@ -161,9 +148,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +158,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-	/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 	/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-	/* no page available in cache */
-	case -ENOBUFS:
+	/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+	/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +186,25 @@ static int afs_readpage(struct file *file, struct page *page)
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			ret = -ESTALE;
 		}
-#ifdef AFS_CACHING_SUPPORT
-		cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+		fscache_uncache_page(vnode->cache, page);
 #endif
+		BUG_ON(PageFsCache(page));
 		goto error;
 	}
 
 	SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-	if (cachefs_write_page(vnode->cache,
-			       page,
-			       afs_file_readpage_write_complete,
-			       NULL,
-			       GFP_KERNEL) != 0
-	    ) {
-		cachefs_uncache_page(vnode->cache, page);
-		unlock_page(page);
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
 	}
-#else
-	unlock_page(page);
 #endif
+	unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +218,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+	/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+	/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+	/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +284,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink = vnode->status.nlink;
 	inode->i_uid = vnode->status.owner;
 	inode->i_gid = 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head	grave;		/* link in master graveyard list */
 	struct list_head	update;		/* link in master update list */
 	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb;	/* volume information DB record */
 	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
  */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
 * main.c
 */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
 * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
 * vlclient.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
 				    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
 * vnode.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
 	return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
 * volume.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
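The #else branch in the internal.h hunk above defines each index definition as a dereference of a NULL pointer cast. That is safe only because every reference to these names is itself guarded by #ifdef CONFIG_AFS_FSCACHE, so the expression is never compiled into live code. A hedged, self-contained illustration of the same stub pattern (HAVE_CACHE, cache_register() and cache_def are invented for this sketch):

#include <stdio.h>

#ifdef HAVE_CACHE			/* build with -DHAVE_CACHE to enable */
struct cache_def { const char *name; };
static struct cache_def file_cache_def = { "files" };
#define file_cache_index_def file_cache_def
static void cache_register(struct cache_def *def)
{
	printf("registered cache '%s'\n", def->name);
}
#else
struct cache_def;			/* opaque: never actually dereferenced */
#define file_cache_index_def (*(struct cache_def *) NULL)
/* The stub discards its argument, so the NULL cast is never evaluated. */
#define cache_register(def) do {} while (0)
#endif

int main(void)
{
	cache_register(&file_cache_index_def);
	return 0;
}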
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (PageError(page))
 		goto error;
 
-	buf = kmap(page);
+	buf = kmap_atomic(page, KM_USER0);
 	memcpy(devname, buf, size);
-	kunmap(page);
+	kunmap_atomic(buf, KM_USER0);
 	page_cache_release(page);
 	page = NULL;
 
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f189423063..7ad36506c256 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 	struct net_device *dev;
 	int ret = -ENODEV;
 
-	if (maclen != ETH_ALEN)
-		BUG();
+	BUG_ON(maclen != ETH_ALEN);
 
 	rtnl_lock();
 	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index aee239a048cb..76828e5f8a39 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -405,21 +405,20 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		sb->s_flags = flags;
 		ret = afs_fill_super(sb, &params);
 		if (ret < 0) {
-			up_write(&sb->s_umount);
-			deactivate_super(sb);
+			deactivate_locked_super(sb);
 			goto error;
 		}
-		sb->s_options = new_opts;
+		save_mount_options(sb, new_opts);
 		sb->s_flags |= MS_ACTIVE;
 	} else {
 		_debug("reuse");
-		kfree(new_opts);
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
 	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
+	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
 	return 0;
 
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af98004..4eb4d8dfb2f1 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
 	struct autofs_dirhash *dh = &sbi->dirhash;
 	struct autofs_dir_ent *ent;
-	struct dentry *dentry;
 	unsigned long timeout = sbi->exp_timeout;
 
 	while (1) {
+		struct path path;
+		int umount_ok;
+
 		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
 			return NULL;	/* No entries */
 		/* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			return ent; /* Symlinks are always expirable */
 
 		/* Get the dentry for the autofs subdirectory */
-		dentry = ent->dentry;
+		path.dentry = ent->dentry;
 
-		if ( !dentry ) {
+		if (!path.dentry) {
 			/* Should only happen in catatonic mode */
 			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
 			autofs_delete_usage(ent);
 			continue;
 		}
 
-		if ( !dentry->d_inode ) {
-			dput(dentry);
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
 			printk("autofs: negative dentry on expiry queue: %s\n",
 				ent->name);
 			autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 
 		/* Make sure entry is mounted and unused; note that dentry will
 		   point to the mounted-on-top root. */
-		if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		mntget(mnt);
-		dget(dentry);
-		if (!follow_down(&mnt, &dentry)) {
-			dput(dentry);
-			mntput(mnt);
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path.mnt, &path.dentry)) {
+			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+		while (d_mountpoint(path.dentry) &&
+		       follow_down(&path.mnt, &path.dentry))
 			;
-		dput(dentry);
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
 
-		if ( may_umount(mnt) ) {
-			mntput(mnt);
+		if (umount_ok) {
 			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
 			return ent; /* Expirable! */
 		}
 		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-		mntput(mnt);
 	}
 	return NULL;		/* No expirable entries */
 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d06..b7ff33c63101 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *,
 			struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffea..84168c0dcc2d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
 * Check a string doesn't overrun the chunk of
 * memory we copied from user land.
 */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-	while ((void *) str <= end)
-		if (!*str++)
-			return 0;
+	if (memchr(str, 0, size))
+		return 0;
 	return -EINVAL;
 }
 
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
 			AUTOFS_WARN(
 			  "path string terminator missing for cmd(0x%08x)",
@@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	}
 
 	path = param->path;
-	devid = sbi->sb->s_dev;
+	devid = new_encode_dev(sbi->sb->s_dev);
 
 	param->requester.uid = param->requester.gid = -1;
 
@@ -525,40 +523,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 				   struct autofs_sb_info *sbi,
 				   struct autofs_dev_ioctl *param)
 {
-	struct dentry *dentry;
 	struct vfsmount *mnt;
-	int err = -EAGAIN;
 	int how;
 
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-	else
-		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
-	if (dentry) {
-		struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
-		/*
-		 * This is synchronous because it makes the daemon a
-		 * little easier
-		 */
-		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-			sbi->sb->s_root->d_mounted++;
-		}
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		complete_all(&ino->expire_complete);
-		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-	}
-
-	return err;
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
 }
 
 /* Check if autofs mount point is in use */
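The invalid_str() rewrite above is the key hardening change in this file: instead of walking bytes up to a computed end pointer (with `<=`, one byte too far), it asks memchr() whether a NUL terminator exists within the exact number of bytes copied from userspace. The same check in portable, standalone C (the sample buffers are made up for the demonstration):

#include <stdio.h>
#include <string.h>

/* Return 0 if a NUL terminator occurs within the first 'size' bytes. */
static int invalid_str(const char *str, size_t size)
{
	return memchr(str, 0, size) ? 0 : -1;
}

int main(void)
{
	char ok[8] = "path";			/* NUL-terminated inside the buffer */
	char bad[4] = { 'p', 'a', 't', 'h' };	/* no terminator at all */

	printf("%d %d\n", invalid_str(ok, sizeof(ok)),
	       invalid_str(bad, sizeof(bad)));	/* prints "0 -1" */
	return 0;
}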
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9e..3077d8f16523 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		 * Otherwise it's an offset mount and we need to check
 		 * if we can umount its mount, if there is one.
 		 */
-		if (!d_mountpoint(dentry))
+		if (!d_mountpoint(dentry)) {
+			status = 0;
 			goto done;
+		}
 	}
 
 	/* Update the expiry counter if fs is busy */
@@ -478,22 +480,16 @@ int autofs4_expire_run(struct super_block *sb,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
-	int do_now = 0;
-
-	if (arg && get_user(do_now, arg))
-		return -EFAULT;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
 	else
-		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +512,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			 struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
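The split above — autofs4_do_expire_multi() doing the work, autofs4_expire_multi() reduced to a get_user() wrapper — lets the dev-ioctl path call straight into the core with an ordinary int. A standalone sketch of the same boundary/worker split (fake_get_user() stands in for the real user-copy primitive; it is not kernel API):

#include <errno.h>
#include <stdio.h>

/* Stand-in for get_user(): copy one int from a (possibly bad) pointer. */
static int fake_get_user(int *dst, const int *src)
{
	if (!src)
		return -EFAULT;
	*dst = *src;
	return 0;
}

/* Core worker: plain arguments, callable directly by in-process users. */
static int do_expire_multi(int when)
{
	printf("expiring, when=%d\n", when);
	return 0;
}

/* Boundary wrapper: fetch the argument from "userspace", then delegate. */
static int expire_multi(const int *user_arg)
{
	int when = 0;

	if (user_arg && fake_get_user(&when, user_arg))
		return -EFAULT;
	return do_expire_multi(when);
}

int main(void)
{
	int now = 1;
	return expire_multi(&now);
}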
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a9504..e383bf0334f1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-	if (expiring) {
-		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 */
-		ino = autofs4_dentry_ino(expiring);
-		autofs4_expire_wait(expiring);
-		spin_lock(&sbi->lookup_lock);
-		if (!list_empty(&ino->expiring))
-			list_del_init(&ino->expiring);
-		spin_unlock(&sbi->lookup_lock);
-		dput(expiring);
-	}
-
 	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
 	if (unhashed)
 		dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (!oz_mode) {
+		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(sbi,
+						   dentry->d_parent,
+						   &dentry->d_name);
+		if (expiring) {
+			/*
+			 * If we are racing with expire the request might not
+			 * be quite complete but the directory has been removed
+			 * so it must have been successful, so just wait for it.
+			 */
+			ino = autofs4_dentry_ino(expiring);
+			autofs4_expire_wait(expiring);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&ino->expiring))
+				list_del_init(&ino->expiring);
+			spin_unlock(&sbi->lookup_lock);
+			dput(expiring);
+		}
+
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
-			mutex_unlock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
 			(dentry->d_op->d_revalidate)(dentry, nd);
 			mutex_lock(&dir->i_mutex);
-		}
 	}
 
 	/*
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index eeb246845909..2341375386f8 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait,
 	 */
 	if (notify == NFY_MOUNT) {
 		/*
-		 * If the dentry isn't hashed just go ahead and try the
-		 * mount again with a new wait (not much else we can do).
-		 */
-		if (!d_unhashed(dentry)) {
-			/*
-			 * But if the dentry is hashed, that means that we
-			 * got here through the revalidate path.  Thus, we
-			 * need to check if the dentry has been mounted
-			 * while we waited on the wq_mutex.  If it has,
-			 * simply return success.
-			 */
-			if (d_mountpoint(dentry))
-				return 0;
-		}
+		 * If the dentry was successfully mounted while we slept
+		 * on the wait queue mutex we can return success. If it
+		 * isn't mounted (doesn't have submounts for the case of
+		 * a multi-mount with no mount at it's base) we can
+		 * continue on and create a new request.
+		 */
+		if (have_submounts(dentry))
+			return 0;
 	}
 
 	return 1;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
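The befs_statfs() change above packs a 64-bit device identifier into the two 32-bit halves of f_fsid. The arithmetic, standalone (the sample id value is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x1122334455667788ULL;	/* e.g. an encoded dev number */
	uint32_t val[2];

	val[0] = (uint32_t)id;			/* low 32 bits */
	val[1] = (uint32_t)(id >> 32);		/* high 32 bits */

	printf("%08x %08x\n", val[0], val[1]);	/* 55667788 11223344 */
	return 0;
}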
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 41f2b4d0093e..ca40f828f64d 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -8,6 +8,7 @@
 */
 
 #include <linux/fs.h>
+#include <asm/page.h>	/* for PAGE_SIZE */
 
 #include "befs.h"
 #include "super.h"
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX ||
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 	fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..fdb66faa24f1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 		params->elfhdr_addr = seg->addr;
 
 	/* clear any space allocated but not loaded */
-	if (phdr->p_filesz < phdr->p_memsz)
-		clear_user((void *) (seg->addr + phdr->p_filesz),
-			   phdr->p_memsz - phdr->p_filesz);
+	if (phdr->p_filesz < phdr->p_memsz) {
+		ret = clear_user((void *) (seg->addr + phdr->p_filesz),
+				 phdr->p_memsz - phdr->p_filesz);
+		if (ret)
+			return ret;
+	}
 
 	if (mm) {
 		if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void __user *) maddr, disp);
+			ret = clear_user((void __user *) maddr, disp);
+			if (ret)
+				return ret;
 			maddr += disp;
 		}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
-				   excess1);
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
+					 excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
@@ -1379,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->parent);
+	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1424,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_psargs[len] = 0;
 
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->parent);
+	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
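clear_user() reports the number of bytes it failed to zero, and the fdpic fixes above stop discarding that result when clearing the unloaded tail of a segment. A standalone sketch of the same propagate-on-partial-failure shape (checked_clear() is a stand-in for this sketch, not the kernel helper):

#include <stdio.h>
#include <string.h>

/* Stand-in for clear_user(): returns the number of bytes NOT cleared. */
static size_t checked_clear(void *dst, size_t len)
{
	if (!dst)
		return len;	/* simulated fault: nothing was cleared */
	memset(dst, 0, len);
	return 0;
}

static int load_segment(char *mem, size_t filesz, size_t memsz)
{
	size_t ret;

	if (filesz < memsz) {
		ret = checked_clear(mem ? mem + filesz : NULL, memsz - filesz);
		if (ret)
			return -1;	/* propagate instead of ignoring */
	}
	return 0;
}

int main(void)
{
	char seg[64];

	printf("%d %d\n", load_segment(seg, 16, sizeof(seg)),
	       load_segment(NULL, 16, 64));	/* prints "0 -1" */
	return 0;
}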
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5cebf0b37798..697f6b5f1313 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -41,6 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <asm/cacheflush.h>
+#include <asm/page.h>
 
 /****************************************************************************/
 
@@ -54,6 +55,18 @@
 #define	DBG_FLT(a...)
 #endif
 
+/*
+ * User data (stack, data section and bss) needs to be aligned
+ * for the same reasons as SLAB memory is, and to the same amount.
+ * Avoid duplicating architecture specific code by using the same
+ * macro as with SLAB allocation:
+ */
+#ifdef ARCH_SLAB_MINALIGN
+#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#else
+#define FLAT_DATA_ALIGN	(sizeof(void *))
+#endif
+
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
 
@@ -114,20 +127,18 @@ static unsigned long create_flat_tables(
 	int envc = bprm->envc;
 	char uninitialized_var(dummy);
 
-	sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p);
+	sp = (unsigned long *)p;
+	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	envp = argv + (argc + 1);
 
-	sp -= envc+1;
-	envp = sp;
-	sp -= argc+1;
-	argv = sp;
-
-	flat_stack_align(sp);
 	if (flat_argvp_envp_on_stack()) {
-		--sp; put_user((unsigned long) envp, sp);
-		--sp; put_user((unsigned long) argv, sp);
+		put_user((unsigned long) envp, sp + 2);
+		put_user((unsigned long) argv, sp + 1);
 	}
 
-	put_user(argc,--sp);
+	put_user(argc, sp);
 	current->mm->arg_start = (unsigned long) p;
 	while (argc-->0) {
 		put_user((unsigned long) p, argv++);
@@ -558,7 +569,9 @@ static int load_flat_file(struct linux_binprm * bprm,
 			ret = realdatastart;
 			goto err;
 		}
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
 
 		DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
 				(int)(data_len + bss_len + stack_len), (int)datapos);
@@ -604,9 +617,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		realdatastart = textpos + ntohl(hdr->data_start);
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
-		reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
-				MAX_SHARED_LIBS * sizeof(unsigned long));
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
+
+		reloc = (unsigned long *)
+			(datapos + (ntohl(hdr->reloc_start) - text_len));
 		memp = textpos;
 		memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
@@ -854,7 +870,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;		/* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *);	/* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *);	/* the envp array */
-
+	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (res > (unsigned long)-4096)
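All the FLAT_DATA_ALIGN arithmetic above reduces to two idioms: round an address down with `x & -align` (stacks grow downwards) and round a size or address up with an ALIGN()-style mask; both require a power-of-two alignment. Standalone, with arbitrary sample addresses (ALIGN_UP is this sketch's spelling of the kernel's ALIGN()):

#include <stdint.h>
#include <stdio.h>

#define DATA_ALIGN	(sizeof(void *))	/* assumed power of two */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

int main(void)
{
	uintptr_t sp = 0x100f;		/* candidate stack pointer */
	uintptr_t data = 0x2001;	/* candidate data start */

	sp &= -(uintptr_t)DATA_ALIGN;		/* round DOWN: 0x1008 on LP64 */
	data = ALIGN_UP(data, DATA_ALIGN);	/* round UP:   0x2008 on LP64 */

	printf("%#lx %#lx\n", (unsigned long)sp, (unsigned long)data);
	return 0;
}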
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio.c b/fs/bio.c
index a040cde7f6fd..98711647ece4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	struct bio_vec *bvl;
 
 	/*
-	 * If 'bs' is given, lookup the pool and do the mempool alloc.
-	 * If not, this is a bio_kmalloc() allocation and just do a
-	 * kzalloc() for the exact number of vecs right away.
-	 */
-	if (!bs)
-		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
-
-	/*
 	 * see comment near bvec_array define!
 	 */
 	switch (nr) {
@@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 	mempool_free(p, bs->bio_pool);
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_has_allocated_vec(bio))
-		kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -301,21 +278,15 @@
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
-	struct bio *bio = NULL;
-	unsigned long idx = 0;
-	void *p = NULL;
-
-	if (bs) {
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p)
-			goto err;
-		bio = p + bs->front_pad;
-	} else {
-		bio = kmalloc(sizeof(*bio), gfp_mask);
-		if (!bio)
-			goto err;
-	}
+	struct bio *bio;
+	void *p;
+
+	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!p))
+		return NULL;
+	bio = p + bs->front_pad;
 
 	bio_init(bio);
 
@@ -332,22 +303,33 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		nr_iovecs = bvec_nr_vecs(idx);
 	}
+out_set:
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
-out_set:
 	bio->bi_io_vec = bvl;
-
 	return bio;
 
 err_free:
-	if (bs)
-		mempool_free(p, bs->bio_pool);
-	else
-		kfree(bio);
-err:
+	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
 
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
+}
+
+/**
+ * bio_alloc - allocate a new bio, memory pool backed
+ * @gfp_mask: allocation mask to use
+ * @nr_iovecs: number of iovecs
+ *
+ * Allocate a new bio with @nr_iovecs bvecs.  If @gfp_mask
+ * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -358,19 +340,45 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
-/*
- * Like bio_alloc(), but doesn't use a mempool backing. This means that
- * it CAN fail, but while bio_alloc() can only be used for allocations
- * that have a short (finite) life span, bio_kmalloc() should be used
- * for more permanent bio allocations (like allocating some bio's for
- * initalization or setup purposes).
- */
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+	kfree(bio);
+}
+
+/**
+ * bio_alloc - allocate a bio for I/O
+ * @gfp_mask:   the GFP_ mask given to the slab allocator
+ * @nr_iovecs:	number of iovecs to pre-allocate
+ *
+ * Description:
+ *   bio_alloc will allocate a bio and associated bio_vec array that can hold
+ *   at least @nr_iovecs entries. Allocations will be done from the
+ *   fs_bio_set. Also see @bio_alloc_bioset.
+ *
+ *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ *   a bio. This is due to the mempool guarantees. To make this work, callers
+ *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   that need to allocate more than 1 bio must always submit the previously
+ *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   do so can cause livelocks under memory pressure.
+ *
+ **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+	struct bio *bio;
 
-	if (bio)
-		bio->bi_destructor = bio_kmalloc_destructor;
+	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
+		      gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_destructor = bio_kmalloc_destructor;
 
 	return bio;
 }
@@ -809,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		len += iov[i].iov_len;
 	}
 
+	if (offset)
+		nr_pages++;
+
 	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -938,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1122,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 	int offset, i;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1420,8 +1431,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 }
 
 /*
- * split a bio - only worry about a bio with a single page
- * in it's iovec
+ * split a bio - only worry about a bio with a single page in its iovec
 */
 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
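bio_kmalloc() above now makes a single allocation sized for the bio plus its bio_vec array and points bi_io_vec at the trailing inline storage, instead of round-tripping through the mempool path with a NULL bio_set. A userspace sketch of the same single-allocation layout using a C99 flexible array member (the struct and function names are invented for this sketch):

#include <stdio.h>
#include <stdlib.h>

struct vec { void *base; size_t len; };

struct io {
	unsigned int max_vecs;
	struct vec *vecs;		/* points at inline_vecs below */
	struct vec inline_vecs[];	/* storage allocated with the struct */
};

static struct io *io_alloc(unsigned int nr_vecs)
{
	struct io *io = malloc(sizeof(*io) + nr_vecs * sizeof(struct vec));

	if (!io)
		return NULL;
	io->max_vecs = nr_vecs;
	io->vecs = io->inline_vecs;	/* one allocation, one free */
	return io;
}

int main(void)
{
	struct io *io = io_alloc(4);

	if (!io)
		return 1;
	printf("allocated %u inline vecs\n", io->max_vecs);
	free(io);			/* frees header and vec array together */
	return 0;
}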
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf3..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 	}
 	return sync_blockdev(bdev);
 }
+EXPORT_SYMBOL(fsync_bdev);
 
 /**
 * freeze_bdev  -- lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..94212844a9bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
 
 obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
+	   compression.o delayed-ref.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..cbba000dccbe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 return ERR_PTR(-EINVAL); 60 return ERR_PTR(-EINVAL);
61 } 61 }
62 62
63 /* Handle the cached NULL acl case without locking */
64 acl = ACCESS_ONCE(*p_acl);
65 if (!acl)
66 return acl;
67
63 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED) 69 acl = *p_acl;
65 acl = posix_acl_dup(*p_acl); 70 if (acl != BTRFS_ACL_NOT_CACHED)
71 acl = posix_acl_dup(acl);
66 spin_unlock(&inode->i_lock); 72 spin_unlock(&inode->i_lock);
67 73
68 if (acl) 74 if (acl != BTRFS_ACL_NOT_CACHED)
69 return acl; 75 return acl;
70 76
71
72 size = __btrfs_getxattr(inode, name, "", 0); 77 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) { 78 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS); 79 value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
80 btrfs_update_cached_acl(inode, p_acl, acl); 85 btrfs_update_cached_acl(inode, p_acl, acl);
81 } 86 }
82 kfree(value); 87 kfree(value);
83 } else if (size == -ENOENT) { 88 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
89 /* FIXME, who returns -ENOENT? I think nobody */
84 acl = NULL; 90 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl); 91 btrfs_update_cached_acl(inode, p_acl, acl);
92 } else {
93 acl = ERR_PTR(-EIO);
86 } 94 }
87 95
88 return acl; 96 return acl;
@@ -256,7 +264,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 264 }
257 265
258 if (!acl) 266 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 267 inode->i_mode &= ~current_umask();
260 } 268 }
261 269
262 if (IS_POSIXACL(dir) && acl) { 270 if (IS_POSIXACL(dir) && acl) {
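(The new fast path in this hunk reads the cached pointer once, without the spinlock, and returns immediately when a NULL ACL has already been cached; only the BTRFS_ACL_NOT_CACHED sentinel, or a real ACL that needs a reference taken, forces the locked path. A minimal userspace sketch of that check-sentinel-then-lock pattern, with hypothetical names:)

/* Sketch of the cached-NULL fast path: a sentinel means "not yet
 * cached", while NULL is a valid cached answer.  Names hypothetical. */
#include <pthread.h>
#include <stdio.h>

#define NOT_CACHED ((void *)-1)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_acl = NOT_CACHED;

static void *get_acl(void)
{
    /* lock-free fast path: a cached NULL needs no refcounting */
    void *acl = *(void * volatile *)&cached_acl;   /* ~ACCESS_ONCE */
    if (!acl)
        return NULL;

    pthread_mutex_lock(&lock);
    acl = cached_acl;
    /* a real implementation would take a reference here, as the
     * kernel code does with posix_acl_dup() */
    pthread_mutex_unlock(&lock);

    if (acl != NOT_CACHED)
        return acl;
    /* slow path: fetch from backing store, then cache the result */
    return NULL;
}

int main(void)
{
    printf("%p\n", get_acl());
    return 0;
}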
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,12 +20,12 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1 26#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2 27#define WORK_ORDER_DONE_BIT 2
28#define WORK_HIGH_PRIO_BIT 3
29 29
30/* 30/*
31 * container for the kthread task pointer and the list of pending work 31 * container for the kthread task pointer and the list of pending work
@@ -37,6 +37,7 @@ struct btrfs_worker_thread {
37 37
38 /* list of struct btrfs_work that are waiting for service */ 38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending; 39 struct list_head pending;
40 struct list_head prio_pending;
40 41
41 /* list of worker threads from struct btrfs_workers */ 42 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list; 43 struct list_head worker_list;
@@ -104,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
104 105
105 spin_lock_irqsave(&workers->lock, flags); 106 spin_lock_irqsave(&workers->lock, flags);
106 107
107 while (!list_empty(&workers->order_list)) { 108 while (1) {
108 work = list_entry(workers->order_list.next, 109 if (!list_empty(&workers->prio_order_list)) {
109 struct btrfs_work, order_list); 110 work = list_entry(workers->prio_order_list.next,
110 111 struct btrfs_work, order_list);
112 } else if (!list_empty(&workers->order_list)) {
113 work = list_entry(workers->order_list.next,
114 struct btrfs_work, order_list);
115 } else {
116 break;
117 }
111 if (!test_bit(WORK_DONE_BIT, &work->flags)) 118 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break; 119 break;
113 120
@@ -144,8 +151,14 @@ static int worker_loop(void *arg)
144 do { 151 do {
145 spin_lock_irq(&worker->lock); 152 spin_lock_irq(&worker->lock);
146again_locked: 153again_locked:
147 while (!list_empty(&worker->pending)) { 154 while (1) {
148 cur = worker->pending.next; 155 if (!list_empty(&worker->prio_pending))
156 cur = worker->prio_pending.next;
157 else if (!list_empty(&worker->pending))
158 cur = worker->pending.next;
159 else
160 break;
161
149 work = list_entry(cur, struct btrfs_work, list); 162 work = list_entry(cur, struct btrfs_work, list);
150 list_del(&work->list); 163 list_del(&work->list);
151 clear_bit(WORK_QUEUED_BIT, &work->flags); 164 clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -164,7 +177,6 @@ again_locked:
164 177
165 spin_lock_irq(&worker->lock); 178 spin_lock_irq(&worker->lock);
166 check_idle_worker(worker); 179 check_idle_worker(worker);
167
168 } 180 }
169 if (freezing(current)) { 181 if (freezing(current)) {
170 worker->working = 0; 182 worker->working = 0;
@@ -179,7 +191,8 @@ again_locked:
179 * jump_in? 191 * jump_in?
180 */ 192 */
181 smp_mb(); 193 smp_mb();
182 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending) ||
195 !list_empty(&worker->prio_pending))
183 continue; 196 continue;
184 197
185 /* 198 /*
@@ -192,13 +205,18 @@ again_locked:
192 */ 205 */
193 schedule_timeout(1); 206 schedule_timeout(1);
194 smp_mb(); 207 smp_mb();
195 if (!list_empty(&worker->pending)) 208 if (!list_empty(&worker->pending) ||
209 !list_empty(&worker->prio_pending))
196 continue; 210 continue;
197 211
212 if (kthread_should_stop())
213 break;
214
198 /* still no more work?, sleep for real */ 215 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock); 216 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 217 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending)) 218 if (!list_empty(&worker->pending) ||
219 !list_empty(&worker->prio_pending))
202 goto again_locked; 220 goto again_locked;
203 221
204 /* 222 /*
@@ -208,7 +226,8 @@ again_locked:
208 worker->working = 0; 226 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 227 spin_unlock_irq(&worker->lock);
210 228
211 schedule(); 229 if (!kthread_should_stop())
230 schedule();
212 } 231 }
213 __set_current_state(TASK_RUNNING); 232 __set_current_state(TASK_RUNNING);
214 } 233 }
@@ -245,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
245 INIT_LIST_HEAD(&workers->worker_list); 264 INIT_LIST_HEAD(&workers->worker_list);
246 INIT_LIST_HEAD(&workers->idle_list); 265 INIT_LIST_HEAD(&workers->idle_list);
247 INIT_LIST_HEAD(&workers->order_list); 266 INIT_LIST_HEAD(&workers->order_list);
267 INIT_LIST_HEAD(&workers->prio_order_list);
248 spin_lock_init(&workers->lock); 268 spin_lock_init(&workers->lock);
249 workers->max_workers = max; 269 workers->max_workers = max;
250 workers->idle_thresh = 32; 270 workers->idle_thresh = 32;
@@ -270,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
270 } 290 }
271 291
272 INIT_LIST_HEAD(&worker->pending); 292 INIT_LIST_HEAD(&worker->pending);
293 INIT_LIST_HEAD(&worker->prio_pending);
273 INIT_LIST_HEAD(&worker->worker_list); 294 INIT_LIST_HEAD(&worker->worker_list);
274 spin_lock_init(&worker->lock); 295 spin_lock_init(&worker->lock);
275 atomic_set(&worker->num_pending, 0); 296 atomic_set(&worker->num_pending, 0);
@@ -393,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
393 goto out; 414 goto out;
394 415
395 spin_lock_irqsave(&worker->lock, flags); 416 spin_lock_irqsave(&worker->lock, flags);
396 list_add_tail(&work->list, &worker->pending); 417 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
418 list_add_tail(&work->list, &worker->prio_pending);
419 else
420 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending); 421 atomic_inc(&worker->num_pending);
398 422
399 /* by definition we're busy, take ourselves off the idle 423 /* by definition we're busy, take ourselves off the idle
@@ -419,6 +443,11 @@ out:
419 return 0; 443 return 0;
420} 444}
421 445
446void btrfs_set_work_high_prio(struct btrfs_work *work)
447{
448 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
449}
450
422/* 451/*
423 * places a struct btrfs_work into the pending queue of one of the kthreads 452 * places a struct btrfs_work into the pending queue of one of the kthreads
424 */ 453 */
@@ -435,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
435 worker = find_worker(workers); 464 worker = find_worker(workers);
436 if (workers->ordered) { 465 if (workers->ordered) {
437 spin_lock_irqsave(&workers->lock, flags); 466 spin_lock_irqsave(&workers->lock, flags);
438 list_add_tail(&work->order_list, &workers->order_list); 467 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
468 list_add_tail(&work->order_list,
469 &workers->prio_order_list);
470 } else {
471 list_add_tail(&work->order_list, &workers->order_list);
472 }
439 spin_unlock_irqrestore(&workers->lock, flags); 473 spin_unlock_irqrestore(&workers->lock, flags);
440 } else { 474 } else {
441 INIT_LIST_HEAD(&work->order_list); 475 INIT_LIST_HEAD(&work->order_list);
@@ -443,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
443 477
444 spin_lock_irqsave(&worker->lock, flags); 478 spin_lock_irqsave(&worker->lock, flags);
445 479
446 list_add_tail(&work->list, &worker->pending); 480 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
481 list_add_tail(&work->list, &worker->prio_pending);
482 else
483 list_add_tail(&work->list, &worker->pending);
447 atomic_inc(&worker->num_pending); 484 atomic_inc(&worker->num_pending);
448 check_busy_worker(worker); 485 check_busy_worker(worker);
449 486
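(The pattern repeated throughout this file is a strict two-level queue: every pop site checks prio_pending before pending, and every push site routes on WORK_HIGH_PRIO_BIT, so high-priority work is never queued behind the regular list. A condensed userspace sketch of that push/pop discipline; types are hypothetical and no locking is shown:)

/* Two-level work queue sketch: always drain the prio list first. */
#include <stdio.h>

struct work {
    int high_prio;                /* ~WORK_HIGH_PRIO_BIT */
    struct work *next;
};

struct worker {
    struct work *pending, *prio_pending;
};

static void queue_work(struct worker *w, struct work *wk)
{
    struct work **head = wk->high_prio ? &w->prio_pending : &w->pending;
    wk->next = *head;             /* LIFO for brevity; kernel lists are FIFO */
    *head = wk;
}

static struct work *pop_work(struct worker *w)
{
    struct work **head;
    struct work *wk;

    if (w->prio_pending)          /* the prio list always wins */
        head = &w->prio_pending;
    else if (w->pending)
        head = &w->pending;
    else
        return NULL;              /* ~the break out of worker_loop */
    wk = *head;
    *head = wk->next;
    return wk;
}

int main(void)
{
    struct worker w = { NULL, NULL };
    struct work a = { 0 }, b = { 1 };
    queue_work(&w, &a);
    queue_work(&w, &b);
    printf("first pop high_prio=%d\n", pop_work(&w)->high_prio);
    return 0;
}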
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
85 * of work items waiting for completion 85 * of work items waiting for completion
86 */ 86 */
87 struct list_head order_list; 87 struct list_head order_list;
88 struct list_head prio_order_list;
88 89
89 /* lock for finding the next worker thread to queue on */ 90 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock; 91 spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers); 99int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 100void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work); 101int btrfs_requeue_work(struct btrfs_work *work);
102void btrfs_set_work_high_prio(struct btrfs_work *work);
101#endif 103#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, its silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
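(Both additions in this hunk serve the same commit-ordering problem: ordered_operations tracks inodes whose data must reach disk before the transaction commits, and the ordered_data_close bitflag marks a truncated-to-zero file so a later release puts it on that list. A small sketch of the bookkeeping, with hypothetical names:)

/* Sketch: a commit-time flush list for inodes, driven by a single
 * bitflag set at truncate time. */
#include <stdio.h>

struct inode_x {
    int id;
    unsigned ordered_data_close:1;
    struct inode_x *next_ordered;   /* ~list_head ordered_operations */
};

static struct inode_x *ordered_ops;  /* per-transaction list */

static void file_release(struct inode_x *inode)
{
    if (inode->ordered_data_close) {
        inode->ordered_data_close = 0;
        inode->next_ordered = ordered_ops;   /* queue for commit */
        ordered_ops = inode;
    }
}

static void transaction_commit(void)
{
    struct inode_x *i;

    /* flush data for every queued inode before writing the commit */
    for (i = ordered_ops; i; i = i->next_ordered)
        printf("flushing inode %d before commit\n", i->id);
    ordered_ops = NULL;
}

int main(void)
{
    struct inode_x ino = { 42, 1, NULL };
    file_release(&ino);
    transaction_commit();
    return 0;
}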
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..fedf8b9f03a2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
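(With prealloc_dest gone, every cow goes through btrfs_alloc_free_block, and the helper's job reduces to: allocate a fresh block near search_start, copy the contents, rewire the parent. A toy sketch of copy-on-write for an in-memory node, with hypothetical types and no reference counting or locking:)

/* Copy-on-write sketch: never modify a shared node in place;
 * allocate a copy and swap the parent's pointer to it. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    unsigned long long generation;
    char payload[32];
};

/* ~__btrfs_cow_block: returns the writable copy via *cow_ret */
static int cow_block(struct node *buf, struct node **parent_slot,
                     struct node **cow_ret, unsigned long long transid)
{
    struct node *cow;

    if (buf->generation == transid) {   /* already ours this transaction */
        *cow_ret = buf;
        return 0;
    }
    cow = malloc(sizeof(*cow));
    if (!cow)
        return -1;
    memcpy(cow, buf, sizeof(*cow));
    cow->generation = transid;
    *parent_slot = cow;                 /* point the parent at the copy */
    *cow_ret = cow;
    return 0;
}

int main(void)
{
    struct node old = { 5, "shared with a snapshot" };
    struct node *slot = &old, *writable;

    cow_block(&old, &slot, &writable, 9);
    printf("old gen %llu, new gen %llu\n",
           old.generation, writable->generation);
    free(writable);
    return 0;
}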
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1262,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1262 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1263 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1264 */ 1246 */
1265static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1266 struct btrfs_path *path, 1248 struct btrfs_path *path,
1267 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1268{ 1250{
1269 struct extent_buffer *node; 1251 struct extent_buffer *node;
1270 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1343,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1343 int ret = 0; 1325 int ret = 0;
1344 int blocksize; 1326 int blocksize;
1345 1327
1346 parent = path->nodes[level - 1]; 1328 parent = path->nodes[level + 1];
1347 if (!parent) 1329 if (!parent)
1348 return 0; 1330 return 0;
1349 1331
1350 nritems = btrfs_header_nritems(parent); 1332 nritems = btrfs_header_nritems(parent);
1351 slot = path->slots[level]; 1333 slot = path->slots[level + 1];
1352 blocksize = btrfs_level_size(root, level); 1334 blocksize = btrfs_level_size(root, level);
1353 1335
1354 if (slot > 0) { 1336 if (slot > 0) {
@@ -1359,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1359 block1 = 0; 1341 block1 = 0;
1360 free_extent_buffer(eb); 1342 free_extent_buffer(eb);
1361 } 1343 }
1362 if (slot < nritems) { 1344 if (slot + 1 < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1); 1345 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1); 1346 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize); 1347 eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1369,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1369 } 1351 }
1370 if (block1 || block2) { 1352 if (block1 || block2) {
1371 ret = -EAGAIN; 1353 ret = -EAGAIN;
1354
1355 /* release the whole path */
1372 btrfs_release_path(root, path); 1356 btrfs_release_path(root, path);
1357
1358 /* read the blocks */
1373 if (block1) 1359 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0); 1360 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2) 1361 if (block2)
@@ -1379,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1379 eb = read_tree_block(root, block1, blocksize, 0); 1365 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb); 1366 free_extent_buffer(eb);
1381 } 1367 }
1382 if (block1) { 1368 if (block2) {
1383 eb = read_tree_block(root, block2, blocksize, 0); 1369 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb); 1370 free_extent_buffer(eb);
1385 } 1371 }
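(The fix above is subtle: the siblings of path->nodes[level] live in the parent at level + 1, not level - 1, the right-hand neighbour exists only when slot + 1 < nritems, and the old code read block1 twice instead of block1 then block2. The neighbour selection in isolation, over a hypothetical array-based parent:)

/* Sketch of picking the two neighbours to read ahead around
 * parent slot 'slot'. */
#include <stdio.h>

static void pick_neighbours(const unsigned long long *blockptr,
                            unsigned int nritems, unsigned int slot,
                            unsigned long long *block1,
                            unsigned long long *block2)
{
    *block1 = *block2 = 0;
    if (slot > 0)                 /* left sibling exists */
        *block1 = blockptr[slot - 1];
    if (slot + 1 < nritems)       /* right sibling exists */
        *block2 = blockptr[slot + 1];
}

int main(void)
{
    unsigned long long ptrs[] = { 100, 200, 300 };
    unsigned long long b1, b2;

    pick_neighbours(ptrs, 3, 2, &b1, &b2);   /* last slot: no right */
    printf("left=%llu right=%llu\n", b1, b2);
    return 0;
}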
@@ -1465,6 +1451,138 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1465} 1451}
1466 1452
1467/* 1453/*
1454 * helper function for btrfs_search_slot. The goal is to find a block
1455 * in cache without setting the path to blocking. If we find the block
1456 * we return zero and the path is unchanged.
1457 *
1458 * If we can't find the block, we set the path blocking and do some
1459 * reada. -EAGAIN is returned and the search must be repeated.
1460 */
1461static int
1462read_block_for_search(struct btrfs_trans_handle *trans,
1463 struct btrfs_root *root, struct btrfs_path *p,
1464 struct extent_buffer **eb_ret, int level, int slot,
1465 struct btrfs_key *key)
1466{
1467 u64 blocknr;
1468 u64 gen;
1469 u32 blocksize;
1470 struct extent_buffer *b = *eb_ret;
1471 struct extent_buffer *tmp;
1472 int ret;
1473
1474 blocknr = btrfs_node_blockptr(b, slot);
1475 gen = btrfs_node_ptr_generation(b, slot);
1476 blocksize = btrfs_level_size(root, level - 1);
1477
1478 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1479 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1480 /*
1481 * we found an up to date block without sleeping, return
1482 * right away
1483 */
1484 *eb_ret = tmp;
1485 return 0;
1486 }
1487
1488 /*
1489 * reduce lock contention at high levels
1490 * of the btree by dropping locks before
1491 * we read. Don't release the lock on the current
1492 * level because we need to walk this node to figure
1493 * out which blocks to read.
1494 */
1495 btrfs_unlock_up_safe(p, level + 1);
1496 btrfs_set_path_blocking(p);
1497
1498 if (tmp)
1499 free_extent_buffer(tmp);
1500 if (p->reada)
1501 reada_for_search(root, p, level, slot, key->objectid);
1502
1503 btrfs_release_path(NULL, p);
1504
1505 ret = -EAGAIN;
1506 tmp = read_tree_block(root, blocknr, blocksize, gen);
1507 if (tmp) {
1508 /*
1509 * If the read above didn't mark this buffer up to date,
1510 * it will never end up being up to date. Set ret to EIO now
1511 * and give up so that our caller doesn't loop forever
1512 * on our EAGAINs.
1513 */
1514 if (!btrfs_buffer_uptodate(tmp, 0))
1515 ret = -EIO;
1516 free_extent_buffer(tmp);
1517 }
1518 return ret;
1519}
1520
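(The contract of the helper above is worth spelling out: a cache hit returns 0 with the path untouched; a miss drops the locks, issues the read, and returns -EAGAIN so the caller restarts the whole search, or -EIO if the read can never succeed. A minimal sketch of that try-fast-path-else-prepare-and-retry shape, with a hypothetical stand-in for the block cache:)

/* Sketch of the -EAGAIN retry contract used by the search helpers;
 * errno-style negative returns as in the kernel code. */
#include <stdio.h>
#include <errno.h>

static int cache_warm;   /* stands in for btrfs_find_tree_block() */

static int read_block(int *out)
{
    if (cache_warm) {        /* found up to date without sleeping */
        *out = 42;
        return 0;
    }
    /* slow path: drop locks, read from disk, then tell the caller
     * to start over from the root */
    cache_warm = 1;
    return -EAGAIN;
}

static int search(void)
{
    int val, ret;
again:
    ret = read_block(&val);
    if (ret == -EAGAIN)
        goto again;          /* the path was released; restart */
    if (ret)
        return ret;          /* -EIO: give up, don't loop forever */
    return val;
}

int main(void)
{
    printf("search() = %d\n", search());
    return 0;
}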
1521/*
1522 * helper function for btrfs_search_slot. This does all of the checks
1523 * for node-level blocks and does any balancing required based on
1524 * the ins_len.
1525 *
1526 * If no extra work was required, zero is returned. If we had to
1527 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1528 * start over
1529 */
1530static int
1531setup_nodes_for_search(struct btrfs_trans_handle *trans,
1532 struct btrfs_root *root, struct btrfs_path *p,
1533 struct extent_buffer *b, int level, int ins_len)
1534{
1535 int ret;
1536 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1537 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1538 int sret;
1539
1540 sret = reada_for_balance(root, p, level);
1541 if (sret)
1542 goto again;
1543
1544 btrfs_set_path_blocking(p);
1545 sret = split_node(trans, root, p, level);
1546 btrfs_clear_path_blocking(p, NULL);
1547
1548 BUG_ON(sret > 0);
1549 if (sret) {
1550 ret = sret;
1551 goto done;
1552 }
1553 b = p->nodes[level];
1554 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1555 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1556 int sret;
1557
1558 sret = reada_for_balance(root, p, level);
1559 if (sret)
1560 goto again;
1561
1562 btrfs_set_path_blocking(p);
1563 sret = balance_level(trans, root, p, level);
1564 btrfs_clear_path_blocking(p, NULL);
1565
1566 if (sret) {
1567 ret = sret;
1568 goto done;
1569 }
1570 b = p->nodes[level];
1571 if (!b) {
1572 btrfs_release_path(NULL, p);
1573 goto again;
1574 }
1575 BUG_ON(btrfs_header_nritems(b) == 1);
1576 }
1577 return 0;
1578
1579again:
1580 ret = -EAGAIN;
1581done:
1582 return ret;
1583}
1584
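(setup_nodes_for_search concentrates the two balancing triggers in one place: an insert, ins_len > 0, splits a node that is within three pointers of full, and a delete, ins_len < 0, rebalances a node that has dropped below a quarter full. The thresholds in isolation; NODEPTRS is an illustrative stand-in for BTRFS_NODEPTRS_PER_BLOCK(root):)

/* Sketch of the node-balancing thresholds. */
#include <stdio.h>

#define NODEPTRS 121   /* illustrative value */

enum action { NOTHING, SPLIT, BALANCE };

static enum action node_action(unsigned int nritems, int ins_len)
{
    if (ins_len > 0 && nritems >= NODEPTRS - 3)
        return SPLIT;            /* nearly full: split before insert */
    if (ins_len < 0 && nritems < NODEPTRS / 4)
        return BALANCE;          /* nearly empty: merge/steal first */
    return NOTHING;
}

int main(void)
{
    printf("%d %d %d\n",
           node_action(119, 1),     /* SPLIT */
           node_action(20, -1),     /* BALANCE */
           node_action(60, 1));     /* NOTHING */
    return 0;
}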
1585/*
1468 * look for key in the tree. path is filled in with nodes along the way 1586 * look for key in the tree. path is filled in with nodes along the way
1469 * if key is found, we return zero and you can find the item in the leaf 1587 * if key is found, we return zero and you can find the item in the leaf
1470 * level of the path (level 0) 1588 * level of the path (level 0)
@@ -1482,17 +1600,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1482 ins_len, int cow) 1600 ins_len, int cow)
1483{ 1601{
1484 struct extent_buffer *b; 1602 struct extent_buffer *b;
1485 struct extent_buffer *tmp;
1486 int slot; 1603 int slot;
1487 int ret; 1604 int ret;
1488 int level; 1605 int level;
1489 int should_reada = p->reada;
1490 int lowest_unlock = 1; 1606 int lowest_unlock = 1;
1491 int blocksize;
1492 u8 lowest_level = 0; 1607 u8 lowest_level = 0;
1493 u64 blocknr;
1494 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1608
1497 lowest_level = p->lowest_level; 1609 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1610 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1613,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1613 if (ins_len < 0)
1502 lowest_unlock = 2; 1614 lowest_unlock = 2;
1503 1615
1504 prealloc_block.objectid = 0;
1505
1506again: 1616again:
1507 if (p->skip_locking) 1617 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1618 b = btrfs_root_node(root);
@@ -1523,50 +1633,21 @@ again:
1523 if (cow) { 1633 if (cow) {
1524 int wret; 1634 int wret;
1525 1635
1526 /* is a cow on this block not required */ 1636 /*
1637 * if we don't really need to cow this block
1638 * then we don't want to set the path blocking,
1639 * so we test it here
1640 */
1527 if (btrfs_header_generation(b) == trans->transid && 1641 if (btrfs_header_generation(b) == trans->transid &&
1528 btrfs_header_owner(b) == root->root_key.objectid && 1642 btrfs_header_owner(b) == root->root_key.objectid &&
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1643 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1644 goto cow_done;
1531 } 1645 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1646 btrfs_set_path_blocking(p);
1564 1647
1565 wret = btrfs_cow_block(trans, root, b, 1648 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1649 p->nodes[level + 1],
1567 p->slots[level + 1], 1650 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1651 if (wret) {
1571 free_extent_buffer(b); 1652 free_extent_buffer(b);
1572 ret = wret; 1653 ret = wret;
@@ -1611,51 +1692,15 @@ cow_done:
1611 if (ret && slot > 0) 1692 if (ret && slot > 0)
1612 slot -= 1; 1693 slot -= 1;
1613 p->slots[level] = slot; 1694 p->slots[level] = slot;
1614 if ((p->search_for_split || ins_len > 0) && 1695 ret = setup_nodes_for_search(trans, root, p, b, level,
1615 btrfs_header_nritems(b) >= 1696 ins_len);
1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1697 if (ret == -EAGAIN)
1617 int sret; 1698 goto again;
1618 1699 else if (ret)
1619 sret = reada_for_balance(root, p, level); 1700 goto done;
1620 if (sret) 1701 b = p->nodes[level];
1621 goto again; 1702 slot = p->slots[level];
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1627 BUG_ON(sret > 0);
1628 if (sret) {
1629 ret = sret;
1630 goto done;
1631 }
1632 b = p->nodes[level];
1633 slot = p->slots[level];
1634 } else if (ins_len < 0 &&
1635 btrfs_header_nritems(b) <
1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646 1703
1647 if (sret) {
1648 ret = sret;
1649 goto done;
1650 }
1651 b = p->nodes[level];
1652 if (!b) {
1653 btrfs_release_path(NULL, p);
1654 goto again;
1655 }
1656 slot = p->slots[level];
1657 BUG_ON(btrfs_header_nritems(b) == 1);
1658 }
1659 unlock_up(p, level, lowest_unlock); 1704 unlock_up(p, level, lowest_unlock);
1660 1705
1661 /* this is only true while dropping a snapshot */ 1706 /* this is only true while dropping a snapshot */
@@ -1664,44 +1709,14 @@ cow_done:
1664 goto done; 1709 goto done;
1665 } 1710 }
1666 1711
1667 blocknr = btrfs_node_blockptr(b, slot); 1712 ret = read_block_for_search(trans, root, p,
1668 gen = btrfs_node_ptr_generation(b, slot); 1713 &b, level, slot, key);
1669 blocksize = btrfs_level_size(root, level - 1); 1714 if (ret == -EAGAIN)
1715 goto again;
1716
1717 if (ret == -EIO)
1718 goto done;
1670 1719
1671 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1672 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673 b = tmp;
1674 } else {
1675 /*
1676 * reduce lock contention at high levels
1677 * of the btree by dropping locks before
1678 * we read.
1679 */
1680 if (level > 0) {
1681 btrfs_release_path(NULL, p);
1682 if (tmp)
1683 free_extent_buffer(tmp);
1684 if (should_reada)
1685 reada_for_search(root, p,
1686 level, slot,
1687 key->objectid);
1688
1689 tmp = read_tree_block(root, blocknr,
1690 blocksize, gen);
1691 if (tmp)
1692 free_extent_buffer(tmp);
1693 goto again;
1694 } else {
1695 btrfs_set_path_blocking(p);
1696 if (tmp)
1697 free_extent_buffer(tmp);
1698 if (should_reada)
1699 reada_for_search(root, p,
1700 level, slot,
1701 key->objectid);
1702 b = read_node_slot(root, b, slot);
1703 }
1704 }
1705 if (!p->skip_locking) { 1720 if (!p->skip_locking) {
1706 int lret; 1721 int lret;
1707 1722
@@ -1742,12 +1757,10 @@ done:
1742 * we don't really know what they plan on doing with the path 1757 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1758 * from here on, so for now just mark it as blocking
1744 */ 1759 */
1745 btrfs_set_path_blocking(p); 1760 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1761 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root, 1762 if (ret < 0)
1748 prealloc_block.objectid, 1763 btrfs_release_path(root, p);
1749 prealloc_block.offset);
1750 }
1751 return ret; 1764 return ret;
1752} 1765}
1753 1766
@@ -1768,7 +1781,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1781 int ret;
1769 1782
1770 eb = btrfs_lock_root_node(root); 1783 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1784 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1785 BUG_ON(ret);
1773 1786
1774 btrfs_set_lock_blocking(eb); 1787 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1839,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1839 }
1827 1840
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1841 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1842 &eb);
1830 BUG_ON(ret); 1843 BUG_ON(ret);
1831 1844
1832 if (root->root_key.objectid == 1845 if (root->root_key.objectid ==
@@ -2139,7 +2152,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2152 spin_unlock(&root->node_lock);
2140 2153
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2154 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2155 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2156 root->root_key.objectid,
2144 trans->transid, level - 1); 2157 trans->transid, level - 1);
2145 BUG_ON(ret); 2158 BUG_ON(ret);
@@ -2174,8 +2187,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2174 BUG_ON(!path->nodes[level]); 2187 BUG_ON(!path->nodes[level]);
2175 lower = path->nodes[level]; 2188 lower = path->nodes[level];
2176 nritems = btrfs_header_nritems(lower); 2189 nritems = btrfs_header_nritems(lower);
2177 if (slot > nritems) 2190 BUG_ON(slot > nritems);
2178 BUG();
2179 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2191 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2180 BUG(); 2192 BUG();
2181 if (slot != nritems) { 2193 if (slot != nritems) {
@@ -2221,7 +2233,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2233 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2234 if (ret)
2223 return ret; 2235 return ret;
2224 } else { 2236 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2237 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2238 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2239 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2341,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2341 return ret;
2330} 2342}
2331 2343
2332/* 2344static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2345 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2346 struct btrfs_path *path,
2335 * 2347 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2348 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2349 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2350{
2343 struct extent_buffer *left = path->nodes[0]; 2351 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2352 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2353 struct btrfs_disk_key disk_key;
2347 int slot; 2354 int slot;
2348 u32 i; 2355 u32 i;
2349 int free_space;
2350 int push_space = 0; 2356 int push_space = 0;
2351 int push_items = 0; 2357 int push_items = 0;
2352 struct btrfs_item *item; 2358 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2359 u32 nr;
2355 u32 right_nritems; 2360 u32 right_nritems;
2356 u32 data_end; 2361 u32 data_end;
2357 u32 this_item_size; 2362 u32 this_item_size;
2358 int ret; 2363 int ret;
2359 2364
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 btrfs_assert_tree_locked(path->nodes[1]);
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2365 if (empty)
2393 nr = 0; 2366 nr = 0;
2394 else 2367 else
@@ -2397,6 +2370,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2370 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2371 push_space += data_size;
2399 2372
2373 slot = path->slots[1];
2400 i = left_nritems - 1; 2374 i = left_nritems - 1;
2401 while (i >= nr) { 2375 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2376 item = btrfs_item_nr(left, i);
@@ -2528,24 +2502,82 @@ out_unlock:
2528} 2502}
2529 2503
2530/* 2504/*
2505 * push some data in the path leaf to the right, trying to free up at
2506 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2507 *
2508 * returns 1 if the push failed because the other node didn't have enough
2509 * room, 0 if everything worked out and < 0 if there were major errors.
2510 */
2511static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2512 *root, struct btrfs_path *path, int data_size,
2513 int empty)
2514{
2515 struct extent_buffer *left = path->nodes[0];
2516 struct extent_buffer *right;
2517 struct extent_buffer *upper;
2518 int slot;
2519 int free_space;
2520 u32 left_nritems;
2521 int ret;
2522
2523 if (!path->nodes[1])
2524 return 1;
2525
2526 slot = path->slots[1];
2527 upper = path->nodes[1];
2528 if (slot >= btrfs_header_nritems(upper) - 1)
2529 return 1;
2530
2531 btrfs_assert_tree_locked(path->nodes[1]);
2532
2533 right = read_node_slot(root, upper, slot + 1);
2534 btrfs_tree_lock(right);
2535 btrfs_set_lock_blocking(right);
2536
2537 free_space = btrfs_leaf_free_space(root, right);
2538 if (free_space < data_size)
2539 goto out_unlock;
2540
2541 /* cow and double check */
2542 ret = btrfs_cow_block(trans, root, right, upper,
2543 slot + 1, &right);
2544 if (ret)
2545 goto out_unlock;
2546
2547 free_space = btrfs_leaf_free_space(root, right);
2548 if (free_space < data_size)
2549 goto out_unlock;
2550
2551 left_nritems = btrfs_header_nritems(left);
2552 if (left_nritems == 0)
2553 goto out_unlock;
2554
2555 return __push_leaf_right(trans, root, path, data_size, empty,
2556 right, free_space, left_nritems);
2557out_unlock:
2558 btrfs_tree_unlock(right);
2559 free_extent_buffer(right);
2560 return 1;
2561}
2562
2563/*
2531 * push some data in the path leaf to the left, trying to free up at 2564 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2565 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2566 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2567static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2568 struct btrfs_root *root,
2536 int empty) 2569 struct btrfs_path *path, int data_size,
2570 int empty, struct extent_buffer *left,
2571 int free_space, int right_nritems)
2537{ 2572{
2538 struct btrfs_disk_key disk_key; 2573 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2574 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2575 int slot;
2542 int i; 2576 int i;
2543 int free_space;
2544 int push_space = 0; 2577 int push_space = 0;
2545 int push_items = 0; 2578 int push_items = 0;
2546 struct btrfs_item *item; 2579 struct btrfs_item *item;
2547 u32 old_left_nritems; 2580 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2581 u32 nr;
2550 int ret = 0; 2582 int ret = 0;
2551 int wret; 2583 int wret;
@@ -2553,41 +2585,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2585 u32 old_left_item_size;
2554 2586
2555 slot = path->slots[1]; 2587 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 btrfs_assert_tree_locked(path->nodes[1]);
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2588
2592 if (empty) 2589 if (empty)
2593 nr = right_nritems; 2590 nr = right_nritems;
@@ -2755,6 +2752,154 @@ out:
2755} 2752}
2756 2753
2757/* 2754/*
2755 * push some data in the path leaf to the left, trying to free up at
2756 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2757 */
2758static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2759 *root, struct btrfs_path *path, int data_size,
2760 int empty)
2761{
2762 struct extent_buffer *right = path->nodes[0];
2763 struct extent_buffer *left;
2764 int slot;
2765 int free_space;
2766 u32 right_nritems;
2767 int ret = 0;
2768
2769 slot = path->slots[1];
2770 if (slot == 0)
2771 return 1;
2772 if (!path->nodes[1])
2773 return 1;
2774
2775 right_nritems = btrfs_header_nritems(right);
2776 if (right_nritems == 0)
2777 return 1;
2778
2779 btrfs_assert_tree_locked(path->nodes[1]);
2780
2781 left = read_node_slot(root, path->nodes[1], slot - 1);
2782 btrfs_tree_lock(left);
2783 btrfs_set_lock_blocking(left);
2784
2785 free_space = btrfs_leaf_free_space(root, left);
2786 if (free_space < data_size) {
2787 ret = 1;
2788 goto out;
2789 }
2790
2791 /* cow and double check */
2792 ret = btrfs_cow_block(trans, root, left,
2793 path->nodes[1], slot - 1, &left);
2794 if (ret) {
2795 /* we hit -ENOSPC, but it isn't fatal here */
2796 ret = 1;
2797 goto out;
2798 }
2799
2800 free_space = btrfs_leaf_free_space(root, left);
2801 if (free_space < data_size) {
2802 ret = 1;
2803 goto out;
2804 }
2805
2806 return __push_leaf_left(trans, root, path, data_size,
2807 empty, left, free_space, right_nritems);
2808out:
2809 btrfs_tree_unlock(left);
2810 free_extent_buffer(left);
2811 return ret;
2812}
2813
2814/*
2815 * split the path's leaf in two, making sure there is at least data_size
2816 * available for the resulting leaf level of the path.
2817 *
2818 * returns 0 if all went well and < 0 on failure.
2819 */
2820static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2821 struct btrfs_root *root,
2822 struct btrfs_path *path,
2823 struct extent_buffer *l,
2824 struct extent_buffer *right,
2825 int slot, int mid, int nritems)
2826{
2827 int data_copy_size;
2828 int rt_data_off;
2829 int i;
2830 int ret = 0;
2831 int wret;
2832 struct btrfs_disk_key disk_key;
2833
2834 nritems = nritems - mid;
2835 btrfs_set_header_nritems(right, nritems);
2836 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2837
2838 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2839 btrfs_item_nr_offset(mid),
2840 nritems * sizeof(struct btrfs_item));
2841
2842 copy_extent_buffer(right, l,
2843 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2844 data_copy_size, btrfs_leaf_data(l) +
2845 leaf_data_end(root, l), data_copy_size);
2846
2847 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2848 btrfs_item_end_nr(l, mid);
2849
2850 for (i = 0; i < nritems; i++) {
2851 struct btrfs_item *item = btrfs_item_nr(right, i);
2852 u32 ioff;
2853
2854 if (!right->map_token) {
2855 map_extent_buffer(right, (unsigned long)item,
2856 sizeof(struct btrfs_item),
2857 &right->map_token, &right->kaddr,
2858 &right->map_start, &right->map_len,
2859 KM_USER1);
2860 }
2861
2862 ioff = btrfs_item_offset(right, item);
2863 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2864 }
2865
2866 if (right->map_token) {
2867 unmap_extent_buffer(right, right->map_token, KM_USER1);
2868 right->map_token = NULL;
2869 }
2870
2871 btrfs_set_header_nritems(l, mid);
2872 ret = 0;
2873 btrfs_item_key(right, &disk_key, 0);
2874 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2875 path->slots[1] + 1, 1);
2876 if (wret)
2877 ret = wret;
2878
2879 btrfs_mark_buffer_dirty(right);
2880 btrfs_mark_buffer_dirty(l);
2881 BUG_ON(path->slots[0] != slot);
2882
2883 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 BUG_ON(ret);
2885
2886 if (mid <= slot) {
2887 btrfs_tree_unlock(path->nodes[0]);
2888 free_extent_buffer(path->nodes[0]);
2889 path->nodes[0] = right;
2890 path->slots[0] -= mid;
2891 path->slots[1] += 1;
2892 } else {
2893 btrfs_tree_unlock(right);
2894 free_extent_buffer(right);
2895 }
2896
2897 BUG_ON(path->slots[0] < 0);
2898
2899 return ret;
2900}
2901
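(copy_for_split is the mechanical half of a leaf split: move items [mid, nritems) into the new right leaf, then rebase every copied item's data offset by rt_data_off, since leaf data is packed backward from the end of the block. The offset arithmetic on a simplified leaf:)

/* Sketch of rebasing item offsets when the tail of a leaf is copied
 * into a fresh one.  Data is packed from the end of the leaf, so the
 * copied items shift up by the space their region left behind. */
#include <stdio.h>

#define LEAF_DATA_SIZE 4096

struct item { unsigned int offset, size; };

static void split_offsets(struct item *right, const struct item *left,
                          unsigned int mid, unsigned int nritems)
{
    /* high end of the first moved item's data, ~btrfs_item_end_nr(l, mid) */
    unsigned int moved_end = left[mid].offset + left[mid].size;
    unsigned int rt_data_off = LEAF_DATA_SIZE - moved_end;
    unsigned int i;

    for (i = 0; i < nritems - mid; i++) {
        right[i] = left[mid + i];
        right[i].offset += rt_data_off;   /* repack against the end */
    }
}

int main(void)
{
    struct item l[3] = { {4000, 96}, {3900, 100}, {3800, 100} };
    struct item r[2];

    split_offsets(r, l, 1, 3);
    printf("right[0] at %u, right[1] at %u\n", r[0].offset, r[1].offset);
    return 0;
}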
2902/*
2758 * split the path's leaf in two, making sure there is at least data_size 2903 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2904 * available for the resulting leaf level of the path.
2760 * 2905 *
@@ -2771,17 +2916,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2916 int mid;
2772 int slot; 2917 int slot;
2773 struct extent_buffer *right; 2918 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2919 int ret = 0;
2778 int wret; 2920 int wret;
2779 int double_split; 2921 int double_split;
2780 int num_doubles = 0; 2922 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2923
2783 /* first try to make some room by pushing left and right */ 2924 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2925 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2926 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2927 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2928 if (wret < 0)
2787 return wret; 2929 return wret;
@@ -2830,11 +2972,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2972 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2973 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2974 BTRFS_UUID_SIZE);
2975
2833 if (mid <= slot) { 2976 if (mid <= slot) {
2834 if (nritems == 1 || 2977 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2978 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2979 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2980 if (slot >= nritems) {
2981 struct btrfs_disk_key disk_key;
2982
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2983 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2984 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2985 wret = insert_ptr(trans, root, path,
@@ -2862,6 +3007,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 3007 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 3008 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 3009 if (!extend && data_size && slot == 0) {
3010 struct btrfs_disk_key disk_key;
3011
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3012 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 3013 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 3014 wret = insert_ptr(trans, root, path,
@@ -2894,76 +3041,16 @@ again:
2894 } 3041 }
2895 } 3042 }
2896 } 3043 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900 3044
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3045 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928
2929 if (right->map_token) {
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933
2934 btrfs_set_header_nritems(l, mid);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2947 BUG_ON(ret); 3046 BUG_ON(ret);
2948 3047
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 3048 if (double_split) {
2963 BUG_ON(num_doubles != 0); 3049 BUG_ON(num_doubles != 0);
2964 num_doubles++; 3050 num_doubles++;
2965 goto again; 3051 goto again;
2966 } 3052 }
3053
2967 return ret; 3054 return ret;
2968} 3055}
2969 3056
@@ -3021,26 +3108,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3108 return -EAGAIN;
3022 } 3109 }
3023 3110
3111 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3112 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3113 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3114 path->keep_locks = 0;
3027 BUG_ON(ret); 3115 BUG_ON(ret);
3028 3116
3117 btrfs_unlock_up_safe(path, 1);
3118 leaf = path->nodes[0];
3119 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3120
3121split:
3029 /* 3122 /*
3030 * make sure any changes to the path from split_leaf leave it 3123 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3124 * in a blocking state
3032 */ 3125 */
3033 btrfs_set_path_blocking(path); 3126 btrfs_set_path_blocking(path);
3034 3127
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3128 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3129 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3130 item_size = btrfs_item_size(leaf, item);
3042 3131
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3132 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3133 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3134 path->slots[0]), item_size);
@@ -3445,39 +3533,27 @@ out:
3445} 3533}
3446 3534
3447/* 3535/*
3448 * Given a key and some data, insert items into the tree. 3536 * this is a helper for btrfs_insert_empty_items, the main goal here is
3449 * This does all the path init required, making room in the tree if needed. 3537 * to save stack depth by doing the bulk of the work in a function
3538 * that doesn't call btrfs_search_slot
3450 */ 3539 */
3451int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3540static noinline_for_stack int
3452 struct btrfs_root *root, 3541setup_items_for_insert(struct btrfs_trans_handle *trans,
3453 struct btrfs_path *path, 3542 struct btrfs_root *root, struct btrfs_path *path,
3454 struct btrfs_key *cpu_key, u32 *data_size, 3543 struct btrfs_key *cpu_key, u32 *data_size,
3455 int nr) 3544 u32 total_data, u32 total_size, int nr)
3456{ 3545{
3457 struct extent_buffer *leaf;
3458 struct btrfs_item *item; 3546 struct btrfs_item *item;
3459 int ret = 0;
3460 int slot;
3461 int slot_orig;
3462 int i; 3547 int i;
3463 u32 nritems; 3548 u32 nritems;
3464 u32 total_size = 0;
3465 u32 total_data = 0;
3466 unsigned int data_end; 3549 unsigned int data_end;
3467 struct btrfs_disk_key disk_key; 3550 struct btrfs_disk_key disk_key;
3551 int ret;
3552 struct extent_buffer *leaf;
3553 int slot;
3468 3554
3469 for (i = 0; i < nr; i++)
3470 total_data += data_size[i];
3471
3472 total_size = total_data + (nr * sizeof(struct btrfs_item));
3473 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3474 if (ret == 0)
3475 return -EEXIST;
3476 if (ret < 0)
3477 goto out;
3478
3479 slot_orig = path->slots[0];
3480 leaf = path->nodes[0]; 3555 leaf = path->nodes[0];
3556 slot = path->slots[0];
3481 3557
3482 nritems = btrfs_header_nritems(leaf); 3558 nritems = btrfs_header_nritems(leaf);
3483 data_end = leaf_data_end(root, leaf); 3559 data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3565,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3489 BUG(); 3565 BUG();
3490 } 3566 }
3491 3567
3492 slot = path->slots[0];
3493 BUG_ON(slot < 0);
3494
3495 if (slot != nritems) { 3568 if (slot != nritems) {
3496 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3569 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497 3570
@@ -3547,21 +3620,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3547 data_end -= data_size[i]; 3620 data_end -= data_size[i];
3548 btrfs_set_item_size(leaf, item, data_size[i]); 3621 btrfs_set_item_size(leaf, item, data_size[i]);
3549 } 3622 }
3623
3550 btrfs_set_header_nritems(leaf, nritems + nr); 3624 btrfs_set_header_nritems(leaf, nritems + nr);
3551 btrfs_mark_buffer_dirty(leaf);
3552 3625
3553 ret = 0; 3626 ret = 0;
3554 if (slot == 0) { 3627 if (slot == 0) {
3628 struct btrfs_disk_key disk_key;
3555 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3629 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3630 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557 } 3631 }
3632 btrfs_unlock_up_safe(path, 1);
3633 btrfs_mark_buffer_dirty(leaf);
3558 3634
3559 if (btrfs_leaf_free_space(root, leaf) < 0) { 3635 if (btrfs_leaf_free_space(root, leaf) < 0) {
3560 btrfs_print_leaf(root, leaf); 3636 btrfs_print_leaf(root, leaf);
3561 BUG(); 3637 BUG();
3562 } 3638 }
3639 return ret;
3640}
3641
3642/*
3643 * Given a key and some data, insert items into the tree.
3644 * This does all the path init required, making room in the tree if needed.
3645 */
3646int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3647 struct btrfs_root *root,
3648 struct btrfs_path *path,
3649 struct btrfs_key *cpu_key, u32 *data_size,
3650 int nr)
3651{
3652 struct extent_buffer *leaf;
3653 int ret = 0;
3654 int slot;
3655 int i;
3656 u32 total_size = 0;
3657 u32 total_data = 0;
3658
3659 for (i = 0; i < nr; i++)
3660 total_data += data_size[i];
3661
3662 total_size = total_data + (nr * sizeof(struct btrfs_item));
3663 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3664 if (ret == 0)
3665 return -EEXIST;
3666 if (ret < 0)
3667 goto out;
3668
3669 leaf = path->nodes[0];
3670 slot = path->slots[0];
3671 BUG_ON(slot < 0);
3672
3673 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3674 total_data, total_size, nr);
3675
3563out: 3676out:
3564 btrfs_unlock_up_safe(path, 1);
3565 return ret; 3677 return ret;
3566} 3678}
3567 3679
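After this split, btrfs_insert_empty_items is a thin wrapper: it sums the item sizes, makes the one btrfs_search_slot call, and lets the noinline helper do the leaf surgery on a shallower stack. For reference, a caller sketch, roughly what btrfs_insert_item does via the single-item btrfs_insert_empty_item wrapper (trans, root, key, data and data_size are assumed to come from the caller):

	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_items(trans, root, path, &key, &data_size, 1);
	if (!ret) {
		/* path now points at the reserved, uninitialized item */
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;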
@@ -3749,7 +3861,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3749 } 3861 }
3750 3862
3751 /* delete the leaf if it is mostly empty */ 3863 /* delete the leaf if it is mostly empty */
3752 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { 3864 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3865 !trans->transaction->delayed_refs.flushing) {
3753 /* push_leaf_left fixes the path. 3866 /* push_leaf_left fixes the path.
3754 * make sure the path still points to our leaf 3867 * make sure the path still points to our leaf
3755 * for possible call to del_ptr below 3868 * for possible call to del_ptr below
@@ -3757,6 +3870,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3757 slot = path->slots[1]; 3870 slot = path->slots[1];
3758 extent_buffer_get(leaf); 3871 extent_buffer_get(leaf);
3759 3872
3873 btrfs_set_path_blocking(path);
3760 wret = push_leaf_left(trans, root, path, 1, 1); 3874 wret = push_leaf_left(trans, root, path, 1, 1);
3761 if (wret < 0 && wret != -ENOSPC) 3875 if (wret < 0 && wret != -ENOSPC)
3762 ret = wret; 3876 ret = wret;
@@ -4042,28 +4156,44 @@ next:
4042int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4156int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4043{ 4157{
4044 int slot; 4158 int slot;
4045 int level = 1; 4159 int level;
4046 struct extent_buffer *c; 4160 struct extent_buffer *c;
4047 struct extent_buffer *next = NULL; 4161 struct extent_buffer *next;
4048 struct btrfs_key key; 4162 struct btrfs_key key;
4049 u32 nritems; 4163 u32 nritems;
4050 int ret; 4164 int ret;
4165 int old_spinning = path->leave_spinning;
4166 int force_blocking = 0;
4051 4167
4052 nritems = btrfs_header_nritems(path->nodes[0]); 4168 nritems = btrfs_header_nritems(path->nodes[0]);
4053 if (nritems == 0) 4169 if (nritems == 0)
4054 return 1; 4170 return 1;
4055 4171
4056 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4172 /*
4173 * we take the blocks in an order that upsets lockdep. Using
4174 * blocking mode is the only way around it.
4175 */
4176#ifdef CONFIG_DEBUG_LOCK_ALLOC
4177 force_blocking = 1;
4178#endif
4057 4179
4180 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4181again:
4182 level = 1;
4183 next = NULL;
4058 btrfs_release_path(root, path); 4184 btrfs_release_path(root, path);
4185
4059 path->keep_locks = 1; 4186 path->keep_locks = 1;
4187
4188 if (!force_blocking)
4189 path->leave_spinning = 1;
4190
4060 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4191 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4061 path->keep_locks = 0; 4192 path->keep_locks = 0;
4062 4193
4063 if (ret < 0) 4194 if (ret < 0)
4064 return ret; 4195 return ret;
4065 4196
4066 btrfs_set_path_blocking(path);
4067 nritems = btrfs_header_nritems(path->nodes[0]); 4197 nritems = btrfs_header_nritems(path->nodes[0]);
4068 /* 4198 /*
4069 * by releasing the path above we dropped all our locks. A balance 4199 * by releasing the path above we dropped all our locks. A balance
@@ -4073,19 +4203,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4073 */ 4203 */
4074 if (nritems > 0 && path->slots[0] < nritems - 1) { 4204 if (nritems > 0 && path->slots[0] < nritems - 1) {
4075 path->slots[0]++; 4205 path->slots[0]++;
4206 ret = 0;
4076 goto done; 4207 goto done;
4077 } 4208 }
4078 4209
4079 while (level < BTRFS_MAX_LEVEL) { 4210 while (level < BTRFS_MAX_LEVEL) {
4080 if (!path->nodes[level]) 4211 if (!path->nodes[level]) {
4081 return 1; 4212 ret = 1;
4213 goto done;
4214 }
4082 4215
4083 slot = path->slots[level] + 1; 4216 slot = path->slots[level] + 1;
4084 c = path->nodes[level]; 4217 c = path->nodes[level];
4085 if (slot >= btrfs_header_nritems(c)) { 4218 if (slot >= btrfs_header_nritems(c)) {
4086 level++; 4219 level++;
4087 if (level == BTRFS_MAX_LEVEL) 4220 if (level == BTRFS_MAX_LEVEL) {
4088 return 1; 4221 ret = 1;
4222 goto done;
4223 }
4089 continue; 4224 continue;
4090 } 4225 }
4091 4226
@@ -4094,16 +4229,27 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4094 free_extent_buffer(next); 4229 free_extent_buffer(next);
4095 } 4230 }
4096 4231
4097 /* the path was set to blocking above */ 4232 next = c;
4098 if (level == 1 && (path->locks[1] || path->skip_locking) && 4233 ret = read_block_for_search(NULL, root, path, &next, level,
4099 path->reada) 4234 slot, &key);
4100 reada_for_search(root, path, level, slot, 0); 4235 if (ret == -EAGAIN)
4236 goto again;
4237
4238 if (ret < 0) {
4239 btrfs_release_path(root, path);
4240 goto done;
4241 }
4101 4242
4102 next = read_node_slot(root, c, slot);
4103 if (!path->skip_locking) { 4243 if (!path->skip_locking) {
4104 btrfs_assert_tree_locked(c); 4244 ret = btrfs_try_spin_lock(next);
4105 btrfs_tree_lock(next); 4245 if (!ret) {
4106 btrfs_set_lock_blocking(next); 4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4107 } 4253 }
4108 break; 4254 break;
4109 } 4255 }
@@ -4113,27 +4259,47 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4113 c = path->nodes[level]; 4259 c = path->nodes[level];
4114 if (path->locks[level]) 4260 if (path->locks[level])
4115 btrfs_tree_unlock(c); 4261 btrfs_tree_unlock(c);
4262
4116 free_extent_buffer(c); 4263 free_extent_buffer(c);
4117 path->nodes[level] = next; 4264 path->nodes[level] = next;
4118 path->slots[level] = 0; 4265 path->slots[level] = 0;
4119 if (!path->skip_locking) 4266 if (!path->skip_locking)
4120 path->locks[level] = 1; 4267 path->locks[level] = 1;
4268
4121 if (!level) 4269 if (!level)
4122 break; 4270 break;
4123 4271
4124 btrfs_set_path_blocking(path); 4272 ret = read_block_for_search(NULL, root, path, &next, level,
4125 if (level == 1 && path->locks[1] && path->reada) 4273 0, &key);
4126 reada_for_search(root, path, level, slot, 0); 4274 if (ret == -EAGAIN)
4127 next = read_node_slot(root, next, 0); 4275 goto again;
4276
4277 if (ret < 0) {
4278 btrfs_release_path(root, path);
4279 goto done;
4280 }
4281
4128 if (!path->skip_locking) { 4282 if (!path->skip_locking) {
4129 btrfs_assert_tree_locked(path->nodes[level]); 4283 btrfs_assert_tree_locked(path->nodes[level]);
4130 btrfs_tree_lock(next); 4284 ret = btrfs_try_spin_lock(next);
4131 btrfs_set_lock_blocking(next); 4285 if (!ret) {
4286 btrfs_set_path_blocking(path);
4287 btrfs_tree_lock(next);
4288 if (!force_blocking)
4289 btrfs_clear_path_blocking(path, next);
4290 }
4291 if (force_blocking)
4292 btrfs_set_lock_blocking(next);
4132 } 4293 }
4133 } 4294 }
4295 ret = 0;
4134done: 4296done:
4135 unlock_up(path, 0, 1); 4297 unlock_up(path, 0, 1);
4136 return 0; 4298 path->leave_spinning = old_spinning;
4299 if (!old_spinning)
4300 btrfs_set_path_blocking(path);
4301
4302 return ret;
4137} 4303}
4138 4304
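Two behavioral changes for callers here: the done: path now propagates errors from read_block_for_search instead of returning 0 unconditionally, and the -EAGAIN restart is handled internally by the again: loop. A sketch of the canonical iteration pattern this function supports (root, path, key, leaf and found_key assumed declared by the caller):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;	/* error */
			if (ret > 0)
				break;		/* walked past the last leaf */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}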
4139/* 4305/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d8..4414a5d9983a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
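The 8MB cutoff bounds how much data any single file on the ordered-operations list can force into a commit. A hypothetical illustration of the intended check (the real call site is elsewhere in this patch, in the ordered-operations code; inode here is assumed):

	/* pre-flush big files so the commit doesn't have to */
	if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(inode->i_mapping);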
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
136#define BTRFS_FT_MAX 9 143#define BTRFS_FT_MAX 9
137 144
138/* 145/*
139 * the key defines the order in the tree, and so it also defines (optimal) 146 * The key defines the order in the tree, and so it also defines (optimal)
140 * block layout. objectid corresonds to the inode number. The flags 147 * block layout.
141 * tells us things about the object, and is a kind of stream selector. 148 *
142 * so for a given inode, keys with flags of 1 might refer to the inode 149 * objectid corresponds to the inode number.
143 * data, flags of 2 may point to file data in the btree and flags == 3 150 *
144 * may point to extents. 151 * type tells us things about the object, and is a kind of stream selector.
152 * so for a given inode, keys with type of 1 might refer to the inode data,
153 * type of 2 may point to file data in the btree and type == 3 may point to
154 * extents.
145 * 155 *
146 * offset is the starting byte offset for this key in the stream. 156 * offset is the starting byte offset for this key in the stream.
147 * 157 *
@@ -193,7 +203,7 @@ struct btrfs_dev_item {
193 203
194 /* 204 /*
195 * starting byte of this partition on the device, 205 * starting byte of this partition on the device,
196 * to allowr for stripe alignment in the future 206 * to allow for stripe alignment in the future
197 */ 207 */
198 __le64 start_offset; 208 __le64 start_offset;
199 209
@@ -401,15 +411,16 @@ struct btrfs_path {
401 int locks[BTRFS_MAX_LEVEL]; 411 int locks[BTRFS_MAX_LEVEL];
402 int reada; 412 int reada;
403 /* keep some upper locks as we walk down */ 413 /* keep some upper locks as we walk down */
404 int keep_locks;
405 int skip_locking;
406 int lowest_level; 414 int lowest_level;
407 415
408 /* 416 /*
409 * set by btrfs_split_item, tells search_slot to keep all locks 417 * set by btrfs_split_item, tells search_slot to keep all locks
410 * and to force calls to keep space in the nodes 418 * and to force calls to keep space in the nodes
411 */ 419 */
412 int search_for_split; 420 unsigned int search_for_split:1;
421 unsigned int keep_locks:1;
422 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1;
413}; 424};
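search_for_split, keep_locks and skip_locking shrink to single bits, and the new leave_spinning bit asks btrfs_search_slot to return with the cheaper spinning locks still held rather than converting them to blocking locks. The usage pattern, as in the dir-item.c hunk later in this diff:

	path = btrfs_alloc_path();
	path->leave_spinning = 1;	/* keep spinning locks across the search */
	ret = btrfs_search_slot(trans, root, &key, path, ins_len, 1);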
414 425
415/* 426/*
@@ -625,18 +636,35 @@ struct btrfs_space_info {
625 struct rw_semaphore groups_sem; 636 struct rw_semaphore groups_sem;
626}; 637};
627 638
628struct btrfs_free_space { 639/*
629 struct rb_node bytes_index; 640 * free clusters are used to claim free space in relatively large chunks,
630 struct rb_node offset_index; 641 * allowing us to do less seeky writes. They are used for all metadata
631 u64 offset; 642 * allocations and data allocations in ssd mode.
632 u64 bytes; 643 */
644struct btrfs_free_cluster {
645 spinlock_t lock;
646 spinlock_t refill_lock;
647 struct rb_root root;
648
649 /* largest extent in this cluster */
650 u64 max_size;
651
652 /* first extent starting offset */
653 u64 window_start;
654
655 struct btrfs_block_group_cache *block_group;
656 /*
657 * when a cluster is allocated from a block group, we put the
658 * cluster onto a list in the block group so that it can
659 * be freed before the block group is freed.
660 */
661 struct list_head block_group_list;
633}; 662};
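A cluster is refilled under refill_lock and then carved up by allocations. A sketch of the expected calling pattern, assuming the btrfs_find_space_cluster and btrfs_alloc_from_cluster helpers this patch adds in free-space-cache.c (trans, root, block_group and the size variables come from the caller):

	spin_lock(&cluster->refill_lock);
	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
				       search_start, num_bytes, empty_size);
	if (!ret) {
		/* offset == 0 means the cluster could not satisfy the request */
		offset = btrfs_alloc_from_cluster(block_group, cluster,
						  num_bytes, search_start);
	}
	spin_unlock(&cluster->refill_lock);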
634 663
635struct btrfs_block_group_cache { 664struct btrfs_block_group_cache {
636 struct btrfs_key key; 665 struct btrfs_key key;
637 struct btrfs_block_group_item item; 666 struct btrfs_block_group_item item;
638 spinlock_t lock; 667 spinlock_t lock;
639 struct mutex alloc_mutex;
640 struct mutex cache_mutex; 668 struct mutex cache_mutex;
641 u64 pinned; 669 u64 pinned;
642 u64 reserved; 670 u64 reserved;
@@ -648,6 +676,7 @@ struct btrfs_block_group_cache {
648 struct btrfs_space_info *space_info; 676 struct btrfs_space_info *space_info;
649 677
650 /* free space cache stuff */ 678 /* free space cache stuff */
679 spinlock_t tree_lock;
651 struct rb_root free_space_bytes; 680 struct rb_root free_space_bytes;
652 struct rb_root free_space_offset; 681 struct rb_root free_space_offset;
653 682
@@ -659,6 +688,11 @@ struct btrfs_block_group_cache {
659 688
660 /* usage count */ 689 /* usage count */
661 atomic_t count; 690 atomic_t count;
691
692 /* List of struct btrfs_free_clusters for this block group.
693 * Today it will only have one thing on it, but that may change
694 */
695 struct list_head cluster_list;
662}; 696};
663 697
664struct btrfs_leaf_ref_tree { 698struct btrfs_leaf_ref_tree {
@@ -688,15 +722,18 @@ struct btrfs_fs_info {
688 struct rb_root block_group_cache_tree; 722 struct rb_root block_group_cache_tree;
689 723
690 struct extent_io_tree pinned_extents; 724 struct extent_io_tree pinned_extents;
691 struct extent_io_tree pending_del;
692 struct extent_io_tree extent_ins;
693 725
694 /* logical->physical extent mapping */ 726 /* logical->physical extent mapping */
695 struct btrfs_mapping_tree mapping_tree; 727 struct btrfs_mapping_tree mapping_tree;
696 728
697 u64 generation; 729 u64 generation;
698 u64 last_trans_committed; 730 u64 last_trans_committed;
699 u64 last_trans_new_blockgroup; 731
732 /*
733 * this is updated to the current trans every time a full commit
734 * is required instead of the faster short fsync log commits
735 */
736 u64 last_trans_log_full_commit;
700 u64 open_ioctl_trans; 737 u64 open_ioctl_trans;
701 unsigned long mount_opt; 738 unsigned long mount_opt;
702 u64 max_extent; 739 u64 max_extent;
@@ -717,12 +754,20 @@ struct btrfs_fs_info {
717 struct mutex tree_log_mutex; 754 struct mutex tree_log_mutex;
718 struct mutex transaction_kthread_mutex; 755 struct mutex transaction_kthread_mutex;
719 struct mutex cleaner_mutex; 756 struct mutex cleaner_mutex;
720 struct mutex extent_ins_mutex;
721 struct mutex pinned_mutex;
722 struct mutex chunk_mutex; 757 struct mutex chunk_mutex;
723 struct mutex drop_mutex; 758 struct mutex drop_mutex;
724 struct mutex volume_mutex; 759 struct mutex volume_mutex;
725 struct mutex tree_reloc_mutex; 760 struct mutex tree_reloc_mutex;
761
762 /*
763 * this protects the ordered operations list only while we are
764 * processing all of the entries on it. This way we make
765 * sure the commit code doesn't find the list temporarily empty
766 * because another function happens to be doing non-waiting preflush
767 * before jumping into the main commit.
768 */
769 struct mutex ordered_operations_mutex;
770
726 struct list_head trans_list; 771 struct list_head trans_list;
727 struct list_head hashers; 772 struct list_head hashers;
728 struct list_head dead_roots; 773 struct list_head dead_roots;
@@ -737,10 +782,29 @@ struct btrfs_fs_info {
737 * ordered extents 782 * ordered extents
738 */ 783 */
739 spinlock_t ordered_extent_lock; 784 spinlock_t ordered_extent_lock;
785
786 /*
787 * all of the data=ordered extents pending writeback
788 * these can span multiple transactions and basically include
789 * every dirty data page that isn't from nodatacow
790 */
740 struct list_head ordered_extents; 791 struct list_head ordered_extents;
792
793 /*
794 * all of the inodes that have delalloc bytes. It is possible for
795 * this list to be empty even when there is still dirty data=ordered
796 * extents waiting to finish IO.
797 */
741 struct list_head delalloc_inodes; 798 struct list_head delalloc_inodes;
742 799
743 /* 800 /*
801 * special rename and truncate targets that must be on disk before
802 * we're allowed to commit. This is basically the ext3 style
803 * data=ordered list.
804 */
805 struct list_head ordered_operations;
806
807 /*
744 * there is a pool of worker threads for checksumming during writes 808 * there is a pool of worker threads for checksumming during writes
745 * and a pool for checksumming after reads. This is because readers 809 * and a pool for checksumming after reads. This is because readers
746 * can run with FS locks held, and the writers may be waiting for 810 * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +845,11 @@ struct btrfs_fs_info {
781 atomic_t throttle_gen; 845 atomic_t throttle_gen;
782 846
783 u64 total_pinned; 847 u64 total_pinned;
848
849 /* protected by the delalloc lock, used to keep from writing
850 * metadata until there is a nice batch
851 */
852 u64 dirty_metadata_bytes;
784 struct list_head dirty_cowonly_roots; 853 struct list_head dirty_cowonly_roots;
785 854
786 struct btrfs_fs_devices *fs_devices; 855 struct btrfs_fs_devices *fs_devices;
@@ -795,8 +864,12 @@ struct btrfs_fs_info {
795 spinlock_t delalloc_lock; 864 spinlock_t delalloc_lock;
796 spinlock_t new_trans_lock; 865 spinlock_t new_trans_lock;
797 u64 delalloc_bytes; 866 u64 delalloc_bytes;
798 u64 last_alloc; 867
799 u64 last_data_alloc; 868 /* data_alloc_cluster is only used in ssd mode */
869 struct btrfs_free_cluster data_alloc_cluster;
870
871 /* all metadata allocations go through this cluster */
872 struct btrfs_free_cluster meta_alloc_cluster;
800 873
801 spinlock_t ref_cache_lock; 874 spinlock_t ref_cache_lock;
802 u64 total_ref_cache_size; 875 u64 total_ref_cache_size;
@@ -808,6 +881,9 @@ struct btrfs_fs_info {
808 u64 metadata_alloc_profile; 881 u64 metadata_alloc_profile;
809 u64 system_alloc_profile; 882 u64 system_alloc_profile;
810 883
884 unsigned data_chunk_allocations;
885 unsigned metadata_ratio;
886
811 void *bdev_holder; 887 void *bdev_holder;
812}; 888};
813 889
@@ -888,7 +964,6 @@ struct btrfs_root {
888}; 964};
889 965
890/* 966/*
891
892 * inode items have the data typically returned from stat and store other 967 * inode items have the data typically returned from stat and store other
893 * info about object characteristics. There is one for every file and dir in 968 * info about object characteristics. There is one for every file and dir in
894 * the FS 969 * the FS
@@ -919,7 +994,7 @@ struct btrfs_root {
919#define BTRFS_EXTENT_CSUM_KEY 128 994#define BTRFS_EXTENT_CSUM_KEY 128
920 995
921/* 996/*
922 * root items point to tree roots. There are typically in the root 997 * root items point to tree roots. They are typically in the root
923 * tree used by the super block to find all the other trees 998 * tree used by the super block to find all the other trees
924 */ 999 */
925#define BTRFS_ROOT_ITEM_KEY 132 1000#define BTRFS_ROOT_ITEM_KEY 132
@@ -966,6 +1041,8 @@ struct btrfs_root {
966#define BTRFS_MOUNT_SSD (1 << 3) 1041#define BTRFS_MOUNT_SSD (1 << 3)
967#define BTRFS_MOUNT_DEGRADED (1 << 4) 1042#define BTRFS_MOUNT_DEGRADED (1 << 4)
968#define BTRFS_MOUNT_COMPRESS (1 << 5) 1043#define BTRFS_MOUNT_COMPRESS (1 << 5)
1044#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1045#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
969 1046
970#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1047#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
971#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1048#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1704,18 +1781,16 @@ static inline struct dentry *fdentry(struct file *file)
1704} 1781}
1705 1782
1706/* extent-tree.c */ 1783/* extent-tree.c */
1784void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1785int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1786 struct btrfs_root *root, unsigned long count);
1707int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1787int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1708int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1709 struct btrfs_root *root, u64 bytenr,
1710 u64 num_bytes, u32 *refs);
1711int btrfs_update_pinned_extents(struct btrfs_root *root, 1788int btrfs_update_pinned_extents(struct btrfs_root *root,
1712 u64 bytenr, u64 num, int pin); 1789 u64 bytenr, u64 num, int pin);
1713int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1790int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1714 struct btrfs_root *root, struct extent_buffer *leaf); 1791 struct btrfs_root *root, struct extent_buffer *leaf);
1715int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1792int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root, u64 objectid, u64 bytenr); 1793 struct btrfs_root *root, u64 objectid, u64 bytenr);
1717int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root);
1719int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1794int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1720struct btrfs_block_group_cache *btrfs_lookup_block_group( 1795struct btrfs_block_group_cache *btrfs_lookup_block_group(
1721 struct btrfs_fs_info *info, 1796 struct btrfs_fs_info *info,
@@ -1777,7 +1852,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1777 u64 root_objectid, u64 ref_generation, 1852 u64 root_objectid, u64 ref_generation,
1778 u64 owner_objectid); 1853 u64 owner_objectid);
1779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1854int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1780 struct btrfs_root *root, u64 bytenr, 1855 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1781 u64 orig_parent, u64 parent, 1856 u64 orig_parent, u64 parent,
1782 u64 root_objectid, u64 ref_generation, 1857 u64 root_objectid, u64 ref_generation,
1783 u64 owner_objectid); 1858 u64 owner_objectid);
@@ -1838,7 +1913,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1838int btrfs_cow_block(struct btrfs_trans_handle *trans, 1913int btrfs_cow_block(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, struct extent_buffer *buf, 1914 struct btrfs_root *root, struct extent_buffer *buf,
1840 struct extent_buffer *parent, int parent_slot, 1915 struct extent_buffer *parent, int parent_slot,
1841 struct extent_buffer **cow_ret, u64 prealloc_dest); 1916 struct extent_buffer **cow_ret);
1842int btrfs_copy_root(struct btrfs_trans_handle *trans, 1917int btrfs_copy_root(struct btrfs_trans_handle *trans,
1843 struct btrfs_root *root, 1918 struct btrfs_root *root,
1844 struct extent_buffer *buf, 1919 struct extent_buffer *buf,
@@ -2060,7 +2135,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2060unsigned long btrfs_force_ra(struct address_space *mapping, 2135unsigned long btrfs_force_ra(struct address_space *mapping,
2061 struct file_ra_state *ra, struct file *file, 2136 struct file_ra_state *ra, struct file *file,
2062 pgoff_t offset, pgoff_t last_index); 2137 pgoff_t offset, pgoff_t last_index);
2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2138int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2064int btrfs_readpage(struct file *file, struct page *page); 2139int btrfs_readpage(struct file *file, struct page *page);
2065void btrfs_delete_inode(struct inode *inode); 2140void btrfs_delete_inode(struct inode *inode);
2066void btrfs_put_inode(struct inode *inode); 2141void btrfs_put_inode(struct inode *inode);
@@ -2102,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2102extern struct file_operations btrfs_file_operations; 2177extern struct file_operations btrfs_file_operations;
2103int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2178int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root, struct inode *inode, 2179 struct btrfs_root *root, struct inode *inode,
2105 u64 start, u64 end, u64 inline_limit, u64 *hint_block); 2180 u64 start, u64 end, u64 locked_end,
2181 u64 inline_limit, u64 *hint_block);
2106int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2182int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2107 struct btrfs_root *root, 2183 struct btrfs_root *root,
2108 struct inode *inode, u64 start, u64 end); 2184 struct inode *inode, u64 start, u64 end);
@@ -2133,21 +2209,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2133int btrfs_init_acl(struct inode *inode, struct inode *dir); 2209int btrfs_init_acl(struct inode *inode, struct inode *dir);
2134int btrfs_acl_chmod(struct inode *inode); 2210int btrfs_acl_chmod(struct inode *inode);
2135 2211
2136/* free-space-cache.c */
2137int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2138 u64 bytenr, u64 size);
2139int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2140 u64 offset, u64 bytes);
2141int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2142 u64 bytenr, u64 size);
2143int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2144 u64 offset, u64 bytes);
2145void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2146 *block_group);
2147struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2148 *block_group, u64 offset,
2149 u64 bytes);
2150void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2151 u64 bytes);
2152u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2153#endif 2212#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include "ctree.h"
22#include "delayed-ref.h"
23#include "transaction.h"
24
25/*
26 * delayed back reference update tracking. For subvolume trees
27 * we queue up extent allocations and backref maintenance for
28 * delayed processing. This avoids deep call chains where we
29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */
37
38/*
39 * entries in the rb tree are ordered by the byte number of the extent
40 * and by the byte number of the parent block.
41 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref,
43 u64 bytenr, u64 parent)
44{
45 if (bytenr < ref->bytenr)
46 return -1;
47 if (bytenr > ref->bytenr)
48 return 1;
49 if (parent < ref->parent)
50 return -1;
51 if (parent > ref->parent)
52 return 1;
53 return 0;
54}
55
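comp_entry compares the probe (bytenr, parent) pair against an existing entry, bytenr first. A quick trace with hypothetical values, entry = {.bytenr = 100, .parent = 5}:

	comp_entry(entry, 100, 1)  -> -1   /* probe sorts before entry */
	comp_entry(entry, 100, 9)  ->  1   /* probe sorts after entry  */
	comp_entry(entry, 100, 5)  ->  0   /* duplicate (bytenr,parent) */

Because the head ref uses parent == (u64)-1, it sorts after every regular ref for the same extent, which is why tree_search with parent (u64)-1 lands on the head when one exists.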
56/*
57 * insert a new ref into the rbtree. This returns any existing refs
58 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
59 * inserted.
60 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node)
64{
65 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry;
68 int cmp;
69
70 while (*p) {
71 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node);
74
75 cmp = comp_entry(entry, bytenr, parent);
76 if (cmp < 0)
77 p = &(*p)->rb_left;
78 else if (cmp > 0)
79 p = &(*p)->rb_right;
80 else
81 return entry;
82 }
83
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root);
87 return NULL;
88}
89
90/*
91 * find an entry based on (bytenr,parent). This returns the delayed
92 * ref if it was able to find one, or NULL if nothing was in that spot
93 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
95 u64 bytenr, u64 parent,
96 struct btrfs_delayed_ref_node **last)
97{
98 struct rb_node *n = root->rb_node;
99 struct btrfs_delayed_ref_node *entry;
100 int cmp;
101
102 while (n) {
103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
104 WARN_ON(!entry->in_tree);
105 if (last)
106 *last = entry;
107
108 cmp = comp_entry(entry, bytenr, parent);
109 if (cmp < 0)
110 n = n->rb_left;
111 else if (cmp > 0)
112 n = n->rb_right;
113 else
114 return entry;
115 }
116 return NULL;
117}
118
119int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
120 struct btrfs_delayed_ref_head *head)
121{
122 struct btrfs_delayed_ref_root *delayed_refs;
123
124 delayed_refs = &trans->transaction->delayed_refs;
125 assert_spin_locked(&delayed_refs->lock);
126 if (mutex_trylock(&head->mutex))
127 return 0;
128
129 atomic_inc(&head->node.refs);
130 spin_unlock(&delayed_refs->lock);
131
132 mutex_lock(&head->mutex);
133 spin_lock(&delayed_refs->lock);
134 if (!head->node.in_tree) {
135 mutex_unlock(&head->mutex);
136 btrfs_put_delayed_ref(&head->node);
137 return -EAGAIN;
138 }
139 btrfs_put_delayed_ref(&head->node);
140 return 0;
141}
142
143int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
144 struct list_head *cluster, u64 start)
145{
146 int count = 0;
147 struct btrfs_delayed_ref_root *delayed_refs;
148 struct rb_node *node;
149 struct btrfs_delayed_ref_node *ref;
150 struct btrfs_delayed_ref_head *head;
151
152 delayed_refs = &trans->transaction->delayed_refs;
153 if (start == 0) {
154 node = rb_first(&delayed_refs->root);
155 } else {
156 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
158 if (ref) {
159 struct btrfs_delayed_ref_node *tmp;
160
161 node = rb_prev(&ref->rb_node);
162 while (node) {
163 tmp = rb_entry(node,
164 struct btrfs_delayed_ref_node,
165 rb_node);
166 if (tmp->bytenr < start)
167 break;
168 ref = tmp;
169 node = rb_prev(&ref->rb_node);
170 }
171 node = &ref->rb_node;
172 } else
173 node = rb_first(&delayed_refs->root);
174 }
175again:
176 while (node && count < 32) {
177 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
178 if (btrfs_delayed_ref_is_head(ref)) {
179 head = btrfs_delayed_node_to_head(ref);
180 if (list_empty(&head->cluster)) {
181 list_add_tail(&head->cluster, cluster);
182 delayed_refs->run_delayed_start =
183 head->node.bytenr;
184 count++;
185
186 WARN_ON(delayed_refs->num_heads_ready == 0);
187 delayed_refs->num_heads_ready--;
188 } else if (count) {
189 /* the goal of the clustering is to find extents
190 * that are likely to end up in the same extent
191 * leaf on disk. So, we don't want them spread
192 * all over the tree. Stop now if we've hit
193 * a head that was already in use
194 */
195 break;
196 }
197 }
198 node = rb_next(node);
199 }
200 if (count) {
201 return 0;
202 } else if (start) {
203 /*
204 * we've gone to the end of the rbtree without finding any
205 * clusters. start from the beginning and try again
206 */
207 start = 0;
208 node = rb_first(&delayed_refs->root);
209 goto again;
210 }
211 return 1;
212}
213
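The consumer loops pulling clusters of at most 32 heads until nothing ready remains; run_delayed_start lets successive calls continue where the last cluster ended, wrapping around once. A sketch of that loop, shaped like the btrfs_run_delayed_refs caller in extent-tree.c (actually running the refs is elided):

	struct list_head cluster;
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	int ret;

	INIT_LIST_HEAD(&cluster);
	spin_lock(&delayed_refs->lock);
	while (1) {
		ret = btrfs_find_ref_cluster(trans, &cluster,
					     delayed_refs->run_delayed_start);
		if (ret)
			break;	/* no more heads ready to run */
		/*
		 * run and unlock every head on the cluster; the real code
		 * drops delayed_refs->lock while doing so
		 */
	}
	spin_unlock(&delayed_refs->lock);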
214/*
215 * This checks to see if there are any delayed refs in the
216 * btree for a given bytenr. It returns one if it finds any
217 * and zero otherwise.
218 *
219 * If it only finds a head node, it returns 0.
220 *
221 * The idea is to use this when deciding if you can safely delete an
222 * extent from the extent allocation tree. There may be a pending
223 * ref in the rbtree that adds or removes references, so as long as this
224 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
225 * allocation tree.
226 */
227int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
228{
229 struct btrfs_delayed_ref_node *ref;
230 struct btrfs_delayed_ref_root *delayed_refs;
231 struct rb_node *prev_node;
232 int ret = 0;
233
234 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock);
236
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
238 if (ref) {
239 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node)
241 goto out;
242 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
243 rb_node);
244 if (ref->bytenr == bytenr)
245 ret = 1;
246 }
247out:
248 spin_unlock(&delayed_refs->lock);
249 return ret;
250}
251
252/*
253 * helper function to lookup reference count
254 *
255 * the head node for delayed ref is used to store the sum of all the
256 * reference count modifications queued up in the rbtree. This way you
257 * can check to see what the reference count would be if all of the
258 * delayed refs are processed.
259 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs)
263{
264 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei;
270 struct btrfs_key key;
271 u32 num_refs;
272 int ret;
273
274 path = btrfs_alloc_path();
275 if (!path)
276 return -ENOMEM;
277
278 key.objectid = bytenr;
279 key.type = BTRFS_EXTENT_ITEM_KEY;
280 key.offset = num_bytes;
281 delayed_refs = &trans->transaction->delayed_refs;
282again:
283 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
284 &key, path, 0, 0);
285 if (ret < 0)
286 goto out;
287
288 if (ret == 0) {
289 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0],
291 struct btrfs_extent_item);
292 num_refs = btrfs_extent_refs(leaf, ei);
293 } else {
294 num_refs = 0;
295 ret = 0;
296 }
297
298 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
300 if (ref) {
301 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod;
304 mutex_unlock(&head->mutex);
305 *refs = num_refs;
306 goto out;
307 }
308
309 atomic_inc(&ref->refs);
310 spin_unlock(&delayed_refs->lock);
311
312 btrfs_release_path(root->fs_info->extent_root, path);
313
314 mutex_lock(&head->mutex);
315 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 }
321out:
322 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path);
324 return ret;
325}
326
327/*
328 * helper function to update an extent delayed ref in the
329 * rbtree. existing and update must both have the same
330 * bytenr and parent
331 *
332 * This may free existing if the update cancels out whatever
333 * operation it was doing.
334 */
335static noinline void
336update_existing_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_delayed_ref_root *delayed_refs,
338 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update)
340{
341 struct btrfs_delayed_ref *existing_ref;
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /*
352 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes
354 * down to zero we just delete the entry without
 355 * ever changing the extent allocation tree.
356 */
357 existing->ref_mod--;
358 if (existing->ref_mod == 0) {
359 rb_erase(&existing->rb_node,
360 &delayed_refs->root);
361 existing->in_tree = 0;
362 btrfs_put_delayed_ref(existing);
363 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--;
366 }
367 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
369 /* if we're adding refs, make sure all the
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /*
381 * the action on the existing ref matches
382 * the action on the ref we're trying to add.
383 * Bump the ref_mod by one so the backref that
384 * is eventually added/removed has the correct
385 * reference count
386 */
387 existing->ref_mod += update->ref_mod;
388 }
389}
390
391/*
392 * helper function to update the accounting in the head ref
393 * existing and update must have the same bytenr
394 */
395static noinline void
396update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
397 struct btrfs_delayed_ref_node *update)
398{
399 struct btrfs_delayed_ref_head *existing_ref;
400 struct btrfs_delayed_ref_head *ref;
401
402 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update);
404
405 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then
407 * reallocated before the delayed ref
408 * entries were processed, we can end up
409 * with an existing head ref without
410 * the must_insert_reserved flag set.
411 * Set it again here
412 */
413 existing_ref->must_insert_reserved = ref->must_insert_reserved;
414
415 /*
416 * update the num_bytes so we make sure the accounting
417 * is done correctly
418 */
419 existing->num_bytes = update->num_bytes;
420
421 }
422
423 /*
424 * update the reference mod on the head to reflect this new operation
425 */
426 existing->ref_mod += update->ref_mod;
427}
428
429/*
430 * helper function to actually insert a delayed ref into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing
433 * with updating existing nodes as new modifications are queued.
434 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
438 u64 ref_generation, u64 owner_objectid, int action,
439 int pin)
440{
441 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1;
446 int must_insert_reserved = 0;
447
448 /*
449 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one.
451 */
452 if (parent == (u64)-1) {
453 if (action == BTRFS_DROP_DELAYED_REF)
454 count_mod = -1;
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
456 count_mod = 0;
457 }
458
459 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
461 * the reserved accounting when the extent is finally added, or
462 * if a later modification deletes the delayed ref without ever
463 * inserting the extent into the extent allocation tree.
464 * ref->must_insert_reserved is the flag used to record
465 * that accounting mods are required.
466 *
467 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) {
471 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF;
473 } else {
474 must_insert_reserved = 0;
475 }
476
477
478 delayed_refs = &trans->transaction->delayed_refs;
479
480 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr;
483 ref->parent = parent;
484 ref->ref_mod = count_mod;
485 ref->in_tree = 1;
486 ref->num_bytes = num_bytes;
487
488 if (btrfs_delayed_ref_is_head(ref)) {
489 head_ref = btrfs_delayed_node_to_head(ref);
490 head_ref->must_insert_reserved = must_insert_reserved;
491 INIT_LIST_HEAD(&head_ref->cluster);
492 mutex_init(&head_ref->mutex);
493 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root;
496 full_ref->generation = ref_generation;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 }
501
502 existing = tree_insert(&delayed_refs->root, bytenr,
503 parent, &ref->rb_node);
504
505 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref))
507 update_existing_head_ref(existing, ref);
508 else
509 update_existing_ref(trans, delayed_refs, existing, ref);
510
511 /*
512 * we've updated the existing ref, free the newly
513 * allocated ref
514 */
515 kfree(ref);
516 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++;
523 }
524 return 0;
525}
526
527/*
528 * add a delayed ref to the tree. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this
530 * transaction commits.
531 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
534 u64 ref_generation, u64 owner_objectid, int action,
535 int pin)
536{
537 struct btrfs_delayed_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret;
541
542 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref)
544 return -ENOMEM;
545
546 /*
547 * the parent = 0 case comes from cases where we don't actually
 548 * know the parent yet. It will get updated later via an add/drop
549 * pair.
550 */
551 if (parent == 0)
552 parent = bytenr;
553
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) {
556 kfree(ref);
557 return -ENOMEM;
558 }
559 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock);
561
562 /*
563 * insert both the head node and the new ref without dropping
564 * the spin lock
565 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin);
568 BUG_ON(ret);
569
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation,
572 owner_objectid, action, pin);
573 BUG_ON(ret);
574 spin_unlock(&delayed_refs->lock);
575 return 0;
576}
577
578/*
579 * this does a simple search for the head node for a given extent.
580 * It must be called with the delayed ref spinlock held, and it returns
 581 * the head node if one was found, or NULL if not.
582 */
583struct btrfs_delayed_ref_head *
584btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
585{
586 struct btrfs_delayed_ref_node *ref;
587 struct btrfs_delayed_ref_root *delayed_refs;
588
589 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
591 if (ref)
592 return btrfs_delayed_node_to_head(ref);
593 return NULL;
594}
595
596/*
597 * add a delayed ref to the tree. This does all of the accounting required
598 * to make sure the delayed ref is eventually processed before this
599 * transaction commits.
600 *
601 * The main point of this call is to add and remove a backreference in a single
602 * shot, taking the lock only once, and only searching for the head node once.
603 *
604 * It is the same as doing a ref add and delete in two separate calls.
605 */
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root,
609 u64 orig_ref_generation, u64 ref_generation,
610 u64 owner_objectid, int pin)
611{
612 struct btrfs_delayed_ref *ref;
613 struct btrfs_delayed_ref *old_ref;
614 struct btrfs_delayed_ref_head *head_ref;
615 struct btrfs_delayed_ref_root *delayed_refs;
616 int ret;
617
618 ref = kmalloc(sizeof(*ref), GFP_NOFS);
619 if (!ref)
620 return -ENOMEM;
621
622 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
623 if (!old_ref) {
624 kfree(ref);
625 return -ENOMEM;
626 }
627
628 /*
629 * the parent = 0 case comes from cases where we don't actually
 630 * know the parent yet. It will get updated later via an add/drop
631 * pair.
632 */
633 if (parent == 0)
634 parent = bytenr;
635 if (orig_parent == 0)
636 orig_parent = bytenr;
637
638 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
639 if (!head_ref) {
640 kfree(ref);
641 kfree(old_ref);
642 return -ENOMEM;
643 }
644 delayed_refs = &trans->transaction->delayed_refs;
645 spin_lock(&delayed_refs->lock);
646
647 /*
648 * insert both the head node and the new ref without dropping
649 * the spin lock
650 */
651 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
652 (u64)-1, 0, 0, 0,
653 BTRFS_UPDATE_DELAYED_HEAD, 0);
654 BUG_ON(ret);
655
656 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
657 parent, ref_root, ref_generation,
658 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
659 BUG_ON(ret);
660
661 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
662 orig_parent, orig_ref_root,
663 orig_ref_generation, owner_objectid,
664 BTRFS_DROP_DELAYED_REF, pin);
665 BUG_ON(ret);
666 spin_unlock(&delayed_refs->lock);
667 return 0;
668}
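To make the ref_mod bookkeeping above concrete, a worked trace for one extent at bytenr B (values hypothetical):

	add ref, parent P1   ->  head.ref_mod = +1, ref(P1).ref_mod = 1
	add ref, parent P2   ->  head.ref_mod = +2, ref(P2).ref_mod = 1
	drop ref, parent P1  ->  head.ref_mod = +1, ref(P1) erased
	                         (opposite actions cancel in update_existing_ref)

btrfs_lookup_extent_ref then reports the on-disk reference count plus head.ref_mod, i.e. the count as if every queued modification had already been applied.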
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
 43 * how many refs this entry is adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
 75 * when a new extent is allocated, it is just reserved in memory.
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
 83 * we need to update the in-ram accounting to properly reflect
84 * the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
 167 * a node might live in a head or a regular ref; this lets you
168 * test for the proper type to use.
169 */
 170static inline int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
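The cast helpers enforce the head/regular split at runtime. A dispatch sketch for code walking the rbtree (delayed_refs assumed to come from the current transaction):

	struct rb_node *n;
	struct btrfs_delayed_ref_node *node;

	n = rb_first(&delayed_refs->root);
	if (n) {
		node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(node)) {
			struct btrfs_delayed_ref_head *head;

			head = btrfs_delayed_node_to_head(node);
			/* head->mutex serializes running this extent's refs */
		} else {
			struct btrfs_delayed_ref *ref;

			ref = btrfs_delayed_node_to_ref(node);
			/* ref->action, ref->root, ref->generation describe the mod */
		}
	}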
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc869..4b0ea0b80c23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -231,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
231 memcpy(&found, result, csum_size); 232 memcpy(&found, result, csum_size);
232 233
233 read_extent_buffer(buf, &val, 0, csum_size); 234 read_extent_buffer(buf, &val, 0, csum_size);
234 printk(KERN_INFO "btrfs: %s checksum verify failed " 235 if (printk_ratelimit()) {
235 "on %llu wanted %X found %X level %d\n", 236 printk(KERN_INFO "btrfs: %s checksum verify "
236 root->fs_info->sb->s_id, 237 "failed on %llu wanted %X found %X "
237 buf->start, val, found, btrfs_header_level(buf)); 238 "level %d\n",
239 root->fs_info->sb->s_id,
240 (unsigned long long)buf->start, val, found,
241 btrfs_header_level(buf));
242 }
238 if (result != (char *)&inline_result) 243 if (result != (char *)&inline_result)
239 kfree(result); 244 kfree(result);
240 return 1; 245 return 1;
@@ -267,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
267 ret = 0; 272 ret = 0;
268 goto out; 273 goto out;
269 } 274 }
270 printk("parent transid verify failed on %llu wanted %llu found %llu\n", 275 if (printk_ratelimit()) {
271 (unsigned long long)eb->start, 276 printk("parent transid verify failed on %llu wanted %llu "
272 (unsigned long long)parent_transid, 277 "found %llu\n",
273 (unsigned long long)btrfs_header_generation(eb)); 278 (unsigned long long)eb->start,
279 (unsigned long long)parent_transid,
280 (unsigned long long)btrfs_header_generation(eb));
281 }
274 ret = 1; 282 ret = 1;
275 clear_extent_buffer_uptodate(io_tree, eb); 283 clear_extent_buffer_uptodate(io_tree, eb);
276out: 284out:
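The same conversion repeats through the rest of this file: corruption messages that can fire once per bad block now go through printk_ratelimit(), which by default allows roughly a burst of 10 messages per 5 seconds, and u64 values gain (unsigned long long) casts so %llu stays portable on architectures where u64 is unsigned long. The idiom, with bytenr as a stand-in u64:

	if (printk_ratelimit())
		printk(KERN_INFO "btrfs: bad block %llu\n",
		       (unsigned long long)bytenr);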
@@ -414,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
414 422
415 found_start = btrfs_header_bytenr(eb); 423 found_start = btrfs_header_bytenr(eb);
416 if (found_start != start) { 424 if (found_start != start) {
417 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", 425 if (printk_ratelimit()) {
418 (unsigned long long)found_start, 426 printk(KERN_INFO "btrfs bad tree block start "
419 (unsigned long long)eb->start); 427 "%llu %llu\n",
428 (unsigned long long)found_start,
429 (unsigned long long)eb->start);
430 }
420 ret = -EIO; 431 ret = -EIO;
421 goto err; 432 goto err;
422 } 433 }
@@ -428,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
428 goto err; 439 goto err;
429 } 440 }
430 if (check_tree_block_fsid(root, eb)) { 441 if (check_tree_block_fsid(root, eb)) {
431 printk(KERN_INFO "btrfs bad fsid on block %llu\n", 442 if (printk_ratelimit()) {
432 (unsigned long long)eb->start); 443 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
444 (unsigned long long)eb->start);
445 }
433 ret = -EIO; 446 ret = -EIO;
434 goto err; 447 goto err;
435 } 448 }
@@ -578,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
578 async->bio_flags = bio_flags; 591 async->bio_flags = bio_flags;
579 592
580 atomic_inc(&fs_info->nr_async_submits); 593 atomic_inc(&fs_info->nr_async_submits);
594
595 if (rw & (1 << BIO_RW_SYNCIO))
596 btrfs_set_work_high_prio(&async->work);
597
581 btrfs_queue_worker(&fs_info->workers, &async->work); 598 btrfs_queue_worker(&fs_info->workers, &async->work);
582#if 0
583 int limit = btrfs_async_submit_limit(fs_info);
584 if (atomic_read(&fs_info->nr_async_submits) > limit) {
585 wait_event_timeout(fs_info->async_submit_wait,
586 (atomic_read(&fs_info->nr_async_submits) < limit),
587 HZ/10);
588 599
589 wait_event_timeout(fs_info->async_submit_wait,
590 (atomic_read(&fs_info->nr_async_bios) < limit),
591 HZ/10);
592 }
593#endif
594 while (atomic_read(&fs_info->async_submit_draining) && 600 while (atomic_read(&fs_info->async_submit_draining) &&
595 atomic_read(&fs_info->nr_async_submits)) { 601 atomic_read(&fs_info->nr_async_submits)) {
596 wait_event(fs_info->async_submit_wait, 602 wait_event(fs_info->async_submit_wait,
@@ -655,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
655 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 661 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
656 mirror_num, 0); 662 mirror_num, 0);
657 } 663 }
664
658 /* 665 /*
659 * kthread helpers are used to submit writes so that checksumming 666 * kthread helpers are used to submit writes so that checksumming
660 * can happen in parallel across all CPUs 667 * can happen in parallel across all CPUs
@@ -668,14 +675,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
668static int btree_writepage(struct page *page, struct writeback_control *wbc) 675static int btree_writepage(struct page *page, struct writeback_control *wbc)
669{ 676{
670 struct extent_io_tree *tree; 677 struct extent_io_tree *tree;
678 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
679 struct extent_buffer *eb;
680 int was_dirty;
681
671 tree = &BTRFS_I(page->mapping->host)->io_tree; 682 tree = &BTRFS_I(page->mapping->host)->io_tree;
683 if (!(current->flags & PF_MEMALLOC)) {
684 return extent_write_full_page(tree, page,
685 btree_get_extent, wbc);
686 }
672 687
673 if (current->flags & PF_MEMALLOC) { 688 redirty_page_for_writepage(wbc, page);
674 redirty_page_for_writepage(wbc, page); 689 eb = btrfs_find_tree_block(root, page_offset(page),
675 unlock_page(page); 690 PAGE_CACHE_SIZE);
676 return 0; 691 WARN_ON(!eb);
692
693 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
694 if (!was_dirty) {
695 spin_lock(&root->fs_info->delalloc_lock);
696 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
697 spin_unlock(&root->fs_info->delalloc_lock);
677 } 698 }
678 return extent_write_full_page(tree, page, btree_get_extent, wbc); 699 free_extent_buffer(eb);
700
701 unlock_page(page);
702 return 0;
679} 703}
680 704
681static int btree_writepages(struct address_space *mapping, 705static int btree_writepages(struct address_space *mapping,
@@ -684,15 +708,15 @@ static int btree_writepages(struct address_space *mapping,
684 struct extent_io_tree *tree; 708 struct extent_io_tree *tree;
685 tree = &BTRFS_I(mapping->host)->io_tree; 709 tree = &BTRFS_I(mapping->host)->io_tree;
686 if (wbc->sync_mode == WB_SYNC_NONE) { 710 if (wbc->sync_mode == WB_SYNC_NONE) {
711 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
687 u64 num_dirty; 712 u64 num_dirty;
688 u64 start = 0;
689 unsigned long thresh = 32 * 1024 * 1024; 713 unsigned long thresh = 32 * 1024 * 1024;
690 714
691 if (wbc->for_kupdate) 715 if (wbc->for_kupdate)
692 return 0; 716 return 0;
693 717
694 num_dirty = count_range_bits(tree, &start, (u64)-1, 718 /* this is a bit racy, but that's ok */
695 thresh, EXTENT_DIRTY); 719 num_dirty = root->fs_info->dirty_metadata_bytes;
696 if (num_dirty < thresh) 720 if (num_dirty < thresh)
697 return 0; 721 return 0;
698 } 722 }
@@ -747,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
747 } 771 }
748} 772}
749 773
750#if 0
751static int btree_writepage(struct page *page, struct writeback_control *wbc)
752{
753 struct buffer_head *bh;
754 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
755 struct buffer_head *head;
756 if (!page_has_buffers(page)) {
757 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
758 (1 << BH_Dirty)|(1 << BH_Uptodate));
759 }
760 head = page_buffers(page);
761 bh = head;
762 do {
763 if (buffer_dirty(bh))
764 csum_tree_block(root, bh, 0);
765 bh = bh->b_this_page;
766 } while (bh != head);
767 return block_write_full_page(page, btree_get_block, wbc);
768}
769#endif
770
771static struct address_space_operations btree_aops = { 774static struct address_space_operations btree_aops = {
772 .readpage = btree_readpage, 775 .readpage = btree_readpage,
773 .writepage = btree_writepage, 776 .writepage = btree_writepage,
@@ -845,8 +848,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
845 848
846 if (ret == 0) 849 if (ret == 0)
847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); 850 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
848 else
849 WARN_ON(1);
850 return buf; 851 return buf;
851 852
852} 853}
@@ -859,9 +860,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
859 root->fs_info->running_transaction->transid) { 860 root->fs_info->running_transaction->transid) {
860 btrfs_assert_tree_locked(buf); 861 btrfs_assert_tree_locked(buf);
861 862
862 /* ugh, clear_extent_buffer_dirty can be expensive */ 863 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
863 btrfs_set_lock_blocking(buf); 864 spin_lock(&root->fs_info->delalloc_lock);
865 if (root->fs_info->dirty_metadata_bytes >= buf->len)
866 root->fs_info->dirty_metadata_bytes -= buf->len;
867 else
868 WARN_ON(1);
869 spin_unlock(&root->fs_info->delalloc_lock);
870 }
864 871
872 /* ugh, clear_extent_buffer_dirty needs to lock the page */
873 btrfs_set_lock_blocking(buf);
865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 874 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
866 buf); 875 buf);
867 } 876 }
@@ -1247,11 +1256,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1247 int ret = 0; 1256 int ret = 0;
1248 struct btrfs_device *device; 1257 struct btrfs_device *device;
1249 struct backing_dev_info *bdi; 1258 struct backing_dev_info *bdi;
1250#if 0 1259
1251 if ((bdi_bits & (1 << BDI_write_congested)) &&
1252 btrfs_congested_async(info, 0))
1253 return 1;
1254#endif
1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) { 1260 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1256 if (!device->bdev) 1261 if (!device->bdev)
1257 continue; 1262 continue;
@@ -1387,8 +1392,6 @@ static int bio_ready_for_csum(struct bio *bio)
1387 1392
1388 ret = extent_range_uptodate(io_tree, start + length, 1393 ret = extent_range_uptodate(io_tree, start + length,
1389 start + buf_len - 1); 1394 start + buf_len - 1);
1390 if (ret == 1)
1391 return ret;
1392 return ret; 1395 return ret;
1393} 1396}
1394 1397
@@ -1471,12 +1474,6 @@ static int transaction_kthread(void *arg)
1471 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1474 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1472 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1475 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1473 1476
1474 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1475 printk(KERN_INFO "btrfs: total reference cache "
1476 "size %llu\n",
1477 root->fs_info->total_ref_cache_size);
1478 }
1479
1480 mutex_lock(&root->fs_info->trans_mutex); 1477 mutex_lock(&root->fs_info->trans_mutex);
1481 cur = root->fs_info->running_transaction; 1478 cur = root->fs_info->running_transaction;
1482 if (!cur) { 1479 if (!cur) {
@@ -1493,6 +1490,7 @@ static int transaction_kthread(void *arg)
1493 mutex_unlock(&root->fs_info->trans_mutex); 1490 mutex_unlock(&root->fs_info->trans_mutex);
1494 trans = btrfs_start_transaction(root, 1); 1491 trans = btrfs_start_transaction(root, 1);
1495 ret = btrfs_commit_transaction(trans, root); 1492 ret = btrfs_commit_transaction(trans, root);
1493
1496sleep: 1494sleep:
1497 wake_up_process(root->fs_info->cleaner_kthread); 1495 wake_up_process(root->fs_info->cleaner_kthread);
1498 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1496 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1550,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1552 INIT_LIST_HEAD(&fs_info->dead_roots); 1550 INIT_LIST_HEAD(&fs_info->dead_roots);
1553 INIT_LIST_HEAD(&fs_info->hashers); 1551 INIT_LIST_HEAD(&fs_info->hashers);
1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1552 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1553 INIT_LIST_HEAD(&fs_info->ordered_operations);
1555 spin_lock_init(&fs_info->delalloc_lock); 1554 spin_lock_init(&fs_info->delalloc_lock);
1556 spin_lock_init(&fs_info->new_trans_lock); 1555 spin_lock_init(&fs_info->new_trans_lock);
1557 spin_lock_init(&fs_info->ref_cache_lock); 1556 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1579,6 +1578,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1579 fs_info->btree_inode = new_inode(sb); 1578 fs_info->btree_inode = new_inode(sb);
1580 fs_info->btree_inode->i_ino = 1; 1579 fs_info->btree_inode->i_ino = 1;
1581 fs_info->btree_inode->i_nlink = 1; 1580 fs_info->btree_inode->i_nlink = 1;
1581 fs_info->metadata_ratio = 8;
1582 1582
1583 fs_info->thread_pool_size = min_t(unsigned long, 1583 fs_info->thread_pool_size = min_t(unsigned long,
1584 num_online_cpus() + 2, 8); 1584 num_online_cpus() + 2, 8);
@@ -1611,10 +1611,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1611 1611
1612 extent_io_tree_init(&fs_info->pinned_extents, 1612 extent_io_tree_init(&fs_info->pinned_extents,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1613 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 extent_io_tree_init(&fs_info->pending_del,
1615 fs_info->btree_inode->i_mapping, GFP_NOFS);
1616 extent_io_tree_init(&fs_info->extent_ins,
1617 fs_info->btree_inode->i_mapping, GFP_NOFS);
1618 fs_info->do_barriers = 1; 1614 fs_info->do_barriers = 1;
1619 1615
1620 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1616 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,15 +1623,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 insert_inode_hash(fs_info->btree_inode); 1623 insert_inode_hash(fs_info->btree_inode);
1628 1624
1629 mutex_init(&fs_info->trans_mutex); 1625 mutex_init(&fs_info->trans_mutex);
1626 mutex_init(&fs_info->ordered_operations_mutex);
1630 mutex_init(&fs_info->tree_log_mutex); 1627 mutex_init(&fs_info->tree_log_mutex);
1631 mutex_init(&fs_info->drop_mutex); 1628 mutex_init(&fs_info->drop_mutex);
1632 mutex_init(&fs_info->extent_ins_mutex);
1633 mutex_init(&fs_info->pinned_mutex);
1634 mutex_init(&fs_info->chunk_mutex); 1629 mutex_init(&fs_info->chunk_mutex);
1635 mutex_init(&fs_info->transaction_kthread_mutex); 1630 mutex_init(&fs_info->transaction_kthread_mutex);
1636 mutex_init(&fs_info->cleaner_mutex); 1631 mutex_init(&fs_info->cleaner_mutex);
1637 mutex_init(&fs_info->volume_mutex); 1632 mutex_init(&fs_info->volume_mutex);
1638 mutex_init(&fs_info->tree_reloc_mutex); 1633 mutex_init(&fs_info->tree_reloc_mutex);
1634
1635 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1636 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1637
1639 init_waitqueue_head(&fs_info->transaction_throttle); 1638 init_waitqueue_head(&fs_info->transaction_throttle);
1640 init_waitqueue_head(&fs_info->transaction_wait); 1639 init_waitqueue_head(&fs_info->transaction_wait);
1641 init_waitqueue_head(&fs_info->async_submit_wait); 1640 init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1670,7 +1669,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1670 if (features) { 1669 if (features) {
1671 printk(KERN_ERR "BTRFS: couldn't mount because of " 1670 printk(KERN_ERR "BTRFS: couldn't mount because of "
1672 "unsupported optional features (%Lx).\n", 1671 "unsupported optional features (%Lx).\n",
1673 features); 1672 (unsigned long long)features);
1674 err = -EINVAL; 1673 err = -EINVAL;
1675 goto fail_iput; 1674 goto fail_iput;
1676 } 1675 }
@@ -1680,7 +1679,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 if (!(sb->s_flags & MS_RDONLY) && features) { 1679 if (!(sb->s_flags & MS_RDONLY) && features) {
1681 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " 1680 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1682 "unsupported option features (%Lx).\n", 1681 "unsupported option features (%Lx).\n",
1683 features); 1682 (unsigned long long)features);
1684 err = -EINVAL; 1683 err = -EINVAL;
1685 goto fail_iput; 1684 goto fail_iput;
1686 } 1685 }
@@ -2076,10 +2075,10 @@ static int write_dev_supers(struct btrfs_device *device,
2076 device->barriers = 0; 2075 device->barriers = 0;
2077 get_bh(bh); 2076 get_bh(bh);
2078 lock_buffer(bh); 2077 lock_buffer(bh);
2079 ret = submit_bh(WRITE, bh); 2078 ret = submit_bh(WRITE_SYNC, bh);
2080 } 2079 }
2081 } else { 2080 } else {
2082 ret = submit_bh(WRITE, bh); 2081 ret = submit_bh(WRITE_SYNC, bh);
2083 } 2082 }
2084 2083
2085 if (!ret && wait) { 2084 if (!ret && wait) {
@@ -2272,7 +2271,7 @@ int close_ctree(struct btrfs_root *root)
2272 2271
2273 if (fs_info->delalloc_bytes) { 2272 if (fs_info->delalloc_bytes) {
2274 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2273 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2275 fs_info->delalloc_bytes); 2274 (unsigned long long)fs_info->delalloc_bytes);
2276 } 2275 }
2277 if (fs_info->total_ref_cache_size) { 2276 if (fs_info->total_ref_cache_size) {
2278 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", 2277 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
@@ -2309,16 +2308,6 @@ int close_ctree(struct btrfs_root *root)
2309 btrfs_stop_workers(&fs_info->endio_write_workers); 2308 btrfs_stop_workers(&fs_info->endio_write_workers);
2310 btrfs_stop_workers(&fs_info->submit_workers); 2309 btrfs_stop_workers(&fs_info->submit_workers);
2311 2310
2312#if 0
2313 while (!list_empty(&fs_info->hashers)) {
2314 struct btrfs_hasher *hasher;
2315 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2316 hashers);
2317 list_del(&hasher->hashers);
2318 crypto_free_hash(&fs_info->hash_tfm);
2319 kfree(hasher);
2320 }
2321#endif
2322 btrfs_close_devices(fs_info->fs_devices); 2311 btrfs_close_devices(fs_info->fs_devices);
2323 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2312 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2324 2313
@@ -2358,8 +2347,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2358 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2347 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2359 u64 transid = btrfs_header_generation(buf); 2348 u64 transid = btrfs_header_generation(buf);
2360 struct inode *btree_inode = root->fs_info->btree_inode; 2349 struct inode *btree_inode = root->fs_info->btree_inode;
2361 2350 int was_dirty;
2362 btrfs_set_lock_blocking(buf);
2363 2351
2364 btrfs_assert_tree_locked(buf); 2352 btrfs_assert_tree_locked(buf);
2365 if (transid != root->fs_info->generation) { 2353 if (transid != root->fs_info->generation) {
@@ -2370,7 +2358,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2370 (unsigned long long)root->fs_info->generation); 2358 (unsigned long long)root->fs_info->generation);
2371 WARN_ON(1); 2359 WARN_ON(1);
2372 } 2360 }
2373 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2361 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2362 buf);
2363 if (!was_dirty) {
2364 spin_lock(&root->fs_info->delalloc_lock);
2365 root->fs_info->dirty_metadata_bytes += buf->len;
2366 spin_unlock(&root->fs_info->delalloc_lock);
2367 }
2374} 2368}
2375 2369
2376void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2370void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2404,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2410int btree_lock_page_hook(struct page *page) 2404int btree_lock_page_hook(struct page *page)
2411{ 2405{
2412 struct inode *inode = page->mapping->host; 2406 struct inode *inode = page->mapping->host;
2407 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2408 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2414 struct extent_buffer *eb; 2409 struct extent_buffer *eb;
2415 unsigned long len; 2410 unsigned long len;
@@ -2425,6 +2420,16 @@ int btree_lock_page_hook(struct page *page)
2425 2420
2426 btrfs_tree_lock(eb); 2421 btrfs_tree_lock(eb);
2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2422 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2423
2424 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2425 spin_lock(&root->fs_info->delalloc_lock);
2426 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2427 root->fs_info->dirty_metadata_bytes -= eb->len;
2428 else
2429 WARN_ON(1);
2430 spin_unlock(&root->fs_info->delalloc_lock);
2431 }
2432
2428 btrfs_tree_unlock(eb); 2433 btrfs_tree_unlock(eb);
2429 free_extent_buffer(eb); 2434 free_extent_buffer(eb);
2430out: 2435out:
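Note on the disk-io.c hunks above: they replace the count_range_bits() scan with a running dirty_metadata_bytes counter. The invariant is that only a clean-to-dirty transition (test_and_set_bit on EXTENT_BUFFER_DIRTY) adds to the counter and only a dirty-to-clean transition (test_and_clear_bit) subtracts, both under delalloc_lock, so a buffer is never counted twice. The following is a minimal userspace model of that invariant, not btrfs code; the names and primitives stand in for the kernel ones.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct buffer {
		atomic_bool dirty;	/* models the EXTENT_BUFFER_DIRTY bit */
		unsigned long len;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* delalloc_lock */
	static unsigned long dirty_metadata_bytes;

	static void mark_dirty(struct buffer *b)
	{
		/* only the clean->dirty transition may add to the counter */
		if (!atomic_exchange(&b->dirty, true)) {
			pthread_mutex_lock(&lock);
			dirty_metadata_bytes += b->len;
			pthread_mutex_unlock(&lock);
		}
	}

	static void mark_clean(struct buffer *b)
	{
		/* only the dirty->clean transition may subtract */
		if (atomic_exchange(&b->dirty, false)) {
			pthread_mutex_lock(&lock);
			if (dirty_metadata_bytes >= b->len)
				dirty_metadata_bytes -= b->len;
			pthread_mutex_unlock(&lock);
		}
	}

	int main(void)
	{
		struct buffer b = { .dirty = false, .len = 4096 };

		mark_dirty(&b);
		mark_dirty(&b);		/* second call must not double-count */
		printf("dirty: %lu\n", dirty_metadata_bytes);	/* 4096 */
		mark_clean(&b);
		printf("dirty: %lu\n", dirty_metadata_bytes);	/* 0 */
		return 0;
	}

This is also why btree_writepages can read dirty_metadata_bytes without the lock ("this is a bit racy, but that's ok"): the value is only a writeback heuristic, while the add/subtract paths keep it exact.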
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
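A note on the printk_ratelimit() guards added in the disk-io.c error paths above: they keep a corrupted filesystem from flooding the log with one message per bad block. A rough userspace sketch of the idea follows; ratelimit() and its one-message-per-second policy are invented for illustration and omit the kernel helper's burst allowance.

	#include <stdio.h>
	#include <time.h>

	static int ratelimit(void)
	{
		static time_t last;
		time_t now = time(NULL);

		if (now == last)	/* at most one message per second */
			return 0;
		last = now;
		return 1;
	}

	int main(void)
	{
		for (int i = 0; i < 1000000; i++) {
			if (ratelimit())
				fprintf(stderr, "checksum verify failed\n");
		}
		return 0;
	}

The error return (-EIO or ret = 1) is still taken on every failure; only the logging is throttled.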
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..35af93355063 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
34 35
35#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -49,17 +50,23 @@ struct pending_extent_op {
49 int del; 50 int del;
50}; 51};
51 52
52static int finish_current_insert(struct btrfs_trans_handle *trans, 53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all); 54 struct btrfs_root *root, u64 parent,
54static int del_pending_extents(struct btrfs_trans_handle *trans, 55 u64 root_objectid, u64 ref_generation,
55 struct btrfs_root *extent_root, int all); 56 u64 owner, struct btrfs_key *ins,
56static int pin_down_bytes(struct btrfs_trans_handle *trans, 57 int ref_mod);
57 struct btrfs_root *root, 58static int update_reserved_extents(struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data); 59 u64 bytenr, u64 num, int reserve);
59static int update_block_group(struct btrfs_trans_handle *trans, 60static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 61 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc, 62 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 63 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation,
68 u64 owner_objectid, int pin,
69 int ref_to_drop);
63 70
64static int do_chunk_alloc(struct btrfs_trans_handle *trans, 71static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes, 72 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -160,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
160 u64 extent_start, extent_end, size; 167 u64 extent_start, extent_end, size;
161 int ret; 168 int ret;
162 169
163 mutex_lock(&info->pinned_mutex);
164 while (start < end) { 170 while (start < end) {
165 ret = find_first_extent_bit(&info->pinned_extents, start, 171 ret = find_first_extent_bit(&info->pinned_extents, start,
166 &extent_start, &extent_end, 172 &extent_start, &extent_end,
@@ -186,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
186 ret = btrfs_add_free_space(block_group, start, size); 192 ret = btrfs_add_free_space(block_group, start, size);
187 BUG_ON(ret); 193 BUG_ON(ret);
188 } 194 }
189 mutex_unlock(&info->pinned_mutex);
190 195
191 return 0; 196 return 0;
192} 197}
@@ -285,8 +290,8 @@ next:
285 block_group->key.objectid + 290 block_group->key.objectid +
286 block_group->key.offset); 291 block_group->key.offset);
287 292
288 remove_sb_from_cache(root, block_group);
289 block_group->cached = 1; 293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
290 ret = 0; 295 ret = 0;
291err: 296err:
292 btrfs_free_path(path); 297 btrfs_free_path(path);
@@ -307,7 +312,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
307} 312}
308 313
309/* 314/*
310 * return the block group that contains teh given bytenr 315 * return the block group that contains the given bytenr
311 */ 316 */
312struct btrfs_block_group_cache *btrfs_lookup_block_group( 317struct btrfs_block_group_cache *btrfs_lookup_block_group(
313 struct btrfs_fs_info *info, 318 struct btrfs_fs_info *info,
@@ -320,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
320 return cache; 325 return cache;
321} 326}
322 327
323static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
324{ 329{
325 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
326 kfree(cache); 331 kfree(cache);
@@ -393,12 +398,12 @@ again:
393 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
394 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
395 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
396 put_block_group(cache); 401 btrfs_put_block_group(cache);
397 goto found; 402 goto found;
398 } 403 }
399 } 404 }
400 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
401 put_block_group(cache); 406 btrfs_put_block_group(cache);
402 cond_resched(); 407 cond_resched();
403 } 408 }
404 if (!wrapped) { 409 if (!wrapped) {
@@ -554,262 +559,13 @@ out:
554 return ret; 559 return ret;
555} 560}
556 561
557/*
558 * updates all the backrefs that are pending on update_list for the
559 * extent_root
560 */
561static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 struct btrfs_root *extent_root,
563 struct btrfs_path *path,
564 struct list_head *update_list)
565{
566 struct btrfs_key key;
567 struct btrfs_extent_ref *ref;
568 struct btrfs_fs_info *info = extent_root->fs_info;
569 struct pending_extent_op *op;
570 struct extent_buffer *leaf;
571 int ret = 0;
572 struct list_head *cur = update_list->next;
573 u64 ref_objectid;
574 u64 ref_root = extent_root->root_key.objectid;
575
576 op = list_entry(cur, struct pending_extent_op, list);
577
578search:
579 key.objectid = op->bytenr;
580 key.type = BTRFS_EXTENT_REF_KEY;
581 key.offset = op->orig_parent;
582
583 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 BUG_ON(ret);
585
586 leaf = path->nodes[0];
587
588loop:
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590
591 ref_objectid = btrfs_ref_objectid(leaf, ref);
592
593 if (btrfs_ref_root(leaf, ref) != ref_root ||
594 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 (ref_objectid != op->level &&
596 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 "root %llu, owner %u\n",
599 (unsigned long long)op->bytenr,
600 (unsigned long long)op->orig_parent,
601 (unsigned long long)ref_root, op->level);
602 btrfs_print_leaf(extent_root, leaf);
603 BUG();
604 }
605
606 key.objectid = op->bytenr;
607 key.offset = op->parent;
608 key.type = BTRFS_EXTENT_REF_KEY;
609 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 BUG_ON(ret);
611 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 btrfs_set_ref_generation(leaf, ref, op->generation);
613
614 cur = cur->next;
615
616 list_del_init(&op->list);
617 unlock_extent(&info->extent_ins, op->bytenr,
618 op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 kfree(op);
620
621 if (cur == update_list) {
622 btrfs_mark_buffer_dirty(path->nodes[0]);
623 btrfs_release_path(extent_root, path);
624 goto out;
625 }
626
627 op = list_entry(cur, struct pending_extent_op, list);
628
629 path->slots[0]++;
630 while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 if (key.objectid == op->bytenr &&
633 key.type == BTRFS_EXTENT_REF_KEY)
634 goto loop;
635 path->slots[0]++;
636 }
637
638 btrfs_mark_buffer_dirty(path->nodes[0]);
639 btrfs_release_path(extent_root, path);
640 goto search;
641
642out:
643 return 0;
644}
645
646static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 struct btrfs_root *extent_root,
648 struct btrfs_path *path,
649 struct list_head *insert_list, int nr)
650{
651 struct btrfs_key *keys;
652 u32 *data_size;
653 struct pending_extent_op *op;
654 struct extent_buffer *leaf;
655 struct list_head *cur = insert_list->next;
656 struct btrfs_fs_info *info = extent_root->fs_info;
657 u64 ref_root = extent_root->root_key.objectid;
658 int i = 0, last = 0, ret;
659 int total = nr * 2;
660
661 if (!nr)
662 return 0;
663
664 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 if (!keys)
666 return -ENOMEM;
667
668 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 if (!data_size) {
670 kfree(keys);
671 return -ENOMEM;
672 }
673
674 list_for_each_entry(op, insert_list, list) {
675 keys[i].objectid = op->bytenr;
676 keys[i].offset = op->num_bytes;
677 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 data_size[i] = sizeof(struct btrfs_extent_item);
679 i++;
680
681 keys[i].objectid = op->bytenr;
682 keys[i].offset = op->parent;
683 keys[i].type = BTRFS_EXTENT_REF_KEY;
684 data_size[i] = sizeof(struct btrfs_extent_ref);
685 i++;
686 }
687
688 op = list_entry(cur, struct pending_extent_op, list);
689 i = 0;
690 while (i < total) {
691 int c;
692 ret = btrfs_insert_some_items(trans, extent_root, path,
693 keys+i, data_size+i, total-i);
694 BUG_ON(ret < 0);
695
696 if (last && ret > 1)
697 BUG();
698
699 leaf = path->nodes[0];
700 for (c = 0; c < ret; c++) {
701 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702
703 /*
704 * if the first item we inserted was a backref, then
705 * the EXTENT_ITEM will be the odd c's, else it will
706 * be the even c's
707 */
708 if ((ref_first && (c % 2)) ||
709 (!ref_first && !(c % 2))) {
710 struct btrfs_extent_item *itm;
711
712 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 struct btrfs_extent_item);
714 btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 op->del++;
716 } else {
717 struct btrfs_extent_ref *ref;
718
719 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 struct btrfs_extent_ref);
721 btrfs_set_ref_root(leaf, ref, ref_root);
722 btrfs_set_ref_generation(leaf, ref,
723 op->generation);
724 btrfs_set_ref_objectid(leaf, ref, op->level);
725 btrfs_set_ref_num_refs(leaf, ref, 1);
726 op->del++;
727 }
728
729 /*
730 * using del to see when its ok to free up the
731 * pending_extent_op. In the case where we insert the
732 * last item on the list in order to help do batching
733 * we need to not free the extent op until we actually
734 * insert the extent_item
735 */
736 if (op->del == 2) {
737 unlock_extent(&info->extent_ins, op->bytenr,
738 op->bytenr + op->num_bytes - 1,
739 GFP_NOFS);
740 cur = cur->next;
741 list_del_init(&op->list);
742 kfree(op);
743 if (cur != insert_list)
744 op = list_entry(cur,
745 struct pending_extent_op,
746 list);
747 }
748 }
749 btrfs_mark_buffer_dirty(leaf);
750 btrfs_release_path(extent_root, path);
751
752 /*
753 * Ok backref's and items usually go right next to eachother,
754 * but if we could only insert 1 item that means that we
755 * inserted on the end of a leaf, and we have no idea what may
756 * be on the next leaf so we just play it safe. In order to
757 * try and help this case we insert the last thing on our
758 * insert list so hopefully it will end up being the last
759 * thing on the leaf and everything else will be before it,
760 * which will let us insert a whole bunch of items at the same
761 * time.
762 */
763 if (ret == 1 && !last && (i + ret < total)) {
764 /*
765 * last: where we will pick up the next time around
766 * i: our current key to insert, will be total - 1
767 * cur: the current op we are screwing with
768 * op: duh
769 */
770 last = i + ret;
771 i = total - 1;
772 cur = insert_list->prev;
773 op = list_entry(cur, struct pending_extent_op, list);
774 } else if (last) {
775 /*
776 * ok we successfully inserted the last item on the
777 * list, lets reset everything
778 *
779 * i: our current key to insert, so where we left off
780 * last time
781 * last: done with this
782 * cur: the op we are messing with
783 * op: duh
784 * total: since we inserted the last key, we need to
785 * decrement total so we dont overflow
786 */
787 i = last;
788 last = 0;
789 total--;
790 if (i < total) {
791 cur = insert_list->next;
792 op = list_entry(cur, struct pending_extent_op,
793 list);
794 }
795 } else {
796 i += ret;
797 }
798
799 cond_resched();
800 }
801 ret = 0;
802 kfree(keys);
803 kfree(data_size);
804 return ret;
805}
806
807static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 struct btrfs_root *root, 563 struct btrfs_root *root,
809 struct btrfs_path *path, 564 struct btrfs_path *path,
810 u64 bytenr, u64 parent, 565 u64 bytenr, u64 parent,
811 u64 ref_root, u64 ref_generation, 566 u64 ref_root, u64 ref_generation,
812 u64 owner_objectid) 567 u64 owner_objectid,
568 int refs_to_add)
813{ 569{
814 struct btrfs_key key; 570 struct btrfs_key key;
815 struct extent_buffer *leaf; 571 struct extent_buffer *leaf;
@@ -829,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
829 btrfs_set_ref_root(leaf, ref, ref_root); 585 btrfs_set_ref_root(leaf, ref, ref_root);
830 btrfs_set_ref_generation(leaf, ref, ref_generation); 586 btrfs_set_ref_generation(leaf, ref, ref_generation);
831 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 btrfs_set_ref_num_refs(leaf, ref, 1); 588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
833 } else if (ret == -EEXIST) { 589 } else if (ret == -EEXIST) {
834 u64 existing_owner; 590 u64 existing_owner;
591
835 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 leaf = path->nodes[0]; 593 leaf = path->nodes[0];
837 ref = btrfs_item_ptr(leaf, path->slots[0], 594 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
845 602
846 num_refs = btrfs_ref_num_refs(leaf, ref); 603 num_refs = btrfs_ref_num_refs(leaf, ref);
847 BUG_ON(num_refs == 0); 604 BUG_ON(num_refs == 0);
848 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
849 606
850 existing_owner = btrfs_ref_objectid(leaf, ref); 607 existing_owner = btrfs_ref_objectid(leaf, ref);
851 if (existing_owner != owner_objectid && 608 if (existing_owner != owner_objectid &&
@@ -857,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
857 } else { 614 } else {
858 goto out; 615 goto out;
859 } 616 }
617 btrfs_unlock_up_safe(path, 1);
860 btrfs_mark_buffer_dirty(path->nodes[0]); 618 btrfs_mark_buffer_dirty(path->nodes[0]);
861out: 619out:
862 btrfs_release_path(root, path); 620 btrfs_release_path(root, path);
@@ -865,7 +623,8 @@ out:
865 623
866static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 struct btrfs_root *root, 625 struct btrfs_root *root,
868 struct btrfs_path *path) 626 struct btrfs_path *path,
627 int refs_to_drop)
869{ 628{
870 struct extent_buffer *leaf; 629 struct extent_buffer *leaf;
871 struct btrfs_extent_ref *ref; 630 struct btrfs_extent_ref *ref;
@@ -875,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
875 leaf = path->nodes[0]; 634 leaf = path->nodes[0];
876 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 num_refs = btrfs_ref_num_refs(leaf, ref); 636 num_refs = btrfs_ref_num_refs(leaf, ref);
878 BUG_ON(num_refs == 0); 637 BUG_ON(num_refs < refs_to_drop);
879 num_refs -= 1; 638 num_refs -= refs_to_drop;
880 if (num_refs == 0) { 639 if (num_refs == 0) {
881 ret = btrfs_del_item(trans, root, path); 640 ret = btrfs_del_item(trans, root, path);
882 } else { 641 } else {
@@ -927,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
927#endif 686#endif
928} 687}
929 688
930static noinline int free_extents(struct btrfs_trans_handle *trans,
931 struct btrfs_root *extent_root,
932 struct list_head *del_list)
933{
934 struct btrfs_fs_info *info = extent_root->fs_info;
935 struct btrfs_path *path;
936 struct btrfs_key key, found_key;
937 struct extent_buffer *leaf;
938 struct list_head *cur;
939 struct pending_extent_op *op;
940 struct btrfs_extent_item *ei;
941 int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 u32 refs;
943 u64 bytes_freed = 0;
944
945 path = btrfs_alloc_path();
946 if (!path)
947 return -ENOMEM;
948 path->reada = 1;
949
950search:
951 /* search for the backref for the current ref we want to delete */
952 cur = del_list->next;
953 op = list_entry(cur, struct pending_extent_op, list);
954 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 op->orig_parent,
956 extent_root->root_key.objectid,
957 op->orig_generation, op->level, 1);
958 if (ret) {
959 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 "root %llu gen %llu owner %u\n",
961 (unsigned long long)op->bytenr,
962 (unsigned long long)extent_root->root_key.objectid,
963 (unsigned long long)op->orig_generation, op->level);
964 btrfs_print_leaf(extent_root, path->nodes[0]);
965 WARN_ON(1);
966 goto out;
967 }
968
969 extent_slot = path->slots[0];
970 num_to_del = 1;
971 found_extent = 0;
972
973 /*
974 * if we aren't the first item on the leaf we can move back one and see
975 * if our ref is right next to our extent item
976 */
977 if (likely(extent_slot)) {
978 extent_slot--;
979 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 extent_slot);
981 if (found_key.objectid == op->bytenr &&
982 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 found_key.offset == op->num_bytes) {
984 num_to_del++;
985 found_extent = 1;
986 }
987 }
988
989 /*
990 * if we didn't find the extent we need to delete the backref and then
991 * search for the extent item key so we can update its ref count
992 */
993 if (!found_extent) {
994 key.objectid = op->bytenr;
995 key.type = BTRFS_EXTENT_ITEM_KEY;
996 key.offset = op->num_bytes;
997
998 ret = remove_extent_backref(trans, extent_root, path);
999 BUG_ON(ret);
1000 btrfs_release_path(extent_root, path);
1001 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 BUG_ON(ret);
1003 extent_slot = path->slots[0];
1004 }
1005
1006 /* this is where we update the ref count for the extent */
1007 leaf = path->nodes[0];
1008 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 refs = btrfs_extent_refs(leaf, ei);
1010 BUG_ON(refs == 0);
1011 refs--;
1012 btrfs_set_extent_refs(leaf, ei, refs);
1013
1014 btrfs_mark_buffer_dirty(leaf);
1015
1016 /*
1017 * This extent needs deleting. The reason cur_slot is extent_slot +
1018 * num_to_del is because extent_slot points to the slot where the extent
1019 * is, and if the backref was not right next to the extent we will be
1020 * deleting at least 1 item, and will want to start searching at the
1021 * slot directly next to extent_slot. However if we did find the
1022 * backref next to the extent item them we will be deleting at least 2
1023 * items and will want to start searching directly after the ref slot
1024 */
1025 if (!refs) {
1026 struct list_head *pos, *n, *end;
1027 int cur_slot = extent_slot+num_to_del;
1028 u64 super_used;
1029 u64 root_used;
1030
1031 path->slots[0] = extent_slot;
1032 bytes_freed = op->num_bytes;
1033
1034 mutex_lock(&info->pinned_mutex);
1035 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 op->num_bytes, op->level >=
1037 BTRFS_FIRST_FREE_OBJECTID);
1038 mutex_unlock(&info->pinned_mutex);
1039 BUG_ON(ret < 0);
1040 op->del = ret;
1041
1042 /*
1043 * we need to see if we can delete multiple things at once, so
1044 * start looping through the list of extents we are wanting to
1045 * delete and see if their extent/backref's are right next to
1046 * eachother and the extents only have 1 ref
1047 */
1048 for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 struct pending_extent_op *tmp;
1050
1051 tmp = list_entry(pos, struct pending_extent_op, list);
1052
1053 /* we only want to delete extent+ref at this stage */
1054 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 break;
1056
1057 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 if (found_key.objectid != tmp->bytenr ||
1059 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 found_key.offset != tmp->num_bytes)
1061 break;
1062
1063 /* check to make sure this extent only has one ref */
1064 ei = btrfs_item_ptr(leaf, cur_slot,
1065 struct btrfs_extent_item);
1066 if (btrfs_extent_refs(leaf, ei) != 1)
1067 break;
1068
1069 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 if (found_key.objectid != tmp->bytenr ||
1071 found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 found_key.offset != tmp->orig_parent)
1073 break;
1074
1075 /*
1076 * the ref is right next to the extent, we can set the
1077 * ref count to 0 since we will delete them both now
1078 */
1079 btrfs_set_extent_refs(leaf, ei, 0);
1080
1081 /* pin down the bytes for this extent */
1082 mutex_lock(&info->pinned_mutex);
1083 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 tmp->num_bytes, tmp->level >=
1085 BTRFS_FIRST_FREE_OBJECTID);
1086 mutex_unlock(&info->pinned_mutex);
1087 BUG_ON(ret < 0);
1088
1089 /*
1090 * use the del field to tell if we need to go ahead and
1091 * free up the extent when we delete the item or not.
1092 */
1093 tmp->del = ret;
1094 bytes_freed += tmp->num_bytes;
1095
1096 num_to_del += 2;
1097 cur_slot += 2;
1098 }
1099 end = pos;
1100
1101 /* update the free space counters */
1102 spin_lock(&info->delalloc_lock);
1103 super_used = btrfs_super_bytes_used(&info->super_copy);
1104 btrfs_set_super_bytes_used(&info->super_copy,
1105 super_used - bytes_freed);
1106
1107 root_used = btrfs_root_used(&extent_root->root_item);
1108 btrfs_set_root_used(&extent_root->root_item,
1109 root_used - bytes_freed);
1110 spin_unlock(&info->delalloc_lock);
1111
1112 /* delete the items */
1113 ret = btrfs_del_items(trans, extent_root, path,
1114 path->slots[0], num_to_del);
1115 BUG_ON(ret);
1116
1117 /*
1118 * loop through the extents we deleted and do the cleanup work
1119 * on them
1120 */
1121 for (pos = cur, n = pos->next; pos != end;
1122 pos = n, n = pos->next) {
1123 struct pending_extent_op *tmp;
1124 tmp = list_entry(pos, struct pending_extent_op, list);
1125
1126 /*
1127 * remember tmp->del tells us wether or not we pinned
1128 * down the extent
1129 */
1130 ret = update_block_group(trans, extent_root,
1131 tmp->bytenr, tmp->num_bytes, 0,
1132 tmp->del);
1133 BUG_ON(ret);
1134
1135 list_del_init(&tmp->list);
1136 unlock_extent(&info->extent_ins, tmp->bytenr,
1137 tmp->bytenr + tmp->num_bytes - 1,
1138 GFP_NOFS);
1139 kfree(tmp);
1140 }
1141 } else if (refs && found_extent) {
1142 /*
1143 * the ref and extent were right next to eachother, but the
1144 * extent still has a ref, so just free the backref and keep
1145 * going
1146 */
1147 ret = remove_extent_backref(trans, extent_root, path);
1148 BUG_ON(ret);
1149
1150 list_del_init(&op->list);
1151 unlock_extent(&info->extent_ins, op->bytenr,
1152 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 kfree(op);
1154 } else {
1155 /*
1156 * the extent has multiple refs and the backref we were looking
1157 * for was not right next to it, so just unlock and go next,
1158 * we're good to go
1159 */
1160 list_del_init(&op->list);
1161 unlock_extent(&info->extent_ins, op->bytenr,
1162 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 kfree(op);
1164 }
1165
1166 btrfs_release_path(extent_root, path);
1167 if (!list_empty(del_list))
1168 goto search;
1169
1170out:
1171 btrfs_free_path(path);
1172 return ret;
1173}
1174
1175static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root, u64 bytenr, 690 struct btrfs_root *root, u64 bytenr,
691 u64 num_bytes,
1177 u64 orig_parent, u64 parent, 692 u64 orig_parent, u64 parent,
1178 u64 orig_root, u64 ref_root, 693 u64 orig_root, u64 ref_root,
1179 u64 orig_generation, u64 ref_generation, 694 u64 orig_generation, u64 ref_generation,
1180 u64 owner_objectid) 695 u64 owner_objectid)
1181{ 696{
1182 int ret; 697 int ret;
1183 struct btrfs_root *extent_root = root->fs_info->extent_root; 698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1184 struct btrfs_path *path;
1185
1186 if (root == root->fs_info->extent_root) {
1187 struct pending_extent_op *extent_op;
1188 u64 num_bytes;
1189
1190 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 mutex_lock(&root->fs_info->extent_ins_mutex);
1193 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 u64 priv;
1196 ret = get_state_private(&root->fs_info->extent_ins,
1197 bytenr, &priv);
1198 BUG_ON(ret);
1199 extent_op = (struct pending_extent_op *)
1200 (unsigned long)priv;
1201 BUG_ON(extent_op->parent != orig_parent);
1202 BUG_ON(extent_op->generation != orig_generation);
1203 699
1204 extent_op->parent = parent; 700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1205 extent_op->generation = ref_generation; 701 orig_parent, parent, orig_root,
1206 } else { 702 ref_root, orig_generation,
1207 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 703 ref_generation, owner_objectid, pin);
1208 BUG_ON(!extent_op);
1209
1210 extent_op->type = PENDING_BACKREF_UPDATE;
1211 extent_op->bytenr = bytenr;
1212 extent_op->num_bytes = num_bytes;
1213 extent_op->parent = parent;
1214 extent_op->orig_parent = orig_parent;
1215 extent_op->generation = ref_generation;
1216 extent_op->orig_generation = orig_generation;
1217 extent_op->level = (int)owner_objectid;
1218 INIT_LIST_HEAD(&extent_op->list);
1219 extent_op->del = 0;
1220
1221 set_extent_bits(&root->fs_info->extent_ins,
1222 bytenr, bytenr + num_bytes - 1,
1223 EXTENT_WRITEBACK, GFP_NOFS);
1224 set_state_private(&root->fs_info->extent_ins,
1225 bytenr, (unsigned long)extent_op);
1226 }
1227 mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 return 0;
1229 }
1230
1231 path = btrfs_alloc_path();
1232 if (!path)
1233 return -ENOMEM;
1234 ret = lookup_extent_backref(trans, extent_root, path,
1235 bytenr, orig_parent, orig_root,
1236 orig_generation, owner_objectid, 1);
1237 if (ret)
1238 goto out;
1239 ret = remove_extent_backref(trans, extent_root, path);
1240 if (ret)
1241 goto out;
1242 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 parent, ref_root, ref_generation,
1244 owner_objectid);
1245 BUG_ON(ret); 704 BUG_ON(ret);
1246 finish_current_insert(trans, extent_root, 0);
1247 del_pending_extents(trans, extent_root, 0);
1248out:
1249 btrfs_free_path(path);
1250 return ret; 705 return ret;
1251} 706}
1252 707
1253int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root, u64 bytenr, 709 struct btrfs_root *root, u64 bytenr,
1255 u64 orig_parent, u64 parent, 710 u64 num_bytes, u64 orig_parent, u64 parent,
1256 u64 ref_root, u64 ref_generation, 711 u64 ref_root, u64 ref_generation,
1257 u64 owner_objectid) 712 u64 owner_objectid)
1258{ 713{
@@ -1260,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1260 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 return 0; 717 return 0;
1263 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 718
1264 parent, ref_root, ref_root, 719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1265 ref_generation, ref_generation, 720 orig_parent, parent, ref_root,
1266 owner_objectid); 721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
1267 return ret; 723 return ret;
1268} 724}
1269
1270static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 struct btrfs_root *root, u64 bytenr, 726 struct btrfs_root *root, u64 bytenr,
727 u64 num_bytes,
1272 u64 orig_parent, u64 parent, 728 u64 orig_parent, u64 parent,
1273 u64 orig_root, u64 ref_root, 729 u64 orig_root, u64 ref_root,
1274 u64 orig_generation, u64 ref_generation, 730 u64 orig_generation, u64 ref_generation,
1275 u64 owner_objectid) 731 u64 owner_objectid)
1276{ 732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{
1277 struct btrfs_path *path; 748 struct btrfs_path *path;
1278 int ret; 749 int ret;
1279 struct btrfs_key key; 750 struct btrfs_key key;
@@ -1286,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1286 return -ENOMEM; 757 return -ENOMEM;
1287 758
1288 path->reada = 1; 759 path->reada = 1;
760 path->leave_spinning = 1;
1289 key.objectid = bytenr; 761 key.objectid = bytenr;
1290 key.type = BTRFS_EXTENT_ITEM_KEY; 762 key.type = BTRFS_EXTENT_ITEM_KEY;
1291 key.offset = (u64)-1; 763 key.offset = num_bytes;
1292 764
1293 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 765 /* first find the extent item and update its reference count */
1294 0, 1); 766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1295 if (ret < 0) 767 path, 0, 1);
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
1296 return ret; 770 return ret;
1297 BUG_ON(ret == 0 || path->slots[0] == 0); 771 }
1298 772
1299 path->slots[0]--; 773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
1300 l = path->nodes[0]; 778 l = path->nodes[0];
1301 779
1302 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 780 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1310 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 789
1312 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
1313 refs = btrfs_extent_refs(l, item); 792 refs = btrfs_extent_refs(l, item);
1314 btrfs_set_extent_refs(l, item, refs + 1); 793 btrfs_set_extent_refs(l, item, refs + refs_to_add);
794 btrfs_unlock_up_safe(path, 1);
795
1315 btrfs_mark_buffer_dirty(path->nodes[0]); 796 btrfs_mark_buffer_dirty(path->nodes[0]);
1316 797
1317 btrfs_release_path(root->fs_info->extent_root, path); 798 btrfs_release_path(root->fs_info->extent_root, path);
1318 799
1319 path->reada = 1; 800 path->reada = 1;
801 path->leave_spinning = 1;
802
803 /* now insert the actual backref */
1320 ret = insert_extent_backref(trans, root->fs_info->extent_root, 804 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 path, bytenr, parent, 805 path, bytenr, parent,
1322 ref_root, ref_generation, 806 ref_root, ref_generation,
1323 owner_objectid); 807 owner_objectid, refs_to_add);
1324 BUG_ON(ret); 808 BUG_ON(ret);
1325 finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 del_pending_extents(trans, root->fs_info->extent_root, 0);
1327
1328 btrfs_free_path(path); 809 btrfs_free_path(path);
1329 return 0; 810 return 0;
1330} 811}
@@ -1339,68 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1339 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 820 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 return 0; 822 return 0;
1342 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 823
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1343 0, ref_root, 0, ref_generation, 825 0, ref_root, 0, ref_generation,
1344 owner_objectid); 826 owner_objectid);
1345 return ret; 827 return ret;
1346} 828}
1347 829
1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 830static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_root *root) 831 struct btrfs_root *root,
832 struct btrfs_delayed_ref_node *node)
833{
834 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
836
837 BUG_ON(node->ref_mod == 0);
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
839 node->parent, ref->root, ref->generation,
840 ref->owner_objectid, ref->pin, node->ref_mod);
841
842 return ret;
843}
844
845/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node,
849 int insert_reserved)
1350{ 850{
1351 u64 start;
1352 u64 end;
1353 int ret; 851 int ret;
852 struct btrfs_delayed_ref *ref;
1354 853
1355 while(1) { 854 if (node->parent == (u64)-1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1); 855 struct btrfs_delayed_ref_head *head;
1357 del_pending_extents(trans, root->fs_info->extent_root, 1); 856 /*
857 * we've hit the end of the chain and we were supposed
858 * to insert this extent into the tree. But, it got
859 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting
861 */
862 if (insert_reserved) {
863 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0);
865 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex);
868 return 0;
869 }
1358 870
1359 /* is there more work to do? */ 871 ref = btrfs_delayed_node_to_ref(node);
1360 ret = find_first_extent_bit(&root->fs_info->pending_del, 872 if (ref->action == BTRFS_ADD_DELAYED_REF) {
1361 0, &start, &end, EXTENT_WRITEBACK); 873 if (insert_reserved) {
1362 if (!ret) 874 struct btrfs_key ins;
1363 continue; 875
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins, 876 ins.objectid = node->bytenr;
1365 0, &start, &end, EXTENT_WRITEBACK); 877 ins.offset = node->num_bytes;
1366 if (!ret) 878 ins.type = BTRFS_EXTENT_ITEM_KEY;
1367 continue; 879
1368 break; 880 /* record the full extent allocation */
881 ret = __btrfs_alloc_reserved_extent(trans, root,
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
1369 } 898 }
1370 return 0; 899 return 0;
1371} 900}
1372 901
1373int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 902static noinline struct btrfs_delayed_ref_node *
1374 struct btrfs_root *root, u64 bytenr, 903select_delayed_ref(struct btrfs_delayed_ref_head *head)
1375 u64 num_bytes, u32 *refs)
1376{ 904{
1377 struct btrfs_path *path; 905 struct rb_node *node;
906 struct btrfs_delayed_ref_node *ref;
907 int action = BTRFS_ADD_DELAYED_REF;
908again:
909 /*
910 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
911 * this prevents ref count from going down to zero when
912 * there still are pending delayed ref.
913 */
914 node = rb_prev(&head->node.rb_node);
915 while (1) {
916 if (!node)
917 break;
918 ref = rb_entry(node, struct btrfs_delayed_ref_node,
919 rb_node);
920 if (ref->bytenr != head->node.bytenr)
921 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action)
923 return ref;
924 node = rb_prev(node);
925 }
926 if (action == BTRFS_ADD_DELAYED_REF) {
927 action = BTRFS_DROP_DELAYED_REF;
928 goto again;
929 }
930 return NULL;
931}
932
933static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
934 struct btrfs_root *root,
935 struct list_head *cluster)
936{
937 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL;
1378 int ret; 940 int ret;
1379 struct btrfs_key key; 941 int count = 0;
1380 struct extent_buffer *l; 942 int must_insert_reserved = 0;
1381 struct btrfs_extent_item *item;
1382 943
1383 WARN_ON(num_bytes < root->sectorsize); 944 delayed_refs = &trans->transaction->delayed_refs;
1384 path = btrfs_alloc_path(); 945 while (1) {
1385 path->reada = 1; 946 if (!locked_ref) {
1386 key.objectid = bytenr; 947 /* pick a new head ref from the cluster list */
1387 key.offset = num_bytes; 948 if (list_empty(cluster))
1388 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 949 break;
1389 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 950
1390 0, 0); 951 locked_ref = list_entry(cluster->next,
1391 if (ret < 0) 952 struct btrfs_delayed_ref_head, cluster);
1392 goto out; 953
1393 if (ret != 0) { 954 /* grab the lock that says we are going to process
1394 btrfs_print_leaf(root, path->nodes[0]); 955 * all the refs for this head */
1395 printk(KERN_INFO "btrfs failed to find block number %llu\n", 956 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1396 (unsigned long long)bytenr); 957
1397 BUG(); 958 /*
959 * we may have dropped the spin lock to get the head
960 * mutex lock, and that might have given someone else
961 * time to free the head. If that's true, it has been
962 * removed from our list and we can move on.
963 */
964 if (ret == -EAGAIN) {
965 locked_ref = NULL;
966 count++;
967 continue;
968 }
969 }
970
971 /*
972 * record the must insert reserved flag before we
973 * drop the spin lock.
974 */
975 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0;
977
978 /*
979 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates
981 */
982 ref = select_delayed_ref(locked_ref);
983 if (!ref) {
984 /* All delayed refs have been processed, Go ahead
985 * and send the head node to run_one_delayed_ref,
986 * so that any accounting fixes can happen
987 */
988 ref = &locked_ref->node;
989 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL;
991 }
992
993 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--;
996 spin_unlock(&delayed_refs->lock);
997
998 ret = run_one_delayed_ref(trans, root, ref,
999 must_insert_reserved);
1000 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002
1003 count++;
1004 cond_resched();
1005 spin_lock(&delayed_refs->lock);
1006 }
1007 return count;
1008}
1009
1010/*
1011 * this starts processing the delayed reference count updates and
1012 * extent insertions we have queued up so far. count can be
1013 * 0, which means to process everything in the tree at the start
1014 * of the run (but not newly added entries), or it can be some target
1015 * number you'd like to process.
1016 */
1017int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, unsigned long count)
1019{
1020 struct rb_node *node;
1021 struct btrfs_delayed_ref_root *delayed_refs;
1022 struct btrfs_delayed_ref_node *ref;
1023 struct list_head cluster;
1024 int ret;
1025 int run_all = count == (unsigned long)-1;
1026 int run_most = 0;
1027
1028 if (root == root->fs_info->extent_root)
1029 root = root->fs_info->tree_root;
1030
1031 delayed_refs = &trans->transaction->delayed_refs;
1032 INIT_LIST_HEAD(&cluster);
1033again:
1034 spin_lock(&delayed_refs->lock);
1035 if (count == 0) {
1036 count = delayed_refs->num_entries * 2;
1037 run_most = 1;
1038 }
1039 while (1) {
1040 if (!(run_all || run_most) &&
1041 delayed_refs->num_heads_ready < 64)
1042 break;
1043
1044 /*
1045 * go find something we can process in the rbtree. We start at
1046 * the beginning of the tree, and then build a cluster
1047 * of refs to process starting at the first one we are able to
1048 * lock
1049 */
1050 ret = btrfs_find_ref_cluster(trans, &cluster,
1051 delayed_refs->run_delayed_start);
1052 if (ret)
1053 break;
1054
1055 ret = run_clustered_refs(trans, root, &cluster);
1056 BUG_ON(ret < 0);
1057
1058 count -= min_t(unsigned long, ret, count);
1059
1060 if (count == 0)
1061 break;
1062 }
1063
1064 if (run_all) {
1065 node = rb_first(&delayed_refs->root);
1066 if (!node)
1067 goto out;
1068 count = (unsigned long)-1;
1069
1070 while (node) {
1071 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1072 rb_node);
1073 if (btrfs_delayed_ref_is_head(ref)) {
1074 struct btrfs_delayed_ref_head *head;
1075
1076 head = btrfs_delayed_node_to_head(ref);
1077 atomic_inc(&ref->refs);
1078
1079 spin_unlock(&delayed_refs->lock);
1080 mutex_lock(&head->mutex);
1081 mutex_unlock(&head->mutex);
1082
1083 btrfs_put_delayed_ref(ref);
1084 cond_resched();
1085 goto again;
1086 }
1087 node = rb_next(node);
1088 }
1089 spin_unlock(&delayed_refs->lock);
1090 schedule_timeout(1);
1091 goto again;
1092	}
1093out:
1094	spin_unlock(&delayed_refs->lock);
1095	return 0;
1096}
1097
1398	}
1399	l = path->nodes[0];
1400	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401	*refs = btrfs_extent_refs(l, item);
1402out:
1403	btrfs_free_path(path);
1404	return 0;
1405}
1406
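btrfs_run_delayed_refs() above interprets its count argument three ways. The helper below is invented purely to spell that out; RUN_ALL and effective_target() are not kernel names.

    #include <stdio.h>

    #define RUN_ALL ((unsigned long)-1)

    /* num_entries stands in for delayed_refs->num_entries */
    static unsigned long effective_target(unsigned long count,
                                          unsigned long num_entries)
    {
            if (count == RUN_ALL)
                    return RUN_ALL;          /* drain, then recheck for new heads */
            if (count == 0)
                    return num_entries * 2;  /* bounded snapshot of current work */
            return count;                    /* caller-supplied target */
    }

    int main(void)
    {
            printf("%lu\n", effective_target(0, 128));   /* 256 */
            printf("%lu\n", effective_target(64, 128));  /* 64 */
            printf("run_all: %d\n",
                   effective_target(RUN_ALL, 128) == RUN_ALL);
            return 0;
    }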
@@ -1624,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1624 int refi = 0; 1315 int refi = 0;
1625 int slot; 1316 int slot;
1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 u64, u64, u64, u64, u64, u64, u64, u64); 1318 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1628 1319
1629 ref_root = btrfs_header_owner(buf); 1320 ref_root = btrfs_header_owner(buf);
1630 ref_generation = btrfs_header_generation(buf); 1321 ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1696 1387
1697 if (level == 0) { 1388 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot); 1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1699 1396
1700 ret = process_func(trans, root, bytenr, 1397 ret = process_func(trans, root, bytenr,
1701 orig_buf->start, buf->start, 1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1702 orig_root, ref_root, 1399 orig_buf->start, buf->start,
1703 orig_generation, ref_generation, 1400 orig_root, ref_root,
1704 key.objectid); 1401 orig_generation, ref_generation,
1402 key.objectid);
1705 1403
1706 if (ret) { 1404 if (ret) {
1707 faili = slot; 1405 faili = slot;
@@ -1709,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1709 goto fail; 1407 goto fail;
1710 } 1408 }
1711 } else { 1409 } else {
1712 ret = process_func(trans, root, bytenr, 1410 ret = process_func(trans, root, bytenr, buf->len,
1713 orig_buf->start, buf->start, 1411 orig_buf->start, buf->start,
1714 orig_root, ref_root, 1412 orig_root, ref_root,
1715 orig_generation, ref_generation, 1413 orig_generation, ref_generation,
@@ -1786,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1786 if (bytenr == 0) 1484 if (bytenr == 0)
1787 continue; 1485 continue;
1788 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 orig_buf->start, buf->start, 1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1790 orig_root, ref_root, 1488 orig_buf->start, buf->start,
1791 orig_generation, ref_generation, 1489 orig_root, ref_root, orig_generation,
1792 key.objectid); 1490 ref_generation, key.objectid);
1793 if (ret) 1491 if (ret)
1794 goto fail; 1492 goto fail;
1795 } else { 1493 } else {
1796 bytenr = btrfs_node_blockptr(buf, slot); 1494 bytenr = btrfs_node_blockptr(buf, slot);
1797 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 orig_buf->start, buf->start, 1496 buf->len, orig_buf->start,
1799 orig_root, ref_root, 1497 buf->start, orig_root, ref_root,
1800 orig_generation, ref_generation, 1498 orig_generation, ref_generation,
1801 level - 1); 1499 level - 1);
1802 if (ret) 1500 if (ret)
@@ -1815,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1815 struct btrfs_block_group_cache *cache) 1513 struct btrfs_block_group_cache *cache)
1816{ 1514{
1817 int ret; 1515 int ret;
1818 int pending_ret;
1819 struct btrfs_root *extent_root = root->fs_info->extent_root; 1516 struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 unsigned long bi; 1517 unsigned long bi;
1821 struct extent_buffer *leaf; 1518 struct extent_buffer *leaf;
@@ -1831,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1831 btrfs_mark_buffer_dirty(leaf); 1528 btrfs_mark_buffer_dirty(leaf);
1832 btrfs_release_path(extent_root, path); 1529 btrfs_release_path(extent_root, path);
1833fail: 1530fail:
1834 finish_current_insert(trans, extent_root, 0);
1835 pending_ret = del_pending_extents(trans, extent_root, 0);
1836 if (ret) 1531 if (ret)
1837 return ret; 1532 return ret;
1838 if (pending_ret)
1839 return pending_ret;
1840 return 0; 1533 return 0;
1841 1534
1842} 1535}
@@ -1900,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1900 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1901 readonly = 1; 1594 readonly = 1;
1902 if (block_group) 1595 if (block_group)
1903 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1904 return readonly; 1597 return readonly;
1905} 1598}
1906 1599
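The put_block_group() to btrfs_put_block_group() rename in this hunk is mechanical, but the discipline it serves is worth making explicit: every lookup bumps the group's atomic count, and every exit path must drop it. A minimal single-threaded sketch of that pairing, with a plain int standing in for the kernel's atomic_t:

    #include <stdio.h>

    struct block_group {
            int count;
    };

    static void get_group(struct block_group *bg)
    {
            bg->count++;               /* lookup takes a reference */
    }

    static void put_group(struct block_group *bg)
    {
            if (--bg->count == 0)      /* last put frees the group */
                    printf("group freed\n");
    }

    int main(void)
    {
            struct block_group bg = { .count = 1 };   /* creation ref */

            get_group(&bg);    /* e.g. a lookup by bytenr */
            put_group(&bg);    /* every exit path pairs the get */
            put_group(&bg);    /* dropping the creation ref frees */
            return 0;
    }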
@@ -2151,10 +1844,14 @@ again:
2151 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 ", %llu bytes_used, %llu bytes_reserved, " 1845 ", %llu bytes_used, %llu bytes_reserved, "
2153 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2154 "%llu total\n", bytes, data_sinfo->bytes_delalloc, 1847 "%llu total\n", (unsigned long long)bytes,
2155 data_sinfo->bytes_used, data_sinfo->bytes_reserved, 1848 (unsigned long long)data_sinfo->bytes_delalloc,
2156 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, 1849 (unsigned long long)data_sinfo->bytes_used,
2157 data_sinfo->bytes_may_use, data_sinfo->total_bytes); 1850 (unsigned long long)data_sinfo->bytes_reserved,
1851 (unsigned long long)data_sinfo->bytes_pinned,
1852 (unsigned long long)data_sinfo->bytes_readonly,
1853 (unsigned long long)data_sinfo->bytes_may_use,
1854 (unsigned long long)data_sinfo->total_bytes);
2158 return -ENOSPC; 1855 return -ENOSPC;
2159 } 1856 }
2160 data_sinfo->bytes_may_use += bytes; 1857 data_sinfo->bytes_may_use += bytes;
@@ -2225,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2225 spin_unlock(&info->lock); 1922 spin_unlock(&info->lock);
2226} 1923}
2227 1924
1925static void force_metadata_allocation(struct btrfs_fs_info *info)
1926{
1927 struct list_head *head = &info->space_info;
1928 struct btrfs_space_info *found;
1929
1930 rcu_read_lock();
1931 list_for_each_entry_rcu(found, head, list) {
1932 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
1933 found->force_alloc = 1;
1934 }
1935 rcu_read_unlock();
1936}
1937
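force_metadata_allocation() can get away with rcu_read_lock() because the walk only flips a flag and never sleeps. A user-space rendering of the same walk, minus the RCU machinery; struct space_info and the flag value below are simplified stand-ins rather than the kernel definitions.

    #include <stdbool.h>
    #include <stdio.h>

    #define BLOCK_GROUP_METADATA 0x4   /* illustrative flag value */

    struct space_info {
            struct space_info *next;
            unsigned flags;
            bool force_alloc;
    };

    /* same shape as force_metadata_allocation(): a walk that only
     * sets a flag, which is why a read-side RCU lock suffices in
     * the kernel version instead of a sleeping lock */
    static void force_metadata(struct space_info *head)
    {
            for (struct space_info *s = head; s; s = s->next)
                    if (s->flags & BLOCK_GROUP_METADATA)
                            s->force_alloc = true;
    }

    int main(void)
    {
            struct space_info data = { NULL, 0x1, false };
            struct space_info meta = { &data, BLOCK_GROUP_METADATA, false };

            force_metadata(&meta);
            printf("meta=%d data=%d\n", meta.force_alloc, data.force_alloc);
            return 0;
    }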
2228static int do_chunk_alloc(struct btrfs_trans_handle *trans, 1938static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2229 struct btrfs_root *extent_root, u64 alloc_bytes, 1939 struct btrfs_root *extent_root, u64 alloc_bytes,
2230 u64 flags, int force) 1940 u64 flags, int force)
2231{ 1941{
2232 struct btrfs_space_info *space_info; 1942 struct btrfs_space_info *space_info;
1943 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2233 u64 thresh; 1944 u64 thresh;
2234 int ret = 0; 1945 int ret = 0;
2235 1946
2236 mutex_lock(&extent_root->fs_info->chunk_mutex); 1947 mutex_lock(&fs_info->chunk_mutex);
2237 1948
2238 flags = btrfs_reduce_alloc_profile(extent_root, flags); 1949 flags = btrfs_reduce_alloc_profile(extent_root, flags);
2239 1950
@@ -2265,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2265 } 1976 }
2266 spin_unlock(&space_info->lock); 1977 spin_unlock(&space_info->lock);
2267 1978
1979 /*
1980 * if we're doing a data chunk, go ahead and make sure that
1981 * we keep a reasonable number of metadata chunks allocated in the
1982 * FS as well.
1983 */
1984 if (flags & BTRFS_BLOCK_GROUP_DATA) {
1985 fs_info->data_chunk_allocations++;
1986 if (!(fs_info->data_chunk_allocations %
1987 fs_info->metadata_ratio))
1988 force_metadata_allocation(fs_info);
1989 }
1990
2268 ret = btrfs_alloc_chunk(trans, extent_root, flags); 1991 ret = btrfs_alloc_chunk(trans, extent_root, flags);
2269 if (ret) 1992 if (ret)
2270 space_info->full = 1; 1993 space_info->full = 1;
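The new data_chunk_allocations counter forces a metadata chunk on every metadata_ratio-th data chunk allocation. The arithmetic, with a ratio of 8 assumed here purely for illustration:

    #include <stdio.h>

    int main(void)
    {
            unsigned ratio = 8;   /* stand-in for fs_info->metadata_ratio */
            unsigned data_chunk_allocations = 0;

            for (int i = 0; i < 20; i++) {
                    data_chunk_allocations++;
                    /* same test as do_chunk_alloc(): every ratio-th
                     * data chunk also forces metadata allocation */
                    if (!(data_chunk_allocations % ratio))
                            printf("allocation %u forces metadata\n",
                                   data_chunk_allocations);
            }
            return 0;
    }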
@@ -2324,7 +2047,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2324 WARN_ON(ret); 2047 WARN_ON(ret);
2325 } 2048 }
2326 } 2049 }
2327 put_block_group(cache); 2050 btrfs_put_block_group(cache);
2328 total -= num_bytes; 2051 total -= num_bytes;
2329 bytenr += num_bytes; 2052 bytenr += num_bytes;
2330 } 2053 }
@@ -2341,7 +2064,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2341 return 0; 2064 return 0;
2342 2065
2343 bytenr = cache->key.objectid; 2066 bytenr = cache->key.objectid;
2344 put_block_group(cache); 2067 btrfs_put_block_group(cache);
2345 2068
2346 return bytenr; 2069 return bytenr;
2347} 2070}
@@ -2353,7 +2076,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2353 struct btrfs_block_group_cache *cache; 2076 struct btrfs_block_group_cache *cache;
2354 struct btrfs_fs_info *fs_info = root->fs_info; 2077 struct btrfs_fs_info *fs_info = root->fs_info;
2355 2078
2356 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2357 if (pin) { 2079 if (pin) {
2358 set_extent_dirty(&fs_info->pinned_extents, 2080 set_extent_dirty(&fs_info->pinned_extents,
2359 bytenr, bytenr + num - 1, GFP_NOFS); 2081 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2361,6 +2083,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2361 clear_extent_dirty(&fs_info->pinned_extents, 2083 clear_extent_dirty(&fs_info->pinned_extents,
2362 bytenr, bytenr + num - 1, GFP_NOFS); 2084 bytenr, bytenr + num - 1, GFP_NOFS);
2363 } 2085 }
2086
2364 while (num > 0) { 2087 while (num > 0) {
2365 cache = btrfs_lookup_block_group(fs_info, bytenr); 2088 cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 BUG_ON(!cache); 2089 BUG_ON(!cache);
@@ -2385,7 +2108,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2385 if (cache->cached) 2108 if (cache->cached)
2386 btrfs_add_free_space(cache, bytenr, len); 2109 btrfs_add_free_space(cache, bytenr, len);
2387 } 2110 }
2388 put_block_group(cache); 2111 btrfs_put_block_group(cache);
2389 bytenr += len; 2112 bytenr += len;
2390 num -= len; 2113 num -= len;
2391 } 2114 }
@@ -2416,7 +2139,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2416 } 2139 }
2417 spin_unlock(&cache->lock); 2140 spin_unlock(&cache->lock);
2418 spin_unlock(&cache->space_info->lock); 2141 spin_unlock(&cache->space_info->lock);
2419 put_block_group(cache); 2142 btrfs_put_block_group(cache);
2420 bytenr += len; 2143 bytenr += len;
2421 num -= len; 2144 num -= len;
2422 } 2145 }
@@ -2431,7 +2154,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2431 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2154 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2432 int ret; 2155 int ret;
2433 2156
2434 mutex_lock(&root->fs_info->pinned_mutex);
2435 while (1) { 2157 while (1) {
2436 ret = find_first_extent_bit(pinned_extents, last, 2158 ret = find_first_extent_bit(pinned_extents, last,
2437 &start, &end, EXTENT_DIRTY); 2159 &start, &end, EXTENT_DIRTY);
@@ -2440,7 +2162,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2440 set_extent_dirty(copy, start, end, GFP_NOFS); 2162 set_extent_dirty(copy, start, end, GFP_NOFS);
2441 last = end + 1; 2163 last = end + 1;
2442 } 2164 }
2443 mutex_unlock(&root->fs_info->pinned_mutex);
2444 return 0; 2165 return 0;
2445} 2166}
2446 2167
@@ -2452,7 +2173,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2452 u64 end; 2173 u64 end;
2453 int ret; 2174 int ret;
2454 2175
2455 mutex_lock(&root->fs_info->pinned_mutex);
2456 while (1) { 2176 while (1) {
2457 ret = find_first_extent_bit(unpin, 0, &start, &end, 2177 ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 EXTENT_DIRTY); 2178 EXTENT_DIRTY);
@@ -2461,209 +2181,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2461 2181
2462 ret = btrfs_discard_extent(root, start, end + 1 - start); 2182 ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 2183
2184 /* unlocks the pinned mutex */
2464 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2185 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2186 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 2187
2188	cond_resched();
2189	}
2190	return ret;
2191}
2192
2467	if (need_resched()) {
2468	mutex_unlock(&root->fs_info->pinned_mutex);
2469	cond_resched();
2470	mutex_lock(&root->fs_info->pinned_mutex);
2471	}
2472	}
2473	mutex_unlock(&root->fs_info->pinned_mutex);
2474	return ret;
2475}
2476
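Both btrfs_copy_pinned() and btrfs_finish_extent_commit() share one loop shape: ask for the first dirty range at or beyond last, handle it, then continue from end + 1. A toy bitmap version of that iteration; first_dirty_run() is an invented stand-in for find_first_extent_bit().

    #include <stdio.h>

    #define NBITS 32

    /* find the first set run at or after 'from'; returns 0 and fills
     * [*start, *end] on success, nonzero when nothing is left */
    static int first_dirty_run(unsigned bits, unsigned from,
                               unsigned *start, unsigned *end)
    {
            unsigned i = from;

            while (i < NBITS && !(bits & (1u << i)))
                    i++;
            if (i == NBITS)
                    return 1;
            *start = i;
            while (i + 1 < NBITS && (bits & (1u << (i + 1))))
                    i++;
            *end = i;
            return 0;
    }

    int main(void)
    {
            unsigned dirty = 0x0f30;   /* two runs: bits 4-5 and 8-11 */
            unsigned last = 0, start, end;

            /* same loop shape as btrfs_finish_extent_commit() */
            while (!first_dirty_run(dirty, last, &start, &end)) {
                    printf("run [%u, %u]\n", start, end);
                    last = end + 1;
            }
            return 0;
    }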
2477static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 struct btrfs_root *extent_root, int all)
2479{
2480 u64 start;
2481 u64 end;
2482 u64 priv;
2483 u64 search = 0;
2484 struct btrfs_fs_info *info = extent_root->fs_info;
2485 struct btrfs_path *path;
2486 struct pending_extent_op *extent_op, *tmp;
2487 struct list_head insert_list, update_list;
2488 int ret;
2489 int num_inserts = 0, max_inserts, restart = 0;
2490
2491 path = btrfs_alloc_path();
2492 INIT_LIST_HEAD(&insert_list);
2493 INIT_LIST_HEAD(&update_list);
2494
2495 max_inserts = extent_root->leafsize /
2496 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 sizeof(struct btrfs_extent_ref) +
2498 sizeof(struct btrfs_extent_item));
2499again:
2500 mutex_lock(&info->extent_ins_mutex);
2501 while (1) {
2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 &end, EXTENT_WRITEBACK);
2504 if (ret) {
2505 if (restart && !num_inserts &&
2506 list_empty(&update_list)) {
2507 restart = 0;
2508 search = 0;
2509 continue;
2510 }
2511 break;
2512 }
2513
2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 if (!ret) {
2516 if (all)
2517 restart = 1;
2518 search = end + 1;
2519 if (need_resched()) {
2520 mutex_unlock(&info->extent_ins_mutex);
2521 cond_resched();
2522 mutex_lock(&info->extent_ins_mutex);
2523 }
2524 continue;
2525 }
2526
2527 ret = get_state_private(&info->extent_ins, start, &priv);
2528 BUG_ON(ret);
2529 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530
2531 if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 num_inserts++;
2533 list_add_tail(&extent_op->list, &insert_list);
2534 search = end + 1;
2535 if (num_inserts == max_inserts) {
2536 restart = 1;
2537 break;
2538 }
2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 list_add_tail(&extent_op->list, &update_list);
2541 search = end + 1;
2542 } else {
2543 BUG();
2544 }
2545 }
2546
2547 /*
2548 * process the update list, clear the writeback bit for it, and if
2549 * somebody marked this thing for deletion then just unlock it and be
2550 * done, the free_extents will handle it
2551 */
2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 extent_op->bytenr + extent_op->num_bytes - 1,
2555 EXTENT_WRITEBACK, GFP_NOFS);
2556 if (extent_op->del) {
2557 list_del_init(&extent_op->list);
2558 unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 extent_op->bytenr + extent_op->num_bytes
2560 - 1, GFP_NOFS);
2561 kfree(extent_op);
2562 }
2563 }
2564 mutex_unlock(&info->extent_ins_mutex);
2565
2566 /*
2567	 * still have things left on the update list, go ahead and update
2568 * everything
2569 */
2570 if (!list_empty(&update_list)) {
2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2577 }
2578
2579 /*
2580 * if no inserts need to be done, but we skipped some extents and we
2581 * need to make sure everything is cleaned then reset everything and
2582 * go back to the beginning
2583 */
2584 if (!num_inserts && restart) {
2585 search = 0;
2586 restart = 0;
2587 INIT_LIST_HEAD(&update_list);
2588 INIT_LIST_HEAD(&insert_list);
2589 goto again;
2590 } else if (!num_inserts) {
2591 goto out;
2592 }
2593
2594 /*
2595 * process the insert extents list. Again if we are deleting this
2596 * extent, then just unlock it, pin down the bytes if need be, and be
2597 * done with it. Saves us from having to actually insert the extent
2598 * into the tree and then subsequently come along and delete it
2599 */
2600 mutex_lock(&info->extent_ins_mutex);
2601 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 extent_op->bytenr + extent_op->num_bytes - 1,
2604 EXTENT_WRITEBACK, GFP_NOFS);
2605 if (extent_op->del) {
2606 u64 used;
2607 list_del_init(&extent_op->list);
2608 unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 extent_op->bytenr + extent_op->num_bytes
2610 - 1, GFP_NOFS);
2611
2612 mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 ret = pin_down_bytes(trans, extent_root,
2614 extent_op->bytenr,
2615 extent_op->num_bytes, 0);
2616 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617
2618 spin_lock(&info->delalloc_lock);
2619 used = btrfs_super_bytes_used(&info->super_copy);
2620 btrfs_set_super_bytes_used(&info->super_copy,
2621 used - extent_op->num_bytes);
2622 used = btrfs_root_used(&extent_root->root_item);
2623 btrfs_set_root_used(&extent_root->root_item,
2624 used - extent_op->num_bytes);
2625 spin_unlock(&info->delalloc_lock);
2626
2627 ret = update_block_group(trans, extent_root,
2628 extent_op->bytenr,
2629 extent_op->num_bytes,
2630 0, ret > 0);
2631 BUG_ON(ret);
2632 kfree(extent_op);
2633 num_inserts--;
2634 }
2635 }
2636 mutex_unlock(&info->extent_ins_mutex);
2637
2638 ret = insert_extents(trans, extent_root, path, &insert_list,
2639 num_inserts);
2640 BUG_ON(ret);
2641
2642 /*
2643 * if restart is set for whatever reason we need to go back and start
2644 * searching through the pending list again.
2645 *
2646 * We just inserted some extents, which could have resulted in new
2647 * blocks being allocated, which would result in new blocks needing
2648 * updates, so if all is set we _must_ restart to get the updated
2649 * blocks.
2650 */
2651 if (restart || all) {
2652 INIT_LIST_HEAD(&insert_list);
2653 INIT_LIST_HEAD(&update_list);
2654 search = 0;
2655 restart = 0;
2656 num_inserts = 0;
2657 goto again;
2658 }
2659out:
2660 btrfs_free_path(path);
2661 return 0;
2662}
2663
2664static int pin_down_bytes(struct btrfs_trans_handle *trans, 2193static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 struct btrfs_root *root, 2194 struct btrfs_root *root,
2666 u64 bytenr, u64 num_bytes, int is_data) 2195 struct btrfs_path *path,
2196 u64 bytenr, u64 num_bytes, int is_data,
2197 struct extent_buffer **must_clean)
2667{ 2198{
2668 int err = 0; 2199 int err = 0;
2669 struct extent_buffer *buf; 2200 struct extent_buffer *buf;
@@ -2686,17 +2217,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2686 u64 header_transid = btrfs_header_generation(buf); 2217 u64 header_transid = btrfs_header_generation(buf);
2687 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2218 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2219 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2220 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2689 header_transid == trans->transid && 2221 header_transid == trans->transid &&
2690 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2222 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 clean_tree_block(NULL, root, buf); 2223 *must_clean = buf;
2692 btrfs_tree_unlock(buf);
2693 free_extent_buffer(buf);
2694 return 1; 2224 return 1;
2695 } 2225 }
2696 btrfs_tree_unlock(buf); 2226 btrfs_tree_unlock(buf);
2697 } 2227 }
2698 free_extent_buffer(buf); 2228 free_extent_buffer(buf);
2699pinit: 2229pinit:
2230 btrfs_set_path_blocking(path);
2231 /* unlocks the pinned mutex */
2700 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2232 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 2233
2702 BUG_ON(err < 0); 2234 BUG_ON(err < 0);
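pin_down_bytes() no longer frees the reusable buffer itself; it hands it back through the new must_clean out-parameter so the caller can clean it once its locks are safe to drop. The shape of that contract, reduced to a user-space sketch with invented types:

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer {
            int id;
    };

    /* like pin_down_bytes(): instead of cleaning the buffer while the
     * caller may still hold tree locks, report it via an out-param */
    static int pin_bytes(int reusable, struct buffer *buf,
                         struct buffer **must_clean)
    {
            if (reusable) {
                    *must_clean = buf;
                    return 1;          /* caller should mark space free */
            }
            return 0;
    }

    int main(void)
    {
            struct buffer *buf = malloc(sizeof(*buf));
            struct buffer *must_clean = NULL;

            buf->id = 42;
            if (pin_bytes(1, buf, &must_clean) > 0 && must_clean) {
                    /* deferred cleanup where blocking is safe */
                    printf("cleaning buffer %d\n", must_clean->id);
                    free(must_clean);
            }
            return 0;
    }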
@@ -2710,7 +2242,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2710 struct btrfs_root *root, 2242 struct btrfs_root *root,
2711 u64 bytenr, u64 num_bytes, u64 parent, 2243 u64 bytenr, u64 num_bytes, u64 parent,
2712 u64 root_objectid, u64 ref_generation, 2244 u64 root_objectid, u64 ref_generation,
2713 u64 owner_objectid, int pin, int mark_free) 2245 u64 owner_objectid, int pin, int mark_free,
2246 int refs_to_drop)
2714{ 2247{
2715 struct btrfs_path *path; 2248 struct btrfs_path *path;
2716 struct btrfs_key key; 2249 struct btrfs_key key;
@@ -2732,6 +2265,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2732 return -ENOMEM; 2265 return -ENOMEM;
2733 2266
2734 path->reada = 1; 2267 path->reada = 1;
2268 path->leave_spinning = 1;
2735 ret = lookup_extent_backref(trans, extent_root, path, 2269 ret = lookup_extent_backref(trans, extent_root, path,
2736 bytenr, parent, root_objectid, 2270 bytenr, parent, root_objectid,
2737 ref_generation, owner_objectid, 1); 2271 ref_generation, owner_objectid, 1);
@@ -2753,9 +2287,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2753 break; 2287 break;
2754 } 2288 }
2755 if (!found_extent) { 2289 if (!found_extent) {
2756 ret = remove_extent_backref(trans, extent_root, path); 2290 ret = remove_extent_backref(trans, extent_root, path,
2291 refs_to_drop);
2757 BUG_ON(ret); 2292 BUG_ON(ret);
2758 btrfs_release_path(extent_root, path); 2293 btrfs_release_path(extent_root, path);
2294 path->leave_spinning = 1;
2759 ret = btrfs_search_slot(trans, extent_root, 2295 ret = btrfs_search_slot(trans, extent_root,
2760 &key, path, -1, 1); 2296 &key, path, -1, 1);
2761 if (ret) { 2297 if (ret) {
@@ -2771,8 +2307,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2771 btrfs_print_leaf(extent_root, path->nodes[0]); 2307 btrfs_print_leaf(extent_root, path->nodes[0]);
2772 WARN_ON(1); 2308 WARN_ON(1);
2773 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2309 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2774 "root %llu gen %llu owner %llu\n", 2310 "parent %llu root %llu gen %llu owner %llu\n",
2775 (unsigned long long)bytenr, 2311 (unsigned long long)bytenr,
2312 (unsigned long long)parent,
2776 (unsigned long long)root_objectid, 2313 (unsigned long long)root_objectid,
2777 (unsigned long long)ref_generation, 2314 (unsigned long long)ref_generation,
2778 (unsigned long long)owner_objectid); 2315 (unsigned long long)owner_objectid);
@@ -2782,17 +2319,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2782 ei = btrfs_item_ptr(leaf, extent_slot, 2319 ei = btrfs_item_ptr(leaf, extent_slot,
2783 struct btrfs_extent_item); 2320 struct btrfs_extent_item);
2784 refs = btrfs_extent_refs(leaf, ei); 2321 refs = btrfs_extent_refs(leaf, ei);
2785 BUG_ON(refs == 0);
2786 refs -= 1;
2787 btrfs_set_extent_refs(leaf, ei, refs);
2788 2322
2323 /*
2324 * we're not allowed to delete the extent item if there
2325 * are other delayed ref updates pending
2326 */
2327
2328 BUG_ON(refs < refs_to_drop);
2329 refs -= refs_to_drop;
2330 btrfs_set_extent_refs(leaf, ei, refs);
2789 btrfs_mark_buffer_dirty(leaf); 2331 btrfs_mark_buffer_dirty(leaf);
2790 2332
2791 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2333 if (refs == 0 && found_extent &&
2334 path->slots[0] == extent_slot + 1) {
2792 struct btrfs_extent_ref *ref; 2335 struct btrfs_extent_ref *ref;
2793 ref = btrfs_item_ptr(leaf, path->slots[0], 2336 ref = btrfs_item_ptr(leaf, path->slots[0],
2794 struct btrfs_extent_ref); 2337 struct btrfs_extent_ref);
2795 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2338 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2796 /* if the back ref and the extent are next to each other 2339 /* if the back ref and the extent are next to each other
2797 * they get deleted below in one shot 2340 * they get deleted below in one shot
2798 */ 2341 */
@@ -2800,11 +2343,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2800 num_to_del = 2; 2343 num_to_del = 2;
2801 } else if (found_extent) { 2344 } else if (found_extent) {
2802 /* otherwise delete the extent back ref */ 2345 /* otherwise delete the extent back ref */
2803 ret = remove_extent_backref(trans, extent_root, path); 2346 ret = remove_extent_backref(trans, extent_root, path,
2347 refs_to_drop);
2804 BUG_ON(ret); 2348 BUG_ON(ret);
2805 /* if refs are 0, we need to setup the path for deletion */ 2349 /* if refs are 0, we need to setup the path for deletion */
2806 if (refs == 0) { 2350 if (refs == 0) {
2807 btrfs_release_path(extent_root, path); 2351 btrfs_release_path(extent_root, path);
2352 path->leave_spinning = 1;
2808 ret = btrfs_search_slot(trans, extent_root, &key, path, 2353 ret = btrfs_search_slot(trans, extent_root, &key, path,
2809 -1, 1); 2354 -1, 1);
2810 BUG_ON(ret); 2355 BUG_ON(ret);
@@ -2814,16 +2359,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2814 if (refs == 0) { 2359 if (refs == 0) {
2815 u64 super_used; 2360 u64 super_used;
2816 u64 root_used; 2361 u64 root_used;
2362 struct extent_buffer *must_clean = NULL;
2817 2363
2818 if (pin) { 2364 if (pin) {
2819 mutex_lock(&root->fs_info->pinned_mutex); 2365 ret = pin_down_bytes(trans, root, path,
2820 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2366 bytenr, num_bytes,
2821 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2367 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2822 mutex_unlock(&root->fs_info->pinned_mutex); 2368 &must_clean);
2823 if (ret > 0) 2369 if (ret > 0)
2824 mark_free = 1; 2370 mark_free = 1;
2825 BUG_ON(ret < 0); 2371 BUG_ON(ret < 0);
2826 } 2372 }
2373
2827 /* block accounting for super block */ 2374 /* block accounting for super block */
2828 spin_lock(&info->delalloc_lock); 2375 spin_lock(&info->delalloc_lock);
2829 super_used = btrfs_super_bytes_used(&info->super_copy); 2376 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2382,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2835 btrfs_set_root_used(&root->root_item, 2382 btrfs_set_root_used(&root->root_item,
2836 root_used - num_bytes); 2383 root_used - num_bytes);
2837 spin_unlock(&info->delalloc_lock); 2384 spin_unlock(&info->delalloc_lock);
2385
2386 /*
2387 * it is going to be very rare for someone to be waiting
2388 * on the block we're freeing. del_items might need to
2389 * schedule, so rather than get fancy, just force it
2390 * to blocking here
2391 */
2392 if (must_clean)
2393 btrfs_set_lock_blocking(must_clean);
2394
2838 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2395 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2839 num_to_del); 2396 num_to_del);
2840 BUG_ON(ret); 2397 BUG_ON(ret);
2841 btrfs_release_path(extent_root, path); 2398 btrfs_release_path(extent_root, path);
2842 2399
2400 if (must_clean) {
2401 clean_tree_block(NULL, root, must_clean);
2402 btrfs_tree_unlock(must_clean);
2403 free_extent_buffer(must_clean);
2404 }
2405
2843 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2406 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2844 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2407 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2845 BUG_ON(ret); 2408 BUG_ON(ret);
2409 } else {
2410 invalidate_mapping_pages(info->btree_inode->i_mapping,
2411 bytenr >> PAGE_CACHE_SHIFT,
2412 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2846 } 2413 }
2847 2414
2848 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2415 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2417,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2850 BUG_ON(ret); 2417 BUG_ON(ret);
2851 } 2418 }
2852 btrfs_free_path(path); 2419 btrfs_free_path(path);
2853 finish_current_insert(trans, extent_root, 0);
2854 return ret; 2420 return ret;
2855} 2421}
2856 2422
2857/* 2423/*
2858 * find all the blocks marked as pending in the radix tree and remove 2424 * remove an extent from the root, returns 0 on success
2859 * them from the extent map
2860 */ 2425 */
2861static int del_pending_extents(struct btrfs_trans_handle *trans, 2426static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2862 struct btrfs_root *extent_root, int all) 2427 struct btrfs_root *root,
2428 u64 bytenr, u64 num_bytes, u64 parent,
2429 u64 root_objectid, u64 ref_generation,
2430 u64 owner_objectid, int pin,
2431 int refs_to_drop)
2863{ 2432{
2864 int ret; 2433 WARN_ON(num_bytes < root->sectorsize);
2865 int err = 0;
2866 u64 start;
2867 u64 end;
2868 u64 priv;
2869 u64 search = 0;
2870 int nr = 0, skipped = 0;
2871 struct extent_io_tree *pending_del;
2872 struct extent_io_tree *extent_ins;
2873 struct pending_extent_op *extent_op;
2874 struct btrfs_fs_info *info = extent_root->fs_info;
2875 struct list_head delete_list;
2876
2877 INIT_LIST_HEAD(&delete_list);
2878 extent_ins = &extent_root->fs_info->extent_ins;
2879 pending_del = &extent_root->fs_info->pending_del;
2880
2881again:
2882 mutex_lock(&info->extent_ins_mutex);
2883 while (1) {
2884 ret = find_first_extent_bit(pending_del, search, &start, &end,
2885 EXTENT_WRITEBACK);
2886 if (ret) {
2887 if (all && skipped && !nr) {
2888 search = 0;
2889 skipped = 0;
2890 continue;
2891 }
2892 mutex_unlock(&info->extent_ins_mutex);
2893 break;
2894 }
2895
2896 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2897 if (!ret) {
2898 search = end+1;
2899 skipped = 1;
2900
2901 if (need_resched()) {
2902 mutex_unlock(&info->extent_ins_mutex);
2903 cond_resched();
2904 mutex_lock(&info->extent_ins_mutex);
2905 }
2906
2907 continue;
2908 }
2909 BUG_ON(ret < 0);
2910
2911 ret = get_state_private(pending_del, start, &priv);
2912 BUG_ON(ret);
2913 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2914
2915 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2916 GFP_NOFS);
2917 if (!test_range_bit(extent_ins, start, end,
2918 EXTENT_WRITEBACK, 0)) {
2919 list_add_tail(&extent_op->list, &delete_list);
2920 nr++;
2921 } else {
2922 kfree(extent_op);
2923
2924 ret = get_state_private(&info->extent_ins, start,
2925 &priv);
2926 BUG_ON(ret);
2927 extent_op = (struct pending_extent_op *)
2928 (unsigned long)priv;
2929
2930 clear_extent_bits(&info->extent_ins, start, end,
2931 EXTENT_WRITEBACK, GFP_NOFS);
2932
2933 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2934 list_add_tail(&extent_op->list, &delete_list);
2935 search = end + 1;
2936 nr++;
2937 continue;
2938 }
2939
2940 mutex_lock(&extent_root->fs_info->pinned_mutex);
2941 ret = pin_down_bytes(trans, extent_root, start,
2942 end + 1 - start, 0);
2943 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2944
2945 ret = update_block_group(trans, extent_root, start,
2946 end + 1 - start, 0, ret > 0);
2947
2948 unlock_extent(extent_ins, start, end, GFP_NOFS);
2949 BUG_ON(ret);
2950 kfree(extent_op);
2951 }
2952 if (ret)
2953 err = ret;
2954
2955 search = end + 1;
2956
2957 if (need_resched()) {
2958 mutex_unlock(&info->extent_ins_mutex);
2959 cond_resched();
2960 mutex_lock(&info->extent_ins_mutex);
2961 }
2962 }
2963 2434
2964 if (nr) { 2435 /*
2965 ret = free_extents(trans, extent_root, &delete_list); 2436 * if metadata always pin
2966 BUG_ON(ret); 2437 * if data pin when any transaction has committed this
2967 } 2438 */
2439 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2440 ref_generation != trans->transid)
2441 pin = 1;
2968 2442
2969 if (all && skipped) { 2443 if (ref_generation != trans->transid)
2970 INIT_LIST_HEAD(&delete_list); 2444 pin = 1;
2971 search = 0;
2972 nr = 0;
2973 goto again;
2974 }
2975 2445
2976 if (!err) 2446 return __free_extent(trans, root, bytenr, num_bytes, parent,
2977 finish_current_insert(trans, extent_root, 0); 2447 root_objectid, ref_generation,
2978 return err; 2448 owner_objectid, pin, pin == 0, refs_to_drop);
2979} 2449}
2980 2450
2981/* 2451/*
2982 * remove an extent from the root, returns 0 on success 2452 * when we free an extent, it is possible (and likely) that we free the last
2453 * delayed ref for that extent as well. This searches the delayed ref tree for
2454 * a given extent, and if there are no other delayed refs to be processed, it
2455 * removes it from the tree.
2983 */ 2456 */
2984static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2457static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2985 struct btrfs_root *root, 2458 struct btrfs_root *root, u64 bytenr)
2986 u64 bytenr, u64 num_bytes, u64 parent,
2987 u64 root_objectid, u64 ref_generation,
2988 u64 owner_objectid, int pin)
2989{ 2459{
2990 struct btrfs_root *extent_root = root->fs_info->extent_root; 2460 struct btrfs_delayed_ref_head *head;
2991 int pending_ret; 2461 struct btrfs_delayed_ref_root *delayed_refs;
2462 struct btrfs_delayed_ref_node *ref;
2463 struct rb_node *node;
2992 int ret; 2464 int ret;
2993 2465
2994 WARN_ON(num_bytes < root->sectorsize); 2466 delayed_refs = &trans->transaction->delayed_refs;
2995 if (root == extent_root) { 2467 spin_lock(&delayed_refs->lock);
2996 struct pending_extent_op *extent_op = NULL; 2468 head = btrfs_find_delayed_ref_head(trans, bytenr);
2997 2469 if (!head)
2998 mutex_lock(&root->fs_info->extent_ins_mutex); 2470 goto out;
2999 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
3000 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
3001 u64 priv;
3002 ret = get_state_private(&root->fs_info->extent_ins,
3003 bytenr, &priv);
3004 BUG_ON(ret);
3005 extent_op = (struct pending_extent_op *)
3006 (unsigned long)priv;
3007 2471
3008 extent_op->del = 1; 2472 node = rb_prev(&head->node.rb_node);
3009 if (extent_op->type == PENDING_EXTENT_INSERT) { 2473 if (!node)
3010 mutex_unlock(&root->fs_info->extent_ins_mutex); 2474 goto out;
3011 return 0;
3012 }
3013 }
3014 2475
3015 if (extent_op) { 2476 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3016 ref_generation = extent_op->orig_generation;
3017 parent = extent_op->orig_parent;
3018 }
3019 2477
3020 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2478 /* there are still entries for this ref, we can't drop it */
3021 BUG_ON(!extent_op); 2479 if (ref->bytenr == bytenr)
3022 2480 goto out;
3023 extent_op->type = PENDING_EXTENT_DELETE;
3024 extent_op->bytenr = bytenr;
3025 extent_op->num_bytes = num_bytes;
3026 extent_op->parent = parent;
3027 extent_op->orig_parent = parent;
3028 extent_op->generation = ref_generation;
3029 extent_op->orig_generation = ref_generation;
3030 extent_op->level = (int)owner_objectid;
3031 INIT_LIST_HEAD(&extent_op->list);
3032 extent_op->del = 0;
3033
3034 set_extent_bits(&root->fs_info->pending_del,
3035 bytenr, bytenr + num_bytes - 1,
3036 EXTENT_WRITEBACK, GFP_NOFS);
3037 set_state_private(&root->fs_info->pending_del,
3038 bytenr, (unsigned long)extent_op);
3039 mutex_unlock(&root->fs_info->extent_ins_mutex);
3040 return 0;
3041 }
3042 /* if metadata always pin */
3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3045 mutex_lock(&root->fs_info->pinned_mutex);
3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3047 mutex_unlock(&root->fs_info->pinned_mutex);
3048 update_reserved_extents(root, bytenr, num_bytes, 0);
3049 return 0;
3050 }
3051 pin = 1;
3052 }
3053 2481
3054 /* if data pin when any transaction has committed this */ 2482 /*
3055 if (ref_generation != trans->transid) 2483 * waiting for the lock here would deadlock. If someone else has it
3056 pin = 1; 2484 * locked they are already in the process of dropping it anyway
2485 */
2486 if (!mutex_trylock(&head->mutex))
2487 goto out;
3057 2488
3058 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2489 /*
3059 root_objectid, ref_generation, 2490 * at this point we have a head with no other entries. Go
3060 owner_objectid, pin, pin == 0); 2491 * ahead and process it.
2492 */
2493 head->node.in_tree = 0;
2494 rb_erase(&head->node.rb_node, &delayed_refs->root);
2495
2496 delayed_refs->num_entries--;
2497
2498 /*
2499 * we don't take a ref on the node because we're removing it from the
2500 * tree, so we just steal the ref the tree was holding.
2501 */
2502 delayed_refs->num_heads--;
2503 if (list_empty(&head->cluster))
2504 delayed_refs->num_heads_ready--;
2505
2506 list_del_init(&head->cluster);
2507 spin_unlock(&delayed_refs->lock);
3061 2508
3062 finish_current_insert(trans, root->fs_info->extent_root, 0); 2509 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
3063 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2510 &head->node, head->must_insert_reserved);
3064 return ret ? ret : pending_ret; 2511 BUG_ON(ret);
2512 btrfs_put_delayed_ref(&head->node);
2513 return 0;
2514out:
2515 spin_unlock(&delayed_refs->lock);
2516 return 0;
3065} 2517}
3066 2518
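The mutex_trylock() in check_ref_cleanup() is the load-bearing detail: blocking on head->mutex here could deadlock, and a contended lock already means another path is dropping the head. A user-space sketch of the same bail-out using pthread_mutex_trylock(), which returns EBUSY on a default mutex that is already held:

    /* build with: cc -pthread sketch.c */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* if someone else holds the lock they are already tearing this
     * entry down, so waiting would only risk deadlock */
    static int try_cleanup(void)
    {
            if (pthread_mutex_trylock(&head_mutex) != 0) {
                    printf("busy: another path owns it, skip\n");
                    return 0;
            }
            printf("locked: safe to tear the entry down\n");
            pthread_mutex_unlock(&head_mutex);
            return 1;
    }

    int main(void)
    {
            try_cleanup();                     /* uncontended: proceeds */
            pthread_mutex_lock(&head_mutex);
            try_cleanup();                     /* contended: bails out */
            pthread_mutex_unlock(&head_mutex);
            return 0;
    }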
3067int btrfs_free_extent(struct btrfs_trans_handle *trans, 2519int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2524,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3072{ 2524{
3073 int ret; 2525 int ret;
3074 2526
3075 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2527 /*
3076 root_objectid, ref_generation, 2528 * tree log blocks never actually go into the extent allocation
3077 owner_objectid, pin); 2529 * tree, just update pinning info and exit early.
2530 *
2531 * data extents referenced by the tree log do need to have
2532 * their reference counts bumped.
2533 */
2534 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2535 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2536 /* unlocks the pinned mutex */
2537 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2538 update_reserved_extents(root, bytenr, num_bytes, 0);
2539 ret = 0;
2540 } else {
2541 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2542 root_objectid, ref_generation,
2543 owner_objectid,
2544 BTRFS_DROP_DELAYED_REF, 1);
2545 BUG_ON(ret);
2546 ret = check_ref_cleanup(trans, root, bytenr);
2547 BUG_ON(ret);
2548 }
3078 return ret; 2549 return ret;
3079} 2550}
3080 2551
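btrfs_free_extent() now routes frees one of two ways: tree-log metadata is pinned immediately, and everything else becomes a delayed DROP ref followed by check_ref_cleanup(). A compressed sketch of that decision; the constants mirror BTRFS_TREE_LOG_OBJECTID and BTRFS_FIRST_FREE_OBJECTID but are hard-coded here only for illustration.

    #include <stdio.h>

    #define TREE_LOG_OBJECTID ((unsigned long long)-6)   /* illustrative */
    #define FIRST_FREE_OBJECTID 256ULL                   /* illustrative */

    static const char *free_path(unsigned long long root_objectid,
                                 unsigned long long owner_objectid)
    {
            if (root_objectid == TREE_LOG_OBJECTID &&
                owner_objectid < FIRST_FREE_OBJECTID)
                    return "pin now, skip the extent tree";
            return "queue a DROP delayed ref, then check_ref_cleanup()";
    }

    int main(void)
    {
            printf("log metadata: %s\n", free_path(TREE_LOG_OBJECTID, 1));
            printf("regular file data: %s\n", free_path(5, 300));
            return 0;
    }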
@@ -3103,228 +2574,262 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3103{ 2574{
3104 int ret = 0; 2575 int ret = 0;
3105 struct btrfs_root *root = orig_root->fs_info->extent_root; 2576 struct btrfs_root *root = orig_root->fs_info->extent_root;
3106 u64 total_needed = num_bytes; 2577 struct btrfs_free_cluster *last_ptr = NULL;
3107 u64 *last_ptr = NULL;
3108 u64 last_wanted = 0;
3109 struct btrfs_block_group_cache *block_group = NULL; 2578 struct btrfs_block_group_cache *block_group = NULL;
3110 int chunk_alloc_done = 0;
3111 int empty_cluster = 2 * 1024 * 1024; 2579 int empty_cluster = 2 * 1024 * 1024;
3112 int allowed_chunk_alloc = 0; 2580 int allowed_chunk_alloc = 0;
3113 struct list_head *head = NULL, *cur = NULL;
3114 int loop = 0;
3115 int extra_loop = 0;
3116 struct btrfs_space_info *space_info; 2581 struct btrfs_space_info *space_info;
2582 int last_ptr_loop = 0;
2583 int loop = 0;
3117 2584
3118 WARN_ON(num_bytes < root->sectorsize); 2585 WARN_ON(num_bytes < root->sectorsize);
3119 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2586 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3120 ins->objectid = 0; 2587 ins->objectid = 0;
3121 ins->offset = 0; 2588 ins->offset = 0;
3122 2589
2590 space_info = __find_space_info(root->fs_info, data);
2591
3123 if (orig_root->ref_cows || empty_size) 2592 if (orig_root->ref_cows || empty_size)
3124 allowed_chunk_alloc = 1; 2593 allowed_chunk_alloc = 1;
3125 2594
3126 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2595 if (data & BTRFS_BLOCK_GROUP_METADATA) {
3127 last_ptr = &root->fs_info->last_alloc; 2596 last_ptr = &root->fs_info->meta_alloc_cluster;
3128 if (!btrfs_test_opt(root, SSD)) 2597 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024; 2598 empty_cluster = 64 * 1024;
3130 } 2599 }
3131 2600
3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2601 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3133 last_ptr = &root->fs_info->last_data_alloc; 2602 last_ptr = &root->fs_info->data_alloc_cluster;
2603 }
3134 2604
3135 if (last_ptr) { 2605 if (last_ptr) {
3136 if (*last_ptr) { 2606 spin_lock(&last_ptr->lock);
3137 hint_byte = *last_ptr; 2607 if (last_ptr->block_group)
3138 last_wanted = *last_ptr; 2608 hint_byte = last_ptr->window_start;
3139 } else 2609 spin_unlock(&last_ptr->lock);
3140 empty_size += empty_cluster;
3141 } else {
3142 empty_cluster = 0;
3143 } 2610 }
2611
3144 search_start = max(search_start, first_logical_byte(root, 0)); 2612 search_start = max(search_start, first_logical_byte(root, 0));
3145 search_start = max(search_start, hint_byte); 2613 search_start = max(search_start, hint_byte);
3146 2614
3147 if (last_wanted && search_start != last_wanted) { 2615 if (!last_ptr) {
3148 last_wanted = 0; 2616 empty_cluster = 0;
3149 empty_size += empty_cluster; 2617 loop = 1;
3150 } 2618 }
3151 2619
3152 total_needed += empty_size; 2620 if (search_start == hint_byte) {
3153 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2621 block_group = btrfs_lookup_block_group(root->fs_info,
3154 if (!block_group) 2622 search_start);
3155 block_group = btrfs_lookup_first_block_group(root->fs_info, 2623 if (block_group && block_group_bits(block_group, data)) {
3156 search_start); 2624 down_read(&space_info->groups_sem);
3157 space_info = __find_space_info(root->fs_info, data); 2625 if (list_empty(&block_group->list) ||
2626 block_group->ro) {
2627 /*
2628 * someone is removing this block group,
2629 * we can't jump into the have_block_group
2630 * target because our list pointers are not
2631 * valid
2632 */
2633 btrfs_put_block_group(block_group);
2634 up_read(&space_info->groups_sem);
2635 } else
2636 goto have_block_group;
2637 } else if (block_group) {
2638 btrfs_put_block_group(block_group);
2639 }
2640 }
3158 2641
2642search:
3159 down_read(&space_info->groups_sem); 2643 down_read(&space_info->groups_sem);
3160 while (1) { 2644 list_for_each_entry(block_group, &space_info->block_groups, list) {
3161 struct btrfs_free_space *free_space; 2645 u64 offset;
3162 /*
3163	 * the only way this happens is if our hint points to a block
3164	 * group that's not of the proper type, while looping this
3165 * should never happen
3166 */
3167 if (empty_size)
3168 extra_loop = 1;
3169 2646
3170 if (!block_group) 2647 atomic_inc(&block_group->count);
3171 goto new_group_no_lock; 2648 search_start = block_group->key.objectid;
3172 2649
2650have_block_group:
3173 if (unlikely(!block_group->cached)) { 2651 if (unlikely(!block_group->cached)) {
3174 mutex_lock(&block_group->cache_mutex); 2652 mutex_lock(&block_group->cache_mutex);
3175 ret = cache_block_group(root, block_group); 2653 ret = cache_block_group(root, block_group);
3176 mutex_unlock(&block_group->cache_mutex); 2654 mutex_unlock(&block_group->cache_mutex);
3177 if (ret) 2655 if (ret) {
2656 btrfs_put_block_group(block_group);
3178 break; 2657 break;
2658 }
3179 } 2659 }
3180 2660
3181 mutex_lock(&block_group->alloc_mutex);
3182 if (unlikely(!block_group_bits(block_group, data)))
3183 goto new_group;
3184
3185 if (unlikely(block_group->ro)) 2661 if (unlikely(block_group->ro))
3186 goto new_group; 2662 goto loop;
3187 2663
3188 free_space = btrfs_find_free_space(block_group, search_start, 2664 if (last_ptr) {
3189 total_needed); 2665 /*
3190 if (free_space) { 2666 * the refill lock keeps out other
3191 u64 start = block_group->key.objectid; 2667 * people trying to start a new cluster
3192 u64 end = block_group->key.objectid + 2668 */
3193 block_group->key.offset; 2669 spin_lock(&last_ptr->refill_lock);
2670 if (last_ptr->block_group &&
2671 (last_ptr->block_group->ro ||
2672 !block_group_bits(last_ptr->block_group, data))) {
2673 offset = 0;
2674 goto refill_cluster;
2675 }
3194 2676
3195 search_start = stripe_align(root, free_space->offset); 2677 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2678 num_bytes, search_start);
2679 if (offset) {
2680 /* we have a block, we're done */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2683 }
3196 2684
3197 /* move on to the next group */ 2685 spin_lock(&last_ptr->lock);
3198 if (search_start + num_bytes >= search_end) 2686 /*
3199 goto new_group; 2687 * whoops, this cluster doesn't actually point to
2688 * this block group. Get a ref on the block
2689	 * group it does point to and try again
2690 */
2691 if (!last_ptr_loop && last_ptr->block_group &&
2692 last_ptr->block_group != block_group) {
3200 2693
3201 /* move on to the next group */ 2694 btrfs_put_block_group(block_group);
3202 if (search_start + num_bytes > end) 2695 block_group = last_ptr->block_group;
3203 goto new_group; 2696 atomic_inc(&block_group->count);
2697 spin_unlock(&last_ptr->lock);
2698 spin_unlock(&last_ptr->refill_lock);
3204 2699
3205 if (last_wanted && search_start != last_wanted) { 2700 last_ptr_loop = 1;
3206 total_needed += empty_cluster; 2701 search_start = block_group->key.objectid;
3207 empty_size += empty_cluster;
3208 last_wanted = 0;
3209 /* 2702 /*
3210 * if search_start is still in this block group 2703 * we know this block group is properly
3211 * then we just re-search this block group 2704 * in the list because
2705	 * btrfs_remove_block_group drops the
2706 * cluster before it removes the block
2707 * group from the list
3212 */ 2708 */
3213 if (search_start >= start && 2709 goto have_block_group;
3214 search_start < end) {
3215 mutex_unlock(&block_group->alloc_mutex);
3216 continue;
3217 }
3218
3219 /* else we go to the next block group */
3220 goto new_group;
3221 } 2710 }
2711 spin_unlock(&last_ptr->lock);
2712refill_cluster:
2713 /*
2714 * this cluster didn't work out, free it and
2715 * start over
2716 */
2717 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2718
2719 last_ptr_loop = 0;
3222 2720
3223 if (exclude_nr > 0 && 2721 /* allocate a cluster in this block group */
3224 (search_start + num_bytes > exclude_start && 2722 ret = btrfs_find_space_cluster(trans,
3225 search_start < exclude_start + exclude_nr)) { 2723 block_group, last_ptr,
3226 search_start = exclude_start + exclude_nr; 2724 offset, num_bytes,
2725 empty_cluster + empty_size);
2726 if (ret == 0) {
3227 /* 2727 /*
3228 * if search_start is still in this block group 2728 * now pull our allocation out of this
3229 * then we just re-search this block group 2729 * cluster
3230 */ 2730 */
3231 if (search_start >= start && 2731 offset = btrfs_alloc_from_cluster(block_group,
3232 search_start < end) { 2732 last_ptr, num_bytes,
3233 mutex_unlock(&block_group->alloc_mutex); 2733 search_start);
3234 last_wanted = 0; 2734 if (offset) {
3235 continue; 2735 /* we found one, proceed */
2736 spin_unlock(&last_ptr->refill_lock);
2737 goto checks;
3236 } 2738 }
3237
3238 /* else we go to the next block group */
3239 goto new_group;
3240 } 2739 }
2740 /*
2741 * at this point we either didn't find a cluster
2742 * or we weren't able to allocate a block from our
2743 * cluster. Free the cluster we've been trying
2744 * to use, and go to the next block group
2745 */
2746 if (loop < 2) {
2747 btrfs_return_cluster_to_free_space(NULL,
2748 last_ptr);
2749 spin_unlock(&last_ptr->refill_lock);
2750 goto loop;
2751 }
2752 spin_unlock(&last_ptr->refill_lock);
2753 }
3241 2754
3242 ins->objectid = search_start; 2755 offset = btrfs_find_space_for_alloc(block_group, search_start,
3243 ins->offset = num_bytes; 2756 num_bytes, empty_size);
2757 if (!offset)
2758 goto loop;
2759checks:
2760 search_start = stripe_align(root, offset);
3244 2761
3245 btrfs_remove_free_space_lock(block_group, search_start, 2762 /* move on to the next group */
3246 num_bytes); 2763 if (search_start + num_bytes >= search_end) {
3247 /* we are all good, lets return */ 2764 btrfs_add_free_space(block_group, offset, num_bytes);
3248 mutex_unlock(&block_group->alloc_mutex); 2765 goto loop;
3249 break;
3250 } 2766 }
3251new_group:
3252 mutex_unlock(&block_group->alloc_mutex);
3253 put_block_group(block_group);
3254 block_group = NULL;
3255new_group_no_lock:
3256 /* don't try to compare new allocations against the
3257 * last allocation any more
3258 */
3259 last_wanted = 0;
3260 2767
3261 /* 2768 /* move on to the next group */
3262 * Here's how this works. 2769 if (search_start + num_bytes >
3263 * loop == 0: we were searching a block group via a hint 2770 block_group->key.objectid + block_group->key.offset) {
3264 * and didn't find anything, so we start at 2771 btrfs_add_free_space(block_group, offset, num_bytes);
3265 * the head of the block groups and keep searching 2772 goto loop;
3266 * loop == 1: we're searching through all of the block groups 2773 }
3267 * if we hit the head again we have searched 2774
3268 * all of the block groups for this space and we 2775 if (exclude_nr > 0 &&
3269 * need to try and allocate, if we cant error out. 2776 (search_start + num_bytes > exclude_start &&
3270 * loop == 2: we allocated more space and are looping through 2777 search_start < exclude_start + exclude_nr)) {
3271 * all of the block groups again. 2778 search_start = exclude_start + exclude_nr;
3272 */ 2779
3273 if (loop == 0) { 2780 btrfs_add_free_space(block_group, offset, num_bytes);
3274 head = &space_info->block_groups; 2781 /*
3275 cur = head->next; 2782 * if search_start is still in this block group
3276 loop++; 2783 * then we just re-search this block group
3277 } else if (loop == 1 && cur == head) {
3278 int keep_going;
3279
3280 /* at this point we give up on the empty_size
3281 * allocations and just try to allocate the min
3282 * space.
3283 *
3284 * The extra_loop field was set if an empty_size
3285 * allocation was attempted above, and if this
3286	 * is true we need to try the loop again without
3287 * the additional empty_size.
3288 */ 2784 */
3289 total_needed -= empty_size; 2785 if (search_start >= block_group->key.objectid &&
3290 empty_size = 0; 2786 search_start < (block_group->key.objectid +
3291 keep_going = extra_loop; 2787 block_group->key.offset))
3292 loop++; 2788 goto have_block_group;
2789 goto loop;
2790 }
3293 2791
3294 if (allowed_chunk_alloc && !chunk_alloc_done) { 2792 ins->objectid = search_start;
3295 up_read(&space_info->groups_sem); 2793 ins->offset = num_bytes;
3296 ret = do_chunk_alloc(trans, root, num_bytes + 2794
3297 2 * 1024 * 1024, data, 1); 2795 if (offset < search_start)
3298 down_read(&space_info->groups_sem); 2796 btrfs_add_free_space(block_group, offset,
3299 if (ret < 0) 2797 search_start - offset);
3300 goto loop_check; 2798 BUG_ON(offset > search_start);
3301 head = &space_info->block_groups; 2799
3302 /* 2800 /* we are all good, lets return */
3303 * we've allocated a new chunk, keep 2801 break;
3304 * trying 2802loop:
3305 */ 2803 btrfs_put_block_group(block_group);
3306 keep_going = 1; 2804 }
3307 chunk_alloc_done = 1; 2805 up_read(&space_info->groups_sem);
3308 } else if (!allowed_chunk_alloc) { 2806
3309 space_info->force_alloc = 1; 2807 /* loop == 0, try to find a clustered alloc in every block group
3310 } 2808 * loop == 1, try again after forcing a chunk allocation
3311loop_check: 2809 * loop == 2, set empty_size and empty_cluster to 0 and try again
3312 if (keep_going) { 2810 */
3313 cur = head->next; 2811 if (!ins->objectid && loop < 3 &&
3314 extra_loop = 0; 2812 (empty_size || empty_cluster || allowed_chunk_alloc)) {
3315 } else { 2813 if (loop >= 2) {
3316 break; 2814 empty_size = 0;
3317 } 2815 empty_cluster = 0;
3318 } else if (cur == head) {
3319 break;
3320 } 2816 }
3321 2817
3322 block_group = list_entry(cur, struct btrfs_block_group_cache, 2818 if (allowed_chunk_alloc) {
3323 list); 2819 ret = do_chunk_alloc(trans, root, num_bytes +
3324 atomic_inc(&block_group->count); 2820 2 * 1024 * 1024, data, 1);
2821 allowed_chunk_alloc = 0;
2822 } else {
2823 space_info->force_alloc = 1;
2824 }
3325 2825
3326 search_start = block_group->key.objectid; 2826 if (loop < 3) {
3327 cur = cur->next; 2827 loop++;
2828 goto search;
2829 }
2830 ret = -ENOSPC;
2831 } else if (!ins->objectid) {
2832 ret = -ENOSPC;
3328 } 2833 }
3329 2834
3330 /* we found what we needed */ 2835 /* we found what we needed */
@@ -3332,21 +2837,10 @@ loop_check:
3332 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2837 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3333 trans->block_group = block_group->key.objectid; 2838 trans->block_group = block_group->key.objectid;
3334 2839
3335 if (last_ptr) 2840 btrfs_put_block_group(block_group);
3336 *last_ptr = ins->objectid + ins->offset;
3337 ret = 0; 2841 ret = 0;
3338 } else if (!ret) {
3339 printk(KERN_ERR "btrfs searching for %llu bytes, "
3340 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3341 (unsigned long long)total_needed,
3342 (unsigned long long)num_bytes,
3343 loop, allowed_chunk_alloc);
3344 ret = -ENOSPC;
3345 } 2842 }
3346 if (block_group)
3347 put_block_group(block_group);
3348 2843
3349 up_read(&space_info->groups_sem);
3350 return ret; 2844 return ret;
3351} 2845}
3352 2846
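The rewritten find_free_extent() replaces the old head/cur cursor bookkeeping with a simple escalation counter: pass 0 tries clustered allocation, pass 1 retries after forcing a chunk allocation, and pass 2 drops empty_size and empty_cluster before giving up with ENOSPC. The control flow in isolation; try_alloc() is a fake allocator used only to drive the loop.

    #include <stdio.h>

    #define ENOSPC 28

    /* fake allocator: only succeeds once the padding is gone, forcing
     * the loop through every escalation stage */
    static int try_alloc(unsigned long empty_size, unsigned long empty_cluster)
    {
            return empty_size == 0 && empty_cluster == 0;
    }

    static int find_free(unsigned long empty_size, unsigned long empty_cluster)
    {
            for (int loop = 0; loop < 3; loop++) {
                    if (loop == 1) {
                            /* second pass: this is where a chunk
                             * allocation would be forced */
                    }
                    if (loop == 2) {
                            /* final pass: drop the padding entirely */
                            empty_size = 0;
                            empty_cluster = 0;
                    }
                    if (try_alloc(empty_size, empty_cluster)) {
                            printf("allocated on pass %d\n", loop);
                            return 0;
                    }
            }
            return -ENOSPC;
    }

    int main(void)
    {
            return find_free(2 * 1024 * 1024, 64 * 1024) ? 1 : 0;
    }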
@@ -3359,9 +2853,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3359 info->bytes_pinned - info->bytes_reserved), 2853 info->bytes_pinned - info->bytes_reserved),
3360 (info->full) ? "" : "not "); 2854 (info->full) ? "" : "not ");
3361 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 2855 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 " may_use=%llu, used=%llu\n", info->total_bytes, 2856 " may_use=%llu, used=%llu\n",
3363 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, 2857 (unsigned long long)info->total_bytes,
3364 info->bytes_used); 2858 (unsigned long long)info->bytes_pinned,
2859 (unsigned long long)info->bytes_delalloc,
2860 (unsigned long long)info->bytes_may_use,
2861 (unsigned long long)info->bytes_used);
3365 2862
3366 down_read(&info->groups_sem); 2863 down_read(&info->groups_sem);
3367 list_for_each_entry(cache, &info->block_groups, list) { 2864 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3451,7 +2948,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3451 ret = btrfs_discard_extent(root, start, len); 2948 ret = btrfs_discard_extent(root, start, len);
3452 2949
3453 btrfs_add_free_space(cache, start, len); 2950 btrfs_add_free_space(cache, start, len);
3454 put_block_group(cache); 2951 btrfs_put_block_group(cache);
3455 update_reserved_extents(root, start, len, 0); 2952 update_reserved_extents(root, start, len, 0);
3456 2953
3457 return ret; 2954 return ret;
@@ -3475,10 +2972,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3475static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2972static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3476 struct btrfs_root *root, u64 parent, 2973 struct btrfs_root *root, u64 parent,
3477 u64 root_objectid, u64 ref_generation, 2974 u64 root_objectid, u64 ref_generation,
3478 u64 owner, struct btrfs_key *ins) 2975 u64 owner, struct btrfs_key *ins,
2976 int ref_mod)
3479{ 2977{
3480 int ret; 2978 int ret;
3481 int pending_ret;
3482 u64 super_used; 2979 u64 super_used;
3483 u64 root_used; 2980 u64 root_used;
3484 u64 num_bytes = ins->offset; 2981 u64 num_bytes = ins->offset;
@@ -3503,33 +3000,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3503 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 3000 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3504 spin_unlock(&info->delalloc_lock); 3001 spin_unlock(&info->delalloc_lock);
3505 3002
3506 if (root == extent_root) {
3507 struct pending_extent_op *extent_op;
3508
3509 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3510 BUG_ON(!extent_op);
3511
3512 extent_op->type = PENDING_EXTENT_INSERT;
3513 extent_op->bytenr = ins->objectid;
3514 extent_op->num_bytes = ins->offset;
3515 extent_op->parent = parent;
3516 extent_op->orig_parent = 0;
3517 extent_op->generation = ref_generation;
3518 extent_op->orig_generation = 0;
3519 extent_op->level = (int)owner;
3520 INIT_LIST_HEAD(&extent_op->list);
3521 extent_op->del = 0;
3522
3523 mutex_lock(&root->fs_info->extent_ins_mutex);
3524 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3525 ins->objectid + ins->offset - 1,
3526 EXTENT_WRITEBACK, GFP_NOFS);
3527 set_state_private(&root->fs_info->extent_ins,
3528 ins->objectid, (unsigned long)extent_op);
3529 mutex_unlock(&root->fs_info->extent_ins_mutex);
3530 goto update_block;
3531 }
3532
3533 memcpy(&keys[0], ins, sizeof(*ins)); 3003 memcpy(&keys[0], ins, sizeof(*ins));
3534 keys[1].objectid = ins->objectid; 3004 keys[1].objectid = ins->objectid;
3535 keys[1].type = BTRFS_EXTENT_REF_KEY; 3005 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +3010,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3540 path = btrfs_alloc_path(); 3010 path = btrfs_alloc_path();
3541 BUG_ON(!path); 3011 BUG_ON(!path);
3542 3012
3013 path->leave_spinning = 1;
3543 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 3014 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3544 sizes, 2); 3015 sizes, 2);
3545 BUG_ON(ret); 3016 BUG_ON(ret);
3546 3017
3547 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3018 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3548 struct btrfs_extent_item); 3019 struct btrfs_extent_item);
3549 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 3020 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3550 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 3021 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3551 struct btrfs_extent_ref); 3022 struct btrfs_extent_ref);
3552 3023
3553 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 3024 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3554 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 3025 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3555 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 3026 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3556 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 3027 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3557 3028
3558 btrfs_mark_buffer_dirty(path->nodes[0]); 3029 btrfs_mark_buffer_dirty(path->nodes[0]);
3559 3030
3560 trans->alloc_exclude_start = 0; 3031 trans->alloc_exclude_start = 0;
3561 trans->alloc_exclude_nr = 0; 3032 trans->alloc_exclude_nr = 0;
3562 btrfs_free_path(path); 3033 btrfs_free_path(path);
3563 finish_current_insert(trans, extent_root, 0);
3564 pending_ret = del_pending_extents(trans, extent_root, 0);
3565 3034
3566 if (ret) 3035 if (ret)
3567 goto out; 3036 goto out;
3568 if (pending_ret) {
3569 ret = pending_ret;
3570 goto out;
3571 }
3572 3037
3573update_block:
3574 ret = update_block_group(trans, root, ins->objectid, 3038 ret = update_block_group(trans, root, ins->objectid,
3575 ins->offset, 1, 0); 3039 ins->offset, 1, 0);
3576 if (ret) { 3040 if (ret) {
@@ -3592,9 +3056,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3592 3056
3593 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 3057 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3594 return 0; 3058 return 0;
3595 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3596 					    ref_generation, owner, ins);
3597 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
3059
3060 	ret = btrfs_add_delayed_ref(trans, ins->objectid,
3061 				    ins->offset, parent, root_objectid,
3062 				    ref_generation, owner,
3063 				    BTRFS_ADD_DELAYED_EXTENT, 0);
3064 	BUG_ON(ret);
3598 return ret; 3065 return ret;
3599} 3066}
3600 3067
@@ -3619,9 +3086,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3619 ret = btrfs_remove_free_space(block_group, ins->objectid, 3086 ret = btrfs_remove_free_space(block_group, ins->objectid,
3620 ins->offset); 3087 ins->offset);
3621 BUG_ON(ret); 3088 BUG_ON(ret);
3622 put_block_group(block_group); 3089 btrfs_put_block_group(block_group);
3623 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3090 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3624 ref_generation, owner, ins); 3091 ref_generation, owner, ins, 1);
3625 return ret; 3092 return ret;
3626} 3093}
3627 3094
@@ -3640,20 +3107,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3640 u64 search_end, struct btrfs_key *ins, u64 data) 3107 u64 search_end, struct btrfs_key *ins, u64 data)
3641{ 3108{
3642 int ret; 3109 int ret;
3643
3644 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3110 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3645 min_alloc_size, empty_size, hint_byte, 3111 min_alloc_size, empty_size, hint_byte,
3646 search_end, ins, data); 3112 search_end, ins, data);
3647 BUG_ON(ret); 3113 BUG_ON(ret);
3648 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3114 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3649 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3650 					root_objectid, ref_generation,
3651 					owner_objectid, ins);
3652 		BUG_ON(ret);
3653
3654 	} else {
3655 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
3656 	}
3115 		ret = btrfs_add_delayed_ref(trans, ins->objectid,
3116 					    ins->offset, parent, root_objectid,
3117 					    ref_generation, owner_objectid,
3118 					    BTRFS_ADD_DELAYED_EXTENT, 0);
3119 		BUG_ON(ret);
3120 	}
3121 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3657 return ret; 3122 return ret;
3658} 3123}
3659 3124
@@ -3789,7 +3254,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3789 3254
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3255 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791 3256
3792 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3257 ret = btrfs_free_extent(trans, root, disk_bytenr,
3793 btrfs_file_extent_disk_num_bytes(leaf, fi), 3258 btrfs_file_extent_disk_num_bytes(leaf, fi),
3794 leaf->start, leaf_owner, leaf_generation, 3259 leaf->start, leaf_owner, leaf_generation,
3795 key.objectid, 0); 3260 key.objectid, 0);
@@ -3829,7 +3294,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3829 */ 3294 */
3830 for (i = 0; i < ref->nritems; i++) { 3295 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot; 3296 info = ref->extents + sorted[i].slot;
3832 ret = __btrfs_free_extent(trans, root, info->bytenr, 3297 ret = btrfs_free_extent(trans, root, info->bytenr,
3833 info->num_bytes, ref->bytenr, 3298 info->num_bytes, ref->bytenr,
3834 ref->owner, ref->generation, 3299 ref->owner, ref->generation,
3835 info->objectid, 0); 3300 info->objectid, 0);
@@ -3846,12 +3311,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3846 return 0; 3311 return 0;
3847} 3312}
3848 3313
3849static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3314static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root, u64 start,
3850 u64 len, u32 *refs) 3316 u64 len, u32 *refs)
3851{ 3317{
3852 int ret; 3318 int ret;
3853 3319
3854 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3320 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3855 BUG_ON(ret); 3321 BUG_ON(ret);
3856 3322
3857#if 0 /* some debugging code in case we see problems here */ 3323#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3425,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3959 * we just decrement it below and don't update any 3425 * we just decrement it below and don't update any
3960 * of the refs the leaf points to. 3426 * of the refs the leaf points to.
3961 */ 3427 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3428 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3429 blocksize, &refs);
3963 BUG_ON(ret); 3430 BUG_ON(ret);
3964 if (refs != 1) 3431 if (refs != 1)
3965 continue; 3432 continue;
@@ -4010,7 +3477,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4010 */ 3477 */
4011 for (i = 0; i < refi; i++) { 3478 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr; 3479 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr, 3480 ret = btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start, 3481 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1); 3482 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret); 3483 BUG_ON(ret);
@@ -4053,7 +3520,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4053 3520
4054 WARN_ON(*level < 0); 3521 WARN_ON(*level < 0);
4055 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3522 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4056 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3523 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4057 path->nodes[*level]->len, &refs); 3524 path->nodes[*level]->len, &refs);
4058 BUG_ON(ret); 3525 BUG_ON(ret);
4059 if (refs > 1) 3526 if (refs > 1)
@@ -4104,7 +3571,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3571 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4105 blocksize = btrfs_level_size(root, *level - 1); 3572 blocksize = btrfs_level_size(root, *level - 1);
4106 3573
4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3574 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3575 blocksize, &refs);
4108 BUG_ON(ret); 3576 BUG_ON(ret);
4109 3577
4110 /* 3578 /*
@@ -4119,7 +3587,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4119 root_gen = btrfs_header_generation(parent); 3587 root_gen = btrfs_header_generation(parent);
4120 path->slots[*level]++; 3588 path->slots[*level]++;
4121 3589
4122 ret = __btrfs_free_extent(trans, root, bytenr, 3590 ret = btrfs_free_extent(trans, root, bytenr,
4123 blocksize, parent->start, 3591 blocksize, parent->start,
4124 root_owner, root_gen, 3592 root_owner, root_gen,
4125 *level - 1, 1); 3593 *level - 1, 1);
@@ -4165,7 +3633,7 @@ out:
4165 * cleanup and free the reference on the last node 3633 * cleanup and free the reference on the last node
4166 * we processed 3634 * we processed
4167 */ 3635 */
4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3636 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4169 parent->start, root_owner, root_gen, 3637 parent->start, root_owner, root_gen,
4170 *level, 1); 3638 *level, 1);
4171 free_extent_buffer(path->nodes[*level]); 3639 free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3822,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4354 struct btrfs_path *path; 3822 struct btrfs_path *path;
4355 int i; 3823 int i;
4356 int orig_level; 3824 int orig_level;
3825 int update_count;
4357 struct btrfs_root_item *root_item = &root->root_item; 3826 struct btrfs_root_item *root_item = &root->root_item;
4358 3827
4359 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3828 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3864,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4395 } 3864 }
4396 } 3865 }
4397 while (1) { 3866 while (1) {
3867 unsigned long update;
4398 wret = walk_down_tree(trans, root, path, &level); 3868 wret = walk_down_tree(trans, root, path, &level);
4399 if (wret > 0) 3869 if (wret > 0)
4400 break; 3870 break;
@@ -4407,12 +3877,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4407 break; 3877 break;
4408 if (wret < 0) 3878 if (wret < 0)
4409 ret = wret; 3879 ret = wret;
4410 if (trans->transaction->in_commit) { 3880 if (trans->transaction->in_commit ||
3881 trans->transaction->delayed_refs.flushing) {
4411 ret = -EAGAIN; 3882 ret = -EAGAIN;
4412 break; 3883 break;
4413 } 3884 }
4414 atomic_inc(&root->fs_info->throttle_gen); 3885 atomic_inc(&root->fs_info->throttle_gen);
4415 wake_up(&root->fs_info->transaction_throttle); 3886 wake_up(&root->fs_info->transaction_throttle);
3887 for (update_count = 0; update_count < 16; update_count++) {
3888 update = trans->delayed_ref_updates;
3889 trans->delayed_ref_updates = 0;
3890 if (update)
3891 btrfs_run_delayed_refs(trans, root, update);
3892 else
3893 break;
3894 }
4416 } 3895 }
4417 for (i = 0; i <= orig_level; i++) { 3896 for (i = 0; i <= orig_level; i++) {
4418 if (path->nodes[i]) { 3897 if (path->nodes[i]) {
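The loop added to btrfs_drop_snapshot() drains the delayed-ref backlog the walk itself generates, but caps the work at 16 rounds per iteration so the dropper is throttled without being starved. A userspace model, where run_pending() stands in for btrfs_run_delayed_refs():

#include <stdio.h>

struct queue {
	unsigned long pending;
};

static void run_pending(struct queue *q, unsigned long n)
{
	(void)q;
	printf("ran %lu updates\n", n);
}

static void drain_some(struct queue *q)
{
	int round;

	for (round = 0; round < 16; round++) {
		unsigned long n = q->pending;

		if (!n)
			break;
		q->pending = 0;		/* new work may be queued meanwhile */
		run_pending(q, n);
	}
}

int main(void)
{
	struct queue q = { 3 };

	drain_some(&q);
	return 0;
}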
@@ -5457,6 +4936,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5457 root->root_key.objectid, 4936 root->root_key.objectid,
5458 trans->transid, key.objectid); 4937 trans->transid, key.objectid);
5459 BUG_ON(ret); 4938 BUG_ON(ret);
4939
5460 ret = btrfs_free_extent(trans, root, 4940 ret = btrfs_free_extent(trans, root,
5461 bytenr, num_bytes, leaf->start, 4941 bytenr, num_bytes, leaf->start,
5462 btrfs_header_owner(leaf), 4942 btrfs_header_owner(leaf),
@@ -5768,9 +5248,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5768 ref_path, NULL, NULL); 5248 ref_path, NULL, NULL);
5769 BUG_ON(ret); 5249 BUG_ON(ret);
5770 5250
5771 if (root == root->fs_info->extent_root)
5772 btrfs_extent_post_op(trans, root);
5773
5774 return 0; 5251 return 0;
5775} 5252}
5776 5253
@@ -6038,6 +5515,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6038 if (!path) 5515 if (!path)
6039 return -ENOMEM; 5516 return -ENOMEM;
6040 5517
5518 path->leave_spinning = 1;
6041 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5519 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6042 if (ret) 5520 if (ret)
6043 goto out; 5521 goto out;
@@ -6208,6 +5686,9 @@ again:
6208 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5686 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6209 mutex_unlock(&root->fs_info->cleaner_mutex); 5687 mutex_unlock(&root->fs_info->cleaner_mutex);
6210 5688
5689 trans = btrfs_start_transaction(info->tree_root, 1);
5690 btrfs_commit_transaction(trans, info->tree_root);
5691
6211 while (1) { 5692 while (1) {
6212 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5693 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6213 if (ret < 0) 5694 if (ret < 0)
@@ -6294,7 +5775,7 @@ next:
6294 WARN_ON(block_group->reserved > 0); 5775 WARN_ON(block_group->reserved > 0);
6295 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5776 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6296 spin_unlock(&block_group->lock); 5777 spin_unlock(&block_group->lock);
6297 put_block_group(block_group); 5778 btrfs_put_block_group(block_group);
6298 ret = 0; 5779 ret = 0;
6299out: 5780out:
6300 btrfs_free_path(path); 5781 btrfs_free_path(path);
@@ -6421,9 +5902,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6421 5902
6422 atomic_set(&cache->count, 1); 5903 atomic_set(&cache->count, 1);
6423 spin_lock_init(&cache->lock); 5904 spin_lock_init(&cache->lock);
6424 mutex_init(&cache->alloc_mutex); 5905 spin_lock_init(&cache->tree_lock);
6425 mutex_init(&cache->cache_mutex); 5906 mutex_init(&cache->cache_mutex);
6426 INIT_LIST_HEAD(&cache->list); 5907 INIT_LIST_HEAD(&cache->list);
5908 INIT_LIST_HEAD(&cache->cluster_list);
6427 read_extent_buffer(leaf, &cache->item, 5909 read_extent_buffer(leaf, &cache->item,
6428 btrfs_item_ptr_offset(leaf, path->slots[0]), 5910 btrfs_item_ptr_offset(leaf, path->slots[0]),
6429 sizeof(cache->item)); 5911 sizeof(cache->item));
@@ -6466,7 +5948,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6466 5948
6467 extent_root = root->fs_info->extent_root; 5949 extent_root = root->fs_info->extent_root;
6468 5950
6469 root->fs_info->last_trans_new_blockgroup = trans->transid; 5951 root->fs_info->last_trans_log_full_commit = trans->transid;
6470 5952
6471 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5953 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6472 if (!cache) 5954 if (!cache)
@@ -6477,9 +5959,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6477 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5959 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6478 atomic_set(&cache->count, 1); 5960 atomic_set(&cache->count, 1);
6479 spin_lock_init(&cache->lock); 5961 spin_lock_init(&cache->lock);
6480 mutex_init(&cache->alloc_mutex); 5962 spin_lock_init(&cache->tree_lock);
6481 mutex_init(&cache->cache_mutex); 5963 mutex_init(&cache->cache_mutex);
6482 INIT_LIST_HEAD(&cache->list); 5964 INIT_LIST_HEAD(&cache->list);
5965 INIT_LIST_HEAD(&cache->cluster_list);
6483 5966
6484 btrfs_set_block_group_used(&cache->item, bytes_used); 5967 btrfs_set_block_group_used(&cache->item, bytes_used);
6485 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5968 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6500,9 +5983,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6500 sizeof(cache->item)); 5983 sizeof(cache->item));
6501 BUG_ON(ret); 5984 BUG_ON(ret);
6502 5985
6503 finish_current_insert(trans, extent_root, 0);
6504 ret = del_pending_extents(trans, extent_root, 0);
6505 BUG_ON(ret);
6506 set_avail_alloc_bits(extent_root->fs_info, type); 5986 set_avail_alloc_bits(extent_root->fs_info, type);
6507 5987
6508 return 0; 5988 return 0;
@@ -6513,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6513{ 5993{
6514 struct btrfs_path *path; 5994 struct btrfs_path *path;
6515 struct btrfs_block_group_cache *block_group; 5995 struct btrfs_block_group_cache *block_group;
5996 struct btrfs_free_cluster *cluster;
6516 struct btrfs_key key; 5997 struct btrfs_key key;
6517 int ret; 5998 int ret;
6518 5999
@@ -6524,6 +6005,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6524 6005
6525 memcpy(&key, &block_group->key, sizeof(key)); 6006 memcpy(&key, &block_group->key, sizeof(key));
6526 6007
6008 /* make sure this block group isn't part of an allocation cluster */
6009 cluster = &root->fs_info->data_alloc_cluster;
6010 spin_lock(&cluster->refill_lock);
6011 btrfs_return_cluster_to_free_space(block_group, cluster);
6012 spin_unlock(&cluster->refill_lock);
6013
6014 /*
6015 * make sure this block group isn't part of a metadata
6016 * allocation cluster
6017 */
6018 cluster = &root->fs_info->meta_alloc_cluster;
6019 spin_lock(&cluster->refill_lock);
6020 btrfs_return_cluster_to_free_space(block_group, cluster);
6021 spin_unlock(&cluster->refill_lock);
6022
6527 path = btrfs_alloc_path(); 6023 path = btrfs_alloc_path();
6528 BUG_ON(!path); 6024 BUG_ON(!path);
6529 6025
@@ -6533,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6533 spin_unlock(&root->fs_info->block_group_cache_lock); 6029 spin_unlock(&root->fs_info->block_group_cache_lock);
6534 btrfs_remove_free_space_cache(block_group); 6030 btrfs_remove_free_space_cache(block_group);
6535 down_write(&block_group->space_info->groups_sem); 6031 down_write(&block_group->space_info->groups_sem);
6536 list_del(&block_group->list); 6032 /*
6033 * we must use list_del_init so people can check to see if they
6034 * are still on the list after taking the semaphore
6035 */
6036 list_del_init(&block_group->list);
6537 up_write(&block_group->space_info->groups_sem); 6037 up_write(&block_group->space_info->groups_sem);
6538 6038
6539 spin_lock(&block_group->space_info->lock); 6039 spin_lock(&block_group->space_info->lock);
@@ -6542,8 +6042,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6542 spin_unlock(&block_group->space_info->lock); 6042 spin_unlock(&block_group->space_info->lock);
6543 block_group->space_info->full = 0; 6043 block_group->space_info->full = 0;
6544 6044
6545 put_block_group(block_group); 6045 btrfs_put_block_group(block_group);
6546 put_block_group(block_group); 6046 btrfs_put_block_group(block_group);
6547 6047
6548 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 6048 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6549 if (ret > 0) 6049 if (ret > 0)
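The switch to list_del_init() matters because, per the new comment, other code may take groups_sem and then test whether a block group is still linked; only the _init variant leaves the node pointing at itself so such an emptiness check is safe. A self-contained model of the difference (a minimal doubly linked list, not <linux/list.h>):

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_del_init_sketch(struct list_head *e)
{
	e->next->prev = e->prev;	/* unlink from neighbours */
	e->prev->next = e->next;
	e->next = e;			/* self-loop: "empty" from now on */
	e->prev = e;
}

static int on_a_list(const struct list_head *e)
{
	return e->next != e;
}

int main(void)
{
	struct list_head a, b;

	a.next = &b; a.prev = &b;	/* two-node ring */
	b.next = &a; b.prev = &a;
	list_del_init_sketch(&b);
	printf("b on list: %d\n", on_a_list(&b));	/* prints 0 */
	return 0;
}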
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..fe9eb990e443 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
17#include "ctree.h" 17#include "ctree.h"
18#include "btrfs_inode.h" 18#include "btrfs_inode.h"
19 19
20/* temporary define until extent_map moves out of btrfs */
21struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
22 unsigned long extra_flags,
23 void (*ctor)(void *, struct kmem_cache *,
24 unsigned long));
25
26static struct kmem_cache *extent_state_cache; 20static struct kmem_cache *extent_state_cache;
27static struct kmem_cache *extent_buffer_cache; 21static struct kmem_cache *extent_buffer_cache;
28 22
@@ -50,20 +44,23 @@ struct extent_page_data {
50 /* tells writepage not to lock the state bits for this range 44 /* tells writepage not to lock the state bits for this range
51 * it still does the unlocking 45 * it still does the unlocking
52 */ 46 */
53 int extent_locked; 47 unsigned int extent_locked:1;
48
49 /* tells the submit_bio code to use a WRITE_SYNC */
50 unsigned int sync_io:1;
54}; 51};
55 52
56int __init extent_io_init(void) 53int __init extent_io_init(void)
57{ 54{
58 extent_state_cache = btrfs_cache_create("extent_state", 55 extent_state_cache = kmem_cache_create("extent_state",
59 sizeof(struct extent_state), 0, 56 sizeof(struct extent_state), 0,
60 NULL); 57 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
61 if (!extent_state_cache) 58 if (!extent_state_cache)
62 return -ENOMEM; 59 return -ENOMEM;
63 60
64 extent_buffer_cache = btrfs_cache_create("extent_buffers", 61 extent_buffer_cache = kmem_cache_create("extent_buffers",
65 sizeof(struct extent_buffer), 0, 62 sizeof(struct extent_buffer), 0,
66 NULL); 63 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
67 if (!extent_buffer_cache) 64 if (!extent_buffer_cache)
68 goto free_state_cache; 65 goto free_state_cache;
69 return 0; 66 return 0;
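With extent_map staying inside btrfs, the temporary btrfs_cache_create() wrapper is dropped and the slab caches are created directly: SLAB_RECLAIM_ACCOUNT tells the VM this memory is reclaimable, and SLAB_MEM_SPREAD spreads allocations across nodes for cpuset users. The call shape as a kernel-style fragment; the "demo" names are placeholders and this is not standalone userspace code:

#include <linux/slab.h>

struct demo_obj {
	int payload;
};

static struct kmem_cache *demo_cache;

static int demo_cache_init(void)
{
	/* same flags as the extent_state/extent_buffers caches above */
	demo_cache = kmem_cache_create("demo_objs", sizeof(struct demo_obj),
				       0, SLAB_RECLAIM_ACCOUNT |
				       SLAB_MEM_SPREAD, NULL);
	if (!demo_cache)
		return -ENOMEM;
	return 0;
}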
@@ -1404,69 +1401,6 @@ out:
1404 return total_bytes; 1401 return total_bytes;
1405} 1402}
1406 1403
1407#if 0
1408/*
1409 * helper function to lock both pages and extents in the tree.
1410 * pages must be locked first.
1411 */
1412static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1413{
1414 unsigned long index = start >> PAGE_CACHE_SHIFT;
1415 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1416 struct page *page;
1417 int err;
1418
1419 while (index <= end_index) {
1420 page = grab_cache_page(tree->mapping, index);
1421 if (!page) {
1422 err = -ENOMEM;
1423 goto failed;
1424 }
1425 if (IS_ERR(page)) {
1426 err = PTR_ERR(page);
1427 goto failed;
1428 }
1429 index++;
1430 }
1431 lock_extent(tree, start, end, GFP_NOFS);
1432 return 0;
1433
1434failed:
1435 /*
1436 * we failed above in getting the page at 'index', so we undo here
1437 * up to but not including the page at 'index'
1438 */
1439 end_index = index;
1440 index = start >> PAGE_CACHE_SHIFT;
1441 while (index < end_index) {
1442 page = find_get_page(tree->mapping, index);
1443 unlock_page(page);
1444 page_cache_release(page);
1445 index++;
1446 }
1447 return err;
1448}
1449
1450/*
1451 * helper function to unlock both pages and extents in the tree.
1452 */
1453static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1454{
1455 unsigned long index = start >> PAGE_CACHE_SHIFT;
1456 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1457 struct page *page;
1458
1459 while (index <= end_index) {
1460 page = find_get_page(tree->mapping, index);
1461 unlock_page(page);
1462 page_cache_release(page);
1463 index++;
1464 }
1465 unlock_extent(tree, start, end, GFP_NOFS);
1466 return 0;
1467}
1468#endif
1469
1470/* 1404/*
1471 * set the private field for a given byte offset in the tree. If there isn't 1405 * set the private field for a given byte offset in the tree. If there isn't
1472 * an extent_state there already, this does nothing. 1406 * an extent_state there already, this does nothing.
@@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2101 return ret; 2035 return ret;
2102} 2036}
2103 2037
2038static noinline void update_nr_written(struct page *page,
2039 struct writeback_control *wbc,
2040 unsigned long nr_written)
2041{
2042 wbc->nr_to_write -= nr_written;
2043 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2044 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2045 page->mapping->writeback_index = page->index + nr_written;
2046}
2047
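update_nr_written() centralizes bookkeeping that used to happen once at the bottom of __extent_writepage(): spend the writeback page budget and, for cyclic scans, remember where the next pass should resume. A simplified userspace model; the struct is illustrative, not struct writeback_control:

#include <stdio.h>

struct wb_state {
	long nr_to_write;		/* pages we may still write */
	unsigned long resume_index;	/* where the next scan starts */
	int range_cyclic;
};

static void account_written(struct wb_state *wb,
			    unsigned long page_index, unsigned long nr)
{
	wb->nr_to_write -= nr;
	if (wb->range_cyclic)
		wb->resume_index = page_index + nr;
}

int main(void)
{
	struct wb_state wb = { 64, 0, 1 };

	account_written(&wb, 10, 1);
	printf("budget=%ld resume=%lu\n", wb.nr_to_write, wb.resume_index);
	return 0;
}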
2104/* 2048/*
2105 * the writepage semantics are similar to regular writepage. extent 2049 * the writepage semantics are similar to regular writepage. extent
2106 * records are inserted to lock ranges in the tree, and as dirty areas 2050 * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 u64 delalloc_end; 2080 u64 delalloc_end;
2137 int page_started; 2081 int page_started;
2138 int compressed; 2082 int compressed;
2083 int write_flags;
2139 unsigned long nr_written = 0; 2084 unsigned long nr_written = 0;
2140 2085
2086 if (wbc->sync_mode == WB_SYNC_ALL)
2087 write_flags = WRITE_SYNC_PLUG;
2088 else
2089 write_flags = WRITE;
2090
2141 WARN_ON(!PageLocked(page)); 2091 WARN_ON(!PageLocked(page));
2142 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2092 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2143 if (page->index > end_index || 2093 if (page->index > end_index ||
@@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2164 delalloc_end = 0; 2114 delalloc_end = 0;
2165 page_started = 0; 2115 page_started = 0;
2166 if (!epd->extent_locked) { 2116 if (!epd->extent_locked) {
2117 /*
2118 * make sure the wbc mapping index is at least updated
2119 * to this page.
2120 */
2121 update_nr_written(page, wbc, 0);
2122
2167 while (delalloc_end < page_end) { 2123 while (delalloc_end < page_end) {
2168 nr_delalloc = find_lock_delalloc_range(inode, tree, 2124 nr_delalloc = find_lock_delalloc_range(inode, tree,
2169 page, 2125 page,
@@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2185 */ 2141 */
2186 if (page_started) { 2142 if (page_started) {
2187 ret = 0; 2143 ret = 0;
2188 goto update_nr_written; 2144 /*
2145 * we've unlocked the page, so we can't update
2146 * the mapping's writeback index, just update
2147 * nr_to_write.
2148 */
2149 wbc->nr_to_write -= nr_written;
2150 goto done_unlocked;
2189 } 2151 }
2190 } 2152 }
2191 lock_extent(tree, start, page_end, GFP_NOFS); 2153 lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2198 if (ret == -EAGAIN) { 2160 if (ret == -EAGAIN) {
2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2161 unlock_extent(tree, start, page_end, GFP_NOFS);
2200 redirty_page_for_writepage(wbc, page); 2162 redirty_page_for_writepage(wbc, page);
2163 update_nr_written(page, wbc, nr_written);
2201 unlock_page(page); 2164 unlock_page(page);
2202 ret = 0; 2165 ret = 0;
2203 goto update_nr_written; 2166 goto done_unlocked;
2204 } 2167 }
2205 } 2168 }
2206 2169
2207 nr_written++; 2170 /*
2171 * we don't want to touch the inode after unlocking the page,
2172 * so we update the mapping writeback index now
2173 */
2174 update_nr_written(page, wbc, nr_written + 1);
2208 2175
2209 end = page_end; 2176 end = page_end;
2210 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) 2177 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2314 (unsigned long long)end); 2281 (unsigned long long)end);
2315 } 2282 }
2316 2283
2317 ret = submit_extent_page(WRITE, tree, page, sector, 2284 ret = submit_extent_page(write_flags, tree, page,
2318 iosize, pg_offset, bdev, 2285 sector, iosize, pg_offset,
2319 &epd->bio, max_nr, 2286 bdev, &epd->bio, max_nr,
2320 end_bio_extent_writepage, 2287 end_bio_extent_writepage,
2321 0, 0, 0); 2288 0, 0, 0);
2322 if (ret) 2289 if (ret)
@@ -2336,11 +2303,8 @@ done:
2336 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2303 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2337 unlock_page(page); 2304 unlock_page(page);
2338 2305
2339update_nr_written: 2306done_unlocked:
2340 wbc->nr_to_write -= nr_written; 2307
2341 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2342 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2343 page->mapping->writeback_index = page->index + nr_written;
2344 return 0; 2308 return 0;
2345} 2309}
2346 2310
@@ -2460,15 +2424,23 @@ retry:
2460 return ret; 2424 return ret;
2461} 2425}
2462 2426
2463static noinline void flush_write_bio(void *data) 2427static void flush_epd_write_bio(struct extent_page_data *epd)
2464{ 2428{
2465 struct extent_page_data *epd = data;
2466 if (epd->bio) { 2429 if (epd->bio) {
2467 submit_one_bio(WRITE, epd->bio, 0, 0); 2430 if (epd->sync_io)
2431 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2432 else
2433 submit_one_bio(WRITE, epd->bio, 0, 0);
2468 epd->bio = NULL; 2434 epd->bio = NULL;
2469 } 2435 }
2470} 2436}
2471 2437
2438static noinline void flush_write_bio(void *data)
2439{
2440 struct extent_page_data *epd = data;
2441 flush_epd_write_bio(epd);
2442}
2443
2472int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2444int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2473 get_extent_t *get_extent, 2445 get_extent_t *get_extent,
2474 struct writeback_control *wbc) 2446 struct writeback_control *wbc)
@@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2480 .tree = tree, 2452 .tree = tree,
2481 .get_extent = get_extent, 2453 .get_extent = get_extent,
2482 .extent_locked = 0, 2454 .extent_locked = 0,
2455 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2483 }; 2456 };
2484 struct writeback_control wbc_writepages = { 2457 struct writeback_control wbc_writepages = {
2485 .bdi = wbc->bdi, 2458 .bdi = wbc->bdi,
2486 .sync_mode = WB_SYNC_NONE, 2459 .sync_mode = wbc->sync_mode,
2487 .older_than_this = NULL, 2460 .older_than_this = NULL,
2488 .nr_to_write = 64, 2461 .nr_to_write = 64,
2489 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2462 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2490 .range_end = (loff_t)-1, 2463 .range_end = (loff_t)-1,
2491 }; 2464 };
2492 2465
2493
2494 ret = __extent_writepage(page, wbc, &epd); 2466 ret = __extent_writepage(page, wbc, &epd);
2495 2467
2496 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2468 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2497 __extent_writepage, &epd, flush_write_bio); 2469 __extent_writepage, &epd, flush_write_bio);
2498 if (epd.bio) 2470 flush_epd_write_bio(&epd);
2499 submit_one_bio(WRITE, epd.bio, 0, 0);
2500 return ret; 2471 return ret;
2501} 2472}
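flush_epd_write_bio() gives every exit path one place that submits the pending bio at the right priority: callers that set sync_io (WB_SYNC_ALL writeback) get a synchronous write, everyone else a normal one. The helper's shape, modelled in userspace:

#include <stdio.h>

struct epd {
	int has_bio;
	int sync_io;	/* set when the caller used WB_SYNC_ALL */
};

static void flush_epd(struct epd *e)
{
	if (!e->has_bio)
		return;
	/* one decision point instead of per-caller submit calls */
	printf("submit %s write\n", e->sync_io ? "sync" : "async");
	e->has_bio = 0;
}

int main(void)
{
	struct epd e = { 1, 1 };

	flush_epd(&e);	/* prints "submit sync write" */
	return 0;
}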
2502 2473
@@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2515 .tree = tree, 2486 .tree = tree,
2516 .get_extent = get_extent, 2487 .get_extent = get_extent,
2517 .extent_locked = 1, 2488 .extent_locked = 1,
2489 .sync_io = mode == WB_SYNC_ALL,
2518 }; 2490 };
2519 struct writeback_control wbc_writepages = { 2491 struct writeback_control wbc_writepages = {
2520 .bdi = inode->i_mapping->backing_dev_info, 2492 .bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2540 start += PAGE_CACHE_SIZE; 2512 start += PAGE_CACHE_SIZE;
2541 } 2513 }
2542 2514
2543 if (epd.bio) 2515 flush_epd_write_bio(&epd);
2544 submit_one_bio(WRITE, epd.bio, 0, 0);
2545 return ret; 2516 return ret;
2546} 2517}
2547 2518
@@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree,
2556 .tree = tree, 2527 .tree = tree,
2557 .get_extent = get_extent, 2528 .get_extent = get_extent,
2558 .extent_locked = 0, 2529 .extent_locked = 0,
2530 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2559 }; 2531 };
2560 2532
2561 ret = extent_write_cache_pages(tree, mapping, wbc, 2533 ret = extent_write_cache_pages(tree, mapping, wbc,
2562 __extent_writepage, &epd, 2534 __extent_writepage, &epd,
2563 flush_write_bio); 2535 flush_write_bio);
2564 if (epd.bio) 2536 flush_epd_write_bio(&epd);
2565 submit_one_bio(WRITE, epd.bio, 0, 0);
2566 return ret; 2537 return ret;
2567} 2538}
2568 2539
@@ -2884,25 +2855,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2855 disko = 0;
2885 flags = 0; 2856 flags = 0;
2886 2857
2887 		switch (em->block_start) {
2888 		case EXTENT_MAP_LAST_BYTE:
2889 			end = 1;
2890 			flags |= FIEMAP_EXTENT_LAST;
2891 			break;
2892 		case EXTENT_MAP_HOLE:
2893 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 			break;
2895 		case EXTENT_MAP_INLINE:
2896 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 				  FIEMAP_EXTENT_NOT_ALIGNED);
2898 			break;
2899 		case EXTENT_MAP_DELALLOC:
2900 			flags |= (FIEMAP_EXTENT_DELALLOC |
2901 				  FIEMAP_EXTENT_UNKNOWN);
2902 			break;
2903 		default:
2904 			disko = em->block_start;
2905 			break;
2906 		}
2858 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2859 			end = 1;
2860 			flags |= FIEMAP_EXTENT_LAST;
2861 		} else if (em->block_start == EXTENT_MAP_HOLE) {
2862 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2863 		} else if (em->block_start == EXTENT_MAP_INLINE) {
2864 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2865 				  FIEMAP_EXTENT_NOT_ALIGNED);
2866 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2867 			flags |= (FIEMAP_EXTENT_DELALLOC |
2868 				  FIEMAP_EXTENT_UNKNOWN);
2869 		} else {
2870 			disko = em->block_start;
2871 		}
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2872 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2873 flags |= FIEMAP_EXTENT_ENCODED;
@@ -3124,20 +3089,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3124int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3089int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 struct extent_buffer *eb) 3090 struct extent_buffer *eb)
3126{ 3091{
3127 int set;
3128 unsigned long i; 3092 unsigned long i;
3129 unsigned long num_pages; 3093 unsigned long num_pages;
3130 struct page *page; 3094 struct page *page;
3131 3095
3132 u64 start = eb->start;
3133 u64 end = start + eb->len - 1;
3134
3135 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3136 num_pages = num_extent_pages(eb->start, eb->len); 3096 num_pages = num_extent_pages(eb->start, eb->len);
3137 3097
3138 for (i = 0; i < num_pages; i++) { 3098 for (i = 0; i < num_pages; i++) {
3139 page = extent_buffer_page(eb, i); 3099 page = extent_buffer_page(eb, i);
3140 if (!set && !PageDirty(page)) 3100 if (!PageDirty(page))
3141 continue; 3101 continue;
3142 3102
3143 lock_page(page); 3103 lock_page(page);
@@ -3146,22 +3106,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3146 else 3106 else
3147 set_page_private(page, EXTENT_PAGE_PRIVATE); 3107 set_page_private(page, EXTENT_PAGE_PRIVATE);
3148 3108
3149 /*
3150 * if we're on the last page or the first page and the
3151 * block isn't aligned on a page boundary, do extra checks
3152 * to make sure we don't clean page that is partially dirty
3153 */
3154 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3155 ((i == num_pages - 1) &&
3156 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3157 start = (u64)page->index << PAGE_CACHE_SHIFT;
3158 end = start + PAGE_CACHE_SIZE - 1;
3159 if (test_range_bit(tree, start, end,
3160 EXTENT_DIRTY, 0)) {
3161 unlock_page(page);
3162 continue;
3163 }
3164 }
3165 clear_page_dirty_for_io(page); 3109 clear_page_dirty_for_io(page);
3166 spin_lock_irq(&page->mapping->tree_lock); 3110 spin_lock_irq(&page->mapping->tree_lock);
3167 if (!PageDirty(page)) { 3111 if (!PageDirty(page)) {
@@ -3187,29 +3131,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3187{ 3131{
3188 unsigned long i; 3132 unsigned long i;
3189 unsigned long num_pages; 3133 unsigned long num_pages;
3134 int was_dirty = 0;
3190 3135
3136 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3191 num_pages = num_extent_pages(eb->start, eb->len); 3137 num_pages = num_extent_pages(eb->start, eb->len);
3192 for (i = 0; i < num_pages; i++) { 3138 for (i = 0; i < num_pages; i++)
3193 struct page *page = extent_buffer_page(eb, i);
3194 /* writepage may need to do something special for the
3195 * first page, we have to make sure page->private is
3196 * properly set. releasepage may drop page->private
3197 * on us if the page isn't already dirty.
3198 */
3199 lock_page(page);
3200 if (i == 0) {
3201 set_page_extent_head(page, eb->len);
3202 } else if (PagePrivate(page) &&
3203 page->private != EXTENT_PAGE_PRIVATE) {
3204 set_page_extent_mapped(page);
3205 }
3206 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3139 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3207 set_extent_dirty(tree, page_offset(page), 3140 return was_dirty;
3208 page_offset(page) + PAGE_CACHE_SIZE - 1,
3209 GFP_NOFS);
3210 unlock_page(page);
3211 }
3212 return 0;
3213} 3141}
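set_extent_buffer_dirty() now costs one atomic bit operation on the buffer instead of per-page locking plus extent-tree dirty state. The key idiom is test_and_set_bit(), which sets the flag and atomically reports whether it was already set; a C11 equivalent:

#include <stdatomic.h>
#include <stdio.h>

#define BUF_DIRTY (1u << 2)	/* mirrors the EXTENT_BUFFER_DIRTY bit */

static int mark_dirty(atomic_uint *flags)
{
	/* nonzero means the buffer was already dirty */
	return atomic_fetch_or(flags, BUF_DIRTY) & BUF_DIRTY;
}

int main(void)
{
	atomic_uint flags = 0;

	printf("was dirty: %d\n", !!mark_dirty(&flags));	/* 0 */
	printf("was dirty: %d\n", !!mark_dirty(&flags));	/* 1 */
	return 0;
}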
3214 3142
3215int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3143int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3717,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3789 ret = 0; 3717 ret = 0;
3790 goto out; 3718 goto out;
3791 } 3719 }
3720 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3721 ret = 0;
3722 goto out;
3723 }
3792 /* at this point we can safely release the extent buffer */ 3724 /* at this point we can safely release the extent buffer */
3793 num_pages = num_extent_pages(eb->start, eb->len); 3725 num_pages = num_extent_pages(eb->start, eb->len);
3794 for (i = 0; i < num_pages; i++) 3726 for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..30c9365861e6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
6#include <linux/hardirq.h> 6#include <linux/hardirq.h>
7#include "extent_map.h" 7#include "extent_map.h"
8 8
9/* temporary define until extent_map moves out of btrfs */
10struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
11 unsigned long extra_flags,
12 void (*ctor)(void *, struct kmem_cache *,
13 unsigned long));
14 9
15static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
16 11
17int __init extent_map_init(void) 12int __init extent_map_init(void)
18{ 13{
19 extent_map_cache = btrfs_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("extent_map",
20 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
21 NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
22 if (!extent_map_cache) 17 if (!extent_map_cache)
23 return -ENOMEM; 18 return -ENOMEM;
24 return 0; 19 return 0;
@@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43 tree->map.rb_node = NULL; 38 tree->map.rb_node = NULL;
44 spin_lock_init(&tree->lock); 39 spin_lock_init(&tree->lock);
45} 40}
46EXPORT_SYMBOL(extent_map_tree_init);
47 41
48/** 42/**
49 * alloc_extent_map - allocate new extent map structure 43 * alloc_extent_map - allocate new extent map structure
@@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
64 atomic_set(&em->refs, 1); 58 atomic_set(&em->refs, 1);
65 return em; 59 return em;
66} 60}
67EXPORT_SYMBOL(alloc_extent_map);
68 61
69/** 62/**
70 * free_extent_map - drop reference count of an extent_map 63 * free_extent_map - drop reference count of an extent_map
@@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em)
83 kmem_cache_free(extent_map_cache, em); 76 kmem_cache_free(extent_map_cache, em);
84 } 77 }
85} 78}
86EXPORT_SYMBOL(free_extent_map);
87 79
88static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 80static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
89 struct rb_node *node) 81 struct rb_node *node)
@@ -234,7 +226,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 226 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 227 if (rb) {
236 ret = -EEXIST; 228 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 229 goto out;
239 } 230 }
240 atomic_inc(&em->refs); 231 atomic_inc(&em->refs);
@@ -265,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
265out: 256out:
266 return ret; 257 return ret;
267} 258}
268EXPORT_SYMBOL(add_extent_mapping);
269 259
270/* simple helper to do math around the end of an extent, handling wrap */ 260/* simple helper to do math around the end of an extent, handling wrap */
271static u64 range_end(u64 start, u64 len) 261static u64 range_end(u64 start, u64 len)
@@ -327,7 +317,6 @@ found:
327out: 317out:
328 return em; 318 return em;
329} 319}
330EXPORT_SYMBOL(lookup_extent_mapping);
331 320
332/** 321/**
333 * remove_extent_mapping - removes an extent_map from the extent tree 322 * remove_extent_mapping - removes an extent_map from the extent tree
@@ -347,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
347 em->in_tree = 0; 336 em->in_tree = 0;
348 return ret; 337 return ret;
349} 338}
350EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
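The path->leave_spinning assignments sprinkled through this file mark short, non-sleeping item insertions: the tree search may keep its cheap spinning locks rather than converting them to blocking locks the caller never needs. An illustrative model of the flag's effect, not the real btrfs locking code:

#include <stdio.h>

struct path {
	int leave_spinning;
	int blocking;
};

static void finish_search(struct path *p)
{
	/* by default, hand back locks the caller may sleep under */
	if (!p->leave_spinning)
		p->blocking = 1;
}

int main(void)
{
	struct path p = { 1, 0 };

	finish_search(&p);
	printf("blocking=%d\n", p.blocking);	/* stays spinning: 0 */
	return 0;
}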
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..1d51dc38bb49 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
272 return 0; 272 return 0;
273} 273}
274 274
275int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
276{
277 return 0;
278#if 0
279 struct btrfs_path *path;
280 struct btrfs_key found_key;
281 struct extent_buffer *leaf;
282 struct btrfs_file_extent_item *extent;
283 u64 last_offset = 0;
284 int nritems;
285 int slot;
286 int found_type;
287 int ret;
288 int err = 0;
289 u64 extent_end = 0;
290
291 path = btrfs_alloc_path();
292 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
293 last_offset, 0);
294 while (1) {
295 nritems = btrfs_header_nritems(path->nodes[0]);
296 if (path->slots[0] >= nritems) {
297 ret = btrfs_next_leaf(root, path);
298 if (ret)
299 goto out;
300 nritems = btrfs_header_nritems(path->nodes[0]);
301 }
302 slot = path->slots[0];
303 leaf = path->nodes[0];
304 btrfs_item_key_to_cpu(leaf, &found_key, slot);
305 if (found_key.objectid != inode->i_ino)
306 break;
307 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
308 goto out;
309
310 if (found_key.offset < last_offset) {
311 WARN_ON(1);
312 btrfs_print_leaf(root, leaf);
313 printk(KERN_ERR "inode %lu found offset %llu "
314 "expected %llu\n", inode->i_ino,
315 (unsigned long long)found_key.offset,
316 (unsigned long long)last_offset);
317 err = 1;
318 goto out;
319 }
320 extent = btrfs_item_ptr(leaf, slot,
321 struct btrfs_file_extent_item);
322 found_type = btrfs_file_extent_type(leaf, extent);
323 if (found_type == BTRFS_FILE_EXTENT_REG) {
324 extent_end = found_key.offset +
325 btrfs_file_extent_num_bytes(leaf, extent);
326 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
327 struct btrfs_item *item;
328 item = btrfs_item_nr(leaf, slot);
329 extent_end = found_key.offset +
330 btrfs_file_extent_inline_len(leaf, extent);
331 extent_end = (extent_end + root->sectorsize - 1) &
332 ~((u64)root->sectorsize - 1);
333 }
334 last_offset = extent_end;
335 path->slots[0]++;
336 }
337 if (0 && last_offset < inode->i_size) {
338 WARN_ON(1);
339 btrfs_print_leaf(root, leaf);
340 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
341 inode->i_ino, (unsigned long long)last_offset,
342 (unsigned long long)inode->i_size);
343 err = 1;
344
345 }
346out:
347 btrfs_free_path(path);
348 return err;
349#endif
350}
351
352/* 275/*
353 * this is very complex, but the basic idea is to drop all extents 276 * this is very complex, but the basic idea is to drop all extents
354 * in the range start - end. hint_block is filled in with a block number 277 * in the range start - end. hint_block is filled in with a block number
@@ -363,15 +286,16 @@ out:
363 */ 286 */
364noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 287noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
365 struct btrfs_root *root, struct inode *inode, 288 struct btrfs_root *root, struct inode *inode,
366 u64 start, u64 end, u64 inline_limit, u64 *hint_byte) 289 u64 start, u64 end, u64 locked_end,
290 u64 inline_limit, u64 *hint_byte)
367{ 291{
368 u64 extent_end = 0; 292 u64 extent_end = 0;
369 u64 locked_end = end;
370 u64 search_start = start; 293 u64 search_start = start;
371 u64 leaf_start; 294 u64 leaf_start;
372 u64 ram_bytes = 0; 295 u64 ram_bytes = 0;
373 u64 orig_parent = 0; 296 u64 orig_parent = 0;
374 u64 disk_bytenr = 0; 297 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end;
375 u8 compression; 299 u8 compression;
376 u8 encryption; 300 u8 encryption;
377 u16 other_encoding = 0; 301 u16 other_encoding = 0;
@@ -606,6 +530,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 530 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 531
608 btrfs_release_path(root, path); 532 btrfs_release_path(root, path);
533 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 534 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 535 sizeof(*extent));
611 BUG_ON(ret); 536 BUG_ON(ret);
@@ -639,17 +564,22 @@ next_slot:
639 ram_bytes); 564 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 565 btrfs_set_file_extent_type(leaf, extent, found_type);
641 566
567 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 568 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]);
643 570
644 if (disk_bytenr != 0) { 571 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 572 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
647 leaf->start, 576 leaf->start,
648 root->root_key.objectid, 577 root->root_key.objectid,
649 trans->transid, ins.objectid); 578 trans->transid, ins.objectid);
650 579
651 BUG_ON(ret); 580 BUG_ON(ret);
652 } 581 }
582 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 583 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 584 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 585 inode_add_bytes(inode, extent_end - end);
@@ -678,11 +608,10 @@ next_slot:
678 } 608 }
679out: 609out:
680 btrfs_free_path(path); 610 btrfs_free_path(path);
681 if (locked_end > end) { 611 if (locked_end > orig_locked_end) {
682 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, 612 unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
683 GFP_NOFS); 613 locked_end - 1, GFP_NOFS);
684 } 614 }
685 btrfs_check_file(root, inode);
686 return ret; 615 return ret;
687} 616}
688 617
@@ -824,7 +753,7 @@ again:
824 753
825 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 754 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
826 BUG_ON(ret); 755 BUG_ON(ret);
827 goto done; 756 goto release;
828 } else if (split == start) { 757 } else if (split == start) {
829 if (locked_end < extent_end) { 758 if (locked_end < extent_end) {
830 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 759 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -912,7 +841,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 841 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 842
914 if (orig_parent != leaf->start) { 843 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 845 orig_parent, leaf->start,
917 root->root_key.objectid, 846 root->root_key.objectid,
918 trans->transid, inode->i_ino); 847 trans->transid, inode->i_ino);
@@ -920,6 +849,8 @@ again:
920 } 849 }
921done: 850done:
922 btrfs_mark_buffer_dirty(leaf); 851 btrfs_mark_buffer_dirty(leaf);
852
853release:
923 btrfs_release_path(root, path); 854 btrfs_release_path(root, path);
924 if (split_end && split == start) { 855 if (split_end && split == start) {
925 split = end; 856 split = end;
@@ -1125,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1125 if (will_write) { 1056 if (will_write) {
1126 btrfs_fdatawrite_range(inode->i_mapping, pos, 1057 btrfs_fdatawrite_range(inode->i_mapping, pos,
1127 pos + write_bytes - 1, 1058 pos + write_bytes - 1,
1128 WB_SYNC_NONE); 1059 WB_SYNC_ALL);
1129 } else { 1060 } else {
1130 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1061 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1131 num_pages); 1062 num_pages);
@@ -1155,6 +1086,20 @@ out_nolock:
1155 page_cache_release(pinned[1]); 1086 page_cache_release(pinned[1]);
1156 *ppos = pos; 1087 *ppos = pos;
1157 1088
1089 /*
1090 * we want to make sure fsync finds this change
1091 * but we haven't joined a transaction running right now.
1092 *
1093 * Later on, someone is sure to update the inode and get the
1094 * real transid recorded.
1095 *
1096 * We set last_trans now to the fs_info generation + 1,
1097 * this will either be one more than the running transaction
1098 * or the generation used for the next transaction if there isn't
1099 * one running right now.
1100 */
1101 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1102
1158 if (num_written > 0 && will_write) { 1103 if (num_written > 0 && will_write) {
1159 struct btrfs_trans_handle *trans; 1104 struct btrfs_trans_handle *trans;
1160 1105
@@ -1167,8 +1112,11 @@ out_nolock:
1167 ret = btrfs_log_dentry_safe(trans, root, 1112 ret = btrfs_log_dentry_safe(trans, root,
1168 file->f_dentry); 1113 file->f_dentry);
1169 if (ret == 0) { 1114 if (ret == 0) {
1170 btrfs_sync_log(trans, root); 1115 ret = btrfs_sync_log(trans, root);
1171 btrfs_end_transaction(trans, root); 1116 if (ret == 0)
1117 btrfs_end_transaction(trans, root);
1118 else
1119 btrfs_commit_transaction(trans, root);
1172 } else { 1120 } else {
1173 btrfs_commit_transaction(trans, root); 1121 btrfs_commit_transaction(trans, root);
1174 } 1122 }
@@ -1185,6 +1133,18 @@ out_nolock:
1185 1133
1186int btrfs_release_file(struct inode *inode, struct file *filp) 1134int btrfs_release_file(struct inode *inode, struct file *filp)
1187{ 1135{
1136 /*
1137 * ordered_data_close is set by setattr when we are about to truncate
1138 * a file from a non-zero size to a zero size. This tries to
1139 * flush down new bytes that may have been written if the
1140 * application were using truncate to replace a file in place.
1141 */
1142 if (BTRFS_I(inode)->ordered_data_close) {
1143 BTRFS_I(inode)->ordered_data_close = 0;
1144 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1145 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1146 filemap_flush(inode->i_mapping);
1147 }
1188 if (filp->private_data) 1148 if (filp->private_data)
1189 btrfs_ioctl_trans_end(filp); 1149 btrfs_ioctl_trans_end(filp);
1190 return 0; 1150 return 0;
@@ -1260,8 +1220,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1260 if (ret > 0) { 1220 if (ret > 0) {
1261 ret = btrfs_commit_transaction(trans, root); 1221 ret = btrfs_commit_transaction(trans, root);
1262 } else { 1222 } else {
1263 btrfs_sync_log(trans, root); 1223 ret = btrfs_sync_log(trans, root);
1264 ret = btrfs_end_transaction(trans, root); 1224 if (ret == 0)
1225 ret = btrfs_end_transaction(trans, root);
1226 else
1227 ret = btrfs_commit_transaction(trans, root);
1265 } 1228 }
1266 mutex_lock(&dentry->d_inode->i_mutex); 1229 mutex_lock(&dentry->d_inode->i_mutex);
1267out: 1230out:
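Both fsync paths now honor btrfs_sync_log()'s return value: if the cheap tree-log write fails, they fall back to a full transaction commit rather than ending the transaction and silently weakening fsync()'s durability promise. The control flow in miniature, with hypothetical helpers:

#include <stdio.h>

struct txn { int log_ok; };

static int sync_log(struct txn *t)   { return t->log_ok ? 0 : -1; }
static int end_txn(struct txn *t)    { (void)t; return 0; }
static int commit_txn(struct txn *t) { (void)t; return 0; }

static int fsync_like(struct txn *t)
{
	if (sync_log(t) == 0)
		return end_txn(t);	/* fast path: log replay suffices */
	return commit_txn(t);		/* slow but safe fallback */
}

int main(void)
{
	struct txn t = { 0 };

	printf("ret=%d\n", fsync_like(&t));	/* falls back to commit */
	return 0;
}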
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..0bc93657b460 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
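
The new struct btrfs_free_space carries two rb_node members because every free extent is indexed twice: by offset, for merging neighbors and honoring placement hints, and by bytes, for size-based lookups. A userspace sketch of the two orderings this layout implies (plain comparators here; the kernel hangs <linux/rbtree.h> nodes embedded in the entry off each tree):

    #include <stdint.h>
    #include <stdio.h>

    /* one free extent; the kernel version also embeds the two rb_nodes */
    struct free_space { uint64_t offset; uint64_t bytes; };

    /* ordering for the offset tree: where the space is */
    static int cmp_offset(const struct free_space *a, const struct free_space *b)
    {
            return (a->offset > b->offset) - (a->offset < b->offset);
    }

    /* ordering for the bytes tree: how big the space is */
    static int cmp_bytes(const struct free_space *a, const struct free_space *b)
    {
            return (a->bytes > b->bytes) - (a->bytes < b->bytes);
    }

    int main(void)
    {
            struct free_space a = { .offset = 0,    .bytes = 8192 };
            struct free_space b = { .offset = 4096, .bytes = 4096 };

            /* a sorts before b by offset but after b by bytes */
            printf("%d %d\n", cmp_offset(&a, &b), cmp_bytes(&a, &b));
            return 0;
    }

Keeping the two indexes consistent is the point of the link_free_space()/unlink_free_space() helpers below: an entry is always added to, or removed from, both trees together.
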
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
 84 * mark, or the hint may no longer point to free space, we need to fudge our
 85 * results a bit. So we look for free space starting at or after offset, at
 86 * least bytes in size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
 92 * starts at the given offset and is at least bytes in size; if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
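
The parameter rename from contains to fuzzy changes what counts as a hit, per the rewritten comment above. A userspace sketch of both modes over a sorted array instead of an rbtree (a linear scan for brevity; the real code tracks its best candidate during the tree descent, but the acceptance rules are the same):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* e[] sorted by offset, non-overlapping. fuzzy == 0 demands an entry
     * starting exactly at offset; fuzzy == 1 also accepts the entry that
     * contains offset, or failing that the first suitable one after it. */
    static struct free_space *search_offset(struct free_space *e, size_t n,
                                            uint64_t offset, uint64_t bytes,
                                            int fuzzy)
    {
            struct free_space *after = NULL;
            size_t i;

            for (i = 0; i < n; i++) {
                    if (e[i].bytes < bytes)
                            continue;               /* too small in either mode */
                    if (e[i].offset == offset)
                            return &e[i];           /* exact start: both modes */
                    if (!fuzzy)
                            continue;
                    if (e[i].offset < offset &&
                        e[i].offset + e[i].bytes - 1 >= offset)
                            return &e[i];           /* contains the hint */
                    if (e[i].offset > offset && !after)
                            after = &e[i];          /* closest entry after the hint */
            }
            return after;
    }

    int main(void)
    {
            struct free_space e[] = { { 0, 4096 }, { 8192, 4096 } };

            assert(search_offset(e, 2, 100, 512, 1) == &e[0]); /* containment */
            assert(search_offset(e, 2, 100, 512, 0) == NULL);  /* exact only */
            return 0;
    }
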
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
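
The rewritten btrfs_add_free_space always builds a fresh entry, then absorbs whichever neighbors abut it: a right neighbor found by an exact search at offset + bytes, and a left neighbor found by a fuzzy search at offset - 1 and merged only when it ends exactly at offset. A userspace sketch of the merge arithmetic alone (tree linking and locking omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* merge the new range [offset, offset + bytes) with its neighbors.
     * right, if non-NULL, starts exactly at offset + bytes; left is
     * merged only when it ends exactly at offset. Both are assumed to
     * be unlinked from the trees already, as in the code above. */
    static struct free_space *merge_range(uint64_t offset, uint64_t bytes,
                                          struct free_space *left,
                                          struct free_space *right)
    {
            struct free_space *info = calloc(1, sizeof(*info));

            if (!info)
                    return NULL;
            info->offset = offset;
            info->bytes = bytes;

            if (right) {                            /* absorb the higher neighbor */
                    info->bytes += right->bytes;
                    free(right);
            }
            if (left && left->offset + left->bytes == offset) {
                    info->offset = left->offset;    /* absorb the lower neighbor */
                    info->bytes += left->bytes;
                    free(left);
            }
            return info;
    }

    int main(void)
    {
            struct free_space *l = calloc(1, sizeof(*l));
            struct free_space *r = calloc(1, sizeof(*r));
            struct free_space *m;

            l->offset = 0;    l->bytes = 4096;      /* ends at 4096 */
            r->offset = 8192; r->bytes = 4096;      /* starts at 8192 */
            m = merge_range(4096, 4096, l, r);      /* plugs the gap exactly */
            assert(m && m->offset == 0 && m->bytes == 12288);
            free(m);
            return 0;
    }
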
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,37 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
344 WARN_ON(1); 347 WARN_ON(1);
345 } 348 }
346out: 349out:
347 return ret; 350 return ret;
348} 351}
349 352
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 353void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 354 u64 bytes)
402{ 355{
@@ -408,6 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 361 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 362 if (info->bytes >= bytes)
410 count++; 363 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n",
365 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes);
411 } 367 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 369 "\n", count);
@@ -428,68 +384,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 384 return ret;
429} 385}
430 386
387/*
388 * for a given cluster, put all of its extents back into the free
389 * space cache. If the block group passed doesn't match the block group
390 * pointed to by the cluster, someone else raced in and freed the
391 * cluster already. In that case, we just return without changing anything
392 */
393static int
394__btrfs_return_cluster_to_free_space(
395 struct btrfs_block_group_cache *block_group,
396 struct btrfs_free_cluster *cluster)
397{
398 struct btrfs_free_space *entry;
399 struct rb_node *node;
400
401 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group)
403 goto out;
404
405 cluster->window_start = 0;
406 node = rb_first(&cluster->root);
 407 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry);
412 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL;
418out:
419 spin_unlock(&cluster->lock);
420 return 0;
421}
422
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 423void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 424{
433 struct btrfs_free_space *info; 425 struct btrfs_free_space *info;
434 struct rb_node *node; 426 struct rb_node *node;
427 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe;
429
430 spin_lock(&block_group->tree_lock);
431
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
433 block_group_list) {
434
435 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster);
437 }
435 438
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 440 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 441 unlink_free_space(block_group, info);
440 kfree(info); 442 kfree(info);
441 if (need_resched()) { 443 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 445 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 446 spin_lock(&block_group->tree_lock);
445 } 447 }
446 } 448 }
447 mutex_unlock(&block_group->alloc_mutex); 449 spin_unlock(&block_group->tree_lock);
448} 450}
449 451
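
The drain loop above also shows the cost of this file's mutex-to-spinlock conversion: cond_resched() may sleep, so the spinlock must be dropped around it and re-taken, and the loop then re-reads rb_last() rather than trusting a pointer from before the gap. A userspace sketch of the same drain shape (pthread_mutex_t and sched_yield() stand in for the spinlock and cond_resched()):

    #include <pthread.h>
    #include <sched.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    /* free every node, periodically dropping the lock so others can run.
     * *head is re-read after each re-lock, never cached across the gap. */
    static void drain(pthread_mutex_t *lock, struct node **head)
    {
            int freed = 0;

            pthread_mutex_lock(lock);
            while (*head) {
                    struct node *n = *head;

                    *head = n->next;
                    free(n);
                    if (++freed % 64 == 0) {        /* arbitrary batch size */
                            pthread_mutex_unlock(lock);
                            sched_yield();          /* let waiters in */
                            pthread_mutex_lock(lock);
                    }
            }
            pthread_mutex_unlock(lock);
    }

    int main(void)
    {
            pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
            struct node *head = calloc(1, sizeof(*head));

            drain(&lock, &head);
            return 0;
    }
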
450#if 0 452u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 453 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 454{
456 struct btrfs_free_space *ret; 455 struct btrfs_free_space *entry = NULL;
456 u64 ret = 0;
457 457
458 mutex_lock(&block_group->alloc_mutex); 458 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 459 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 460 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 461 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes,
463 offset, bytes + empty_size);
464 if (entry) {
465 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes;
468 entry->bytes -= bytes;
469
470 if (!entry->bytes)
471 kfree(entry);
472 else
473 link_free_space(block_group, entry);
474 }
475 spin_unlock(&block_group->tree_lock);
462 476
463 return ret; 477 return ret;
464} 478}
465 479
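
btrfs_find_space_for_alloc carves the request off the front of whichever entry it picks rather than deleting and reinserting anything: advance the offset, shrink the byte count, and free the entry only when nothing is left. A sketch of the carve by itself (the unlink before and relink after, shown above, are omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* take 'bytes' from the front of entry; returns the allocated offset.
     * Caller guarantees entry->bytes >= bytes and that the entry is
     * currently unlinked from both index trees. */
    static uint64_t carve_front(struct free_space *entry, uint64_t bytes)
    {
            uint64_t ret = entry->offset;

            entry->offset += bytes;
            entry->bytes -= bytes;
            if (entry->bytes == 0)
                    free(entry);            /* fully consumed */
            return ret;
    }

    int main(void)
    {
            struct free_space *e = malloc(sizeof(*e));

            e->offset = 4096;
            e->bytes = 8192;
            assert(carve_front(e, 4096) == 4096);
            assert(e->offset == 8192 && e->bytes == 4096);
            free(e);
            return 0;
    }
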
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 480/*
467 btrfs_block_group_cache 481 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 482 * cache. If a block group is passed, this function will only free
469 u64 bytes) 483 * a cluster that belongs to the passed block group.
484 *
485 * Otherwise, it'll get a reference on the block group pointed to by the
486 * cluster and remove the cluster from it.
487 */
488int btrfs_return_cluster_to_free_space(
489 struct btrfs_block_group_cache *block_group,
490 struct btrfs_free_cluster *cluster)
470{ 491{
471 struct btrfs_free_space *ret; 492 int ret;
472 493
473 mutex_lock(&block_group->alloc_mutex); 494 /* first, get a safe pointer to the block group */
495 spin_lock(&cluster->lock);
496 if (!block_group) {
497 block_group = cluster->block_group;
498 if (!block_group) {
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 } else if (cluster->block_group != block_group) {
 503 /* someone else has already freed it, don't redo their work */
504 spin_unlock(&cluster->lock);
505 return 0;
506 }
507 atomic_inc(&block_group->count);
508 spin_unlock(&cluster->lock);
474 509
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 510 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 511 spin_lock(&block_group->tree_lock);
512 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
513 spin_unlock(&block_group->tree_lock);
477 514
515 /* finally drop our ref */
516 btrfs_put_block_group(block_group);
478 return ret; 517 return ret;
479} 518}
480#endif
481 519
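
btrfs_return_cluster_to_free_space above is a textbook pinning pattern: hold the cluster lock just long enough to resolve the block-group pointer and bump its refcount, drop the lock, then do the real work against the pinned object and put the reference. A userspace sketch with a pthread mutex and a C11 atomic refcount (struct names are stand-ins):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct group   { atomic_int refs; };
    struct cluster { pthread_mutex_t lock; struct group *group; };

    /* resolve cluster->group to a pinned pointer, or NULL if it's gone.
     * The lock covers only the pointer read plus the refcount bump; the
     * caller then works on the group without holding the cluster lock. */
    static struct group *pin_group(struct cluster *c)
    {
            struct group *g;

            pthread_mutex_lock(&c->lock);
            g = c->group;
            if (g)
                    atomic_fetch_add(&g->refs, 1);  /* pointer is stable here */
            pthread_mutex_unlock(&c->lock);
            return g;
    }

    int main(void)
    {
            struct group g = { .refs = 1 };
            struct cluster c = { PTHREAD_MUTEX_INITIALIZER, &g };

            return pin_group(&c) == &g ? 0 : 1;
    }
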
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 520/*
483 *block_group, u64 offset, 521 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 522 * if it couldn't find anything suitably large, or a logical disk offset
523 * if things worked out
524 */
525u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
526 struct btrfs_free_cluster *cluster, u64 bytes,
527 u64 min_start)
485{ 528{
486 struct btrfs_free_space *ret = NULL; 529 struct btrfs_free_space *entry = NULL;
530 struct rb_node *node;
531 u64 ret = 0;
487 532
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 533 spin_lock(&cluster->lock);
489 bytes, 0); 534 if (bytes > cluster->max_size)
490 if (!ret) 535 goto out;
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493 536
537 if (cluster->block_group != block_group)
538 goto out;
539
540 node = rb_first(&cluster->root);
541 if (!node)
542 goto out;
543
544 entry = rb_entry(node, struct btrfs_free_space, offset_index);
545
 546 while (1) {
547 if (entry->bytes < bytes || entry->offset < min_start) {
548 struct rb_node *node;
549
550 node = rb_next(&entry->offset_index);
551 if (!node)
552 break;
553 entry = rb_entry(node, struct btrfs_free_space,
554 offset_index);
555 continue;
556 }
557 ret = entry->offset;
558
559 entry->offset += bytes;
560 entry->bytes -= bytes;
561
562 if (entry->bytes == 0) {
563 rb_erase(&entry->offset_index, &cluster->root);
564 kfree(entry);
565 }
566 break;
567 }
568out:
569 spin_unlock(&cluster->lock);
494 return ret; 570 return ret;
495} 571}
572
573/*
574 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free.
576 * We might not find them all in one contiguous area.
577 *
578 * returns zero and sets up cluster if things worked out, otherwise
 579 * it returns -ENOSPC
580 */
581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
582 struct btrfs_block_group_cache *block_group,
583 struct btrfs_free_cluster *cluster,
584 u64 offset, u64 bytes, u64 empty_size)
585{
586 struct btrfs_free_space *entry = NULL;
587 struct rb_node *node;
588 struct btrfs_free_space *next;
589 struct btrfs_free_space *last;
590 u64 min_bytes;
591 u64 window_start;
592 u64 window_free;
593 u64 max_extent = 0;
594 int total_retries = 0;
595 int ret;
596
 597 /* for metadata, allow allocations with more holes */
598 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
599 /*
600 * we want to do larger allocations when we are
601 * flushing out the delayed refs, it helps prevent
602 * making more work as we go along.
603 */
604 if (trans->transaction->delayed_refs.flushing)
605 min_bytes = max(bytes, (bytes + empty_size) >> 1);
606 else
607 min_bytes = max(bytes, (bytes + empty_size) >> 4);
608 } else
609 min_bytes = max(bytes, (bytes + empty_size) >> 2);
610
611 spin_lock(&block_group->tree_lock);
612 spin_lock(&cluster->lock);
613
614 /* someone already found a cluster, hooray */
615 if (cluster->block_group) {
616 ret = 0;
617 goto out;
618 }
619again:
620 min_bytes = min(min_bytes, bytes + empty_size);
621 entry = tree_search_bytes(&block_group->free_space_bytes,
622 offset, min_bytes);
623 if (!entry) {
624 ret = -ENOSPC;
625 goto out;
626 }
627 window_start = entry->offset;
628 window_free = entry->bytes;
629 last = entry;
630 max_extent = entry->bytes;
631
 632 while (1) {
 633 /* our window is just right, let's fill it */
634 if (window_free >= bytes + empty_size)
635 break;
636
637 node = rb_next(&last->offset_index);
638 if (!node) {
639 ret = -ENOSPC;
640 goto out;
641 }
642 next = rb_entry(node, struct btrfs_free_space, offset_index);
643
644 /*
645 * we haven't filled the empty size and the window is
646 * very large. reset and try again
647 */
648 if (next->offset - window_start > (bytes + empty_size) * 2) {
649 entry = next;
650 window_start = entry->offset;
651 window_free = entry->bytes;
652 last = entry;
653 max_extent = 0;
654 total_retries++;
655 if (total_retries % 256 == 0) {
656 if (min_bytes >= (bytes + empty_size)) {
657 ret = -ENOSPC;
658 goto out;
659 }
660 /*
661 * grow our allocation a bit, we're not having
662 * much luck
663 */
664 min_bytes *= 2;
665 goto again;
666 }
667 } else {
668 last = next;
669 window_free += next->bytes;
670 if (entry->bytes > max_extent)
671 max_extent = entry->bytes;
672 }
673 }
674
675 cluster->window_start = entry->offset;
676
677 /*
678 * now we've found our entries, pull them out of the free space
679 * cache and put them into the cluster rbtree
680 *
681 * The cluster includes an rbtree, but only uses the offset index
682 * of each free space cache entry.
683 */
 684 while (1) {
685 node = rb_next(&entry->offset_index);
686 unlink_free_space(block_group, entry);
687 ret = tree_insert_offset(&cluster->root, entry->offset,
688 &entry->offset_index);
689 BUG_ON(ret);
690
691 if (!node || entry == last)
692 break;
693
694 entry = rb_entry(node, struct btrfs_free_space, offset_index);
695 }
696 ret = 0;
697 cluster->max_size = max_extent;
698 atomic_inc(&block_group->count);
699 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
700 cluster->block_group = block_group;
701out:
702 spin_unlock(&cluster->lock);
703 spin_unlock(&block_group->tree_lock);
704
705 return ret;
706}
707
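
btrfs_find_space_cluster is a sliding-window search over free extents in offset order: keep widening the window until the accumulated free bytes reach bytes + empty_size, restart the window whenever the next extent sits too far from the window start, and (in the code above) double min_bytes and retry after enough restarts. A userspace sketch of one pass of the window walk, with the same (bytes + empty_size) * 2 gap rule and no retry loop:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* e[] sorted by offset. Returns the index of the window's first
     * extent and stores its last in *last, or returns -1 for -ENOSPC. */
    static long find_window(const struct free_space *e, size_t n,
                            uint64_t want, size_t *last)
    {
            uint64_t window_free;
            size_t start = 0, i;

            if (n == 0)
                    return -1;
            window_free = e[0].bytes;
            for (i = 1; window_free < want; i++) {
                    if (i == n)
                            return -1;              /* ran out of extents */
                    if (e[i].offset - e[start].offset > want * 2) {
                            start = i;              /* window too sparse: restart */
                            window_free = e[i].bytes;
                    } else {
                            window_free += e[i].bytes;
                    }
            }
            *last = i - 1;
            return (long)start;
    }

    int main(void)
    {
            struct free_space e[] = { { 0, 1024 }, { 2048, 1024 }, { 3072, 2048 } };
            size_t last;

            assert(find_window(e, 3, 4096, &last) == 0 && last == 2);
            return 0;
    }
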
708/*
709 * simple code to zero out a cluster
710 */
711void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
712{
713 spin_lock_init(&cluster->lock);
714 spin_lock_init(&cluster->refill_lock);
715 cluster->root.rb_node = NULL;
716 cluster->max_size = 0;
717 INIT_LIST_HEAD(&cluster->block_group_list);
718 cluster->block_group = NULL;
719}
720
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d833c9..9abbced1123d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
79 } 79 }
80 path = btrfs_alloc_path(); 80 path = btrfs_alloc_path();
81 BUG_ON(!path); 81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); 82 search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start; 83 search_key.objectid = search_start;
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
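
The one-character fix above exists because the kernel's max() macro is type-strict: search_start is u64 while BTRFS_FIRST_FREE_OBJECTID expands to a constant of a different integer type, and the macro's pointer-comparison trick turns that mismatch into a compiler warning. A userspace sketch of a max() in the same style (GNU C statement expressions, as the kernel uses; the 256 value mirrors BTRFS_FIRST_FREE_OBJECTID of this era, but treat it as illustrative):

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t u64;

    /* kernel-style type-strict max: if x and y have different types, the
     * pointer comparison below is between incompatible pointer types and
     * the compiler warns -- exactly the warning the (u64) cast silences */
    #define strict_max(x, y) ({                 \
            __typeof__(x) x_ = (x);             \
            __typeof__(y) y_ = (y);             \
            (void)(&x_ == &y_);                 \
            x_ > y_ ? x_ : y_; })

    #define FIRST_FREE_OBJECTID 256ULL  /* stand-in for the btrfs constant */

    int main(void)
    {
            u64 search_start = 42;

            /* on LP64, u64 is unsigned long but 256ULL is unsigned long
             * long: without the cast, &x_ == &y_ mixes pointer types */
            u64 start = strict_max(search_start, (u64)FIRST_FREE_OBJECTID);

            assert(start == 256);
            return 0;
    }
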
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..1c8b0190d031 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
70static struct kmem_cache *btrfs_inode_cachep; 70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
75 74
76#define S_SHIFT 12 75#define S_SHIFT 12
@@ -134,6 +133,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
134 if (!path) 133 if (!path)
135 return -ENOMEM; 134 return -ENOMEM;
136 135
136 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode); 137 btrfs_set_trans_block_group(trans, inode);
138 138
139 key.objectid = inode->i_ino; 139 key.objectid = inode->i_ino;
@@ -167,9 +167,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
167 cur_size = min_t(unsigned long, compressed_size, 167 cur_size = min_t(unsigned long, compressed_size,
168 PAGE_CACHE_SIZE); 168 PAGE_CACHE_SIZE);
169 169
170 kaddr = kmap(cpage); 170 kaddr = kmap_atomic(cpage, KM_USER0);
171 write_extent_buffer(leaf, kaddr, ptr, cur_size); 171 write_extent_buffer(leaf, kaddr, ptr, cur_size);
172 kunmap(cpage); 172 kunmap_atomic(kaddr, KM_USER0);
173 173
174 i++; 174 i++;
175 ptr += cur_size; 175 ptr += cur_size;
@@ -204,7 +204,7 @@ fail:
204 * does the checks required to make sure the data is small enough 204 * does the checks required to make sure the data is small enough
205 * to fit as an inline extent. 205 * to fit as an inline extent.
206 */ 206 */
207static int cow_file_range_inline(struct btrfs_trans_handle *trans, 207static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
208 struct btrfs_root *root, 208 struct btrfs_root *root,
209 struct inode *inode, u64 start, u64 end, 209 struct inode *inode, u64 start, u64 end,
210 size_t compressed_size, 210 size_t compressed_size,
@@ -233,7 +233,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
233 } 233 }
234 234
235 ret = btrfs_drop_extents(trans, root, inode, start, 235 ret = btrfs_drop_extents(trans, root, inode, start,
236 aligned_end, start, &hint_byte); 236 aligned_end, aligned_end, start, &hint_byte);
237 BUG_ON(ret); 237 BUG_ON(ret);
238 238
239 if (isize > actual_end) 239 if (isize > actual_end)
@@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
854 u64 cur_end; 854 u64 cur_end;
855 int limit = 10 * 1024 * 1042; 855 int limit = 10 * 1024 * 1042;
856 856
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 857 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 858 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while (start < end) { 859 while (start < end) {
@@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
935 * If no cow copies or snapshots exist, we write directly to the existing 930 * If no cow copies or snapshots exist, we write directly to the existing
936 * blocks on disk 931 * blocks on disk
937 */ 932 */
938static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 933static noinline int run_delalloc_nocow(struct inode *inode,
934 struct page *locked_page,
939 u64 start, u64 end, int *page_started, int force, 935 u64 start, u64 end, int *page_started, int force,
940 unsigned long *nr_written) 936 unsigned long *nr_written)
941{ 937{
@@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1133 unsigned long *nr_written) 1129 unsigned long *nr_written)
1134{ 1130{
1135 int ret; 1131 int ret;
1132 struct btrfs_root *root = BTRFS_I(inode)->root;
1136 1133
1137 if (btrfs_test_flag(inode, NODATACOW)) 1134 if (btrfs_test_flag(inode, NODATACOW))
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1135 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1140 else if (btrfs_test_flag(inode, PREALLOC)) 1137 else if (btrfs_test_flag(inode, PREALLOC))
1141 ret = run_delalloc_nocow(inode, locked_page, start, end, 1138 ret = run_delalloc_nocow(inode, locked_page, start, end,
1142 page_started, 0, nr_written); 1139 page_started, 0, nr_written);
1140 else if (!btrfs_test_opt(root, COMPRESS))
1141 ret = cow_file_range(inode, locked_page, start, end,
1142 page_started, nr_written, 1);
1143 else 1143 else
1144 ret = cow_file_range_async(inode, locked_page, start, end, 1144 ret = cow_file_range_async(inode, locked_page, start, end,
1145 page_started, nr_written); 1145 page_started, nr_written);
1146
1147 return ret; 1146 return ret;
1148} 1147}
1149 1148
@@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1439 struct inode *inode, u64 file_pos, 1438 struct inode *inode, u64 file_pos,
1440 u64 disk_bytenr, u64 disk_num_bytes, 1439 u64 disk_bytenr, u64 disk_num_bytes,
1441 u64 num_bytes, u64 ram_bytes, 1440 u64 num_bytes, u64 ram_bytes,
1441 u64 locked_end,
1442 u8 compression, u8 encryption, 1442 u8 compression, u8 encryption,
1443 u16 other_encoding, int extent_type) 1443 u16 other_encoding, int extent_type)
1444{ 1444{
@@ -1453,8 +1453,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1453 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1454 BUG_ON(!path); 1454 BUG_ON(!path);
1455 1455
1456 path->leave_spinning = 1;
1456 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1457 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, locked_end,
1459 file_pos, &hint);
1458 BUG_ON(ret); 1460 BUG_ON(ret);
1459 1461
1460 ins.objectid = inode->i_ino; 1462 ins.objectid = inode->i_ino;
@@ -1475,6 +1477,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1475 btrfs_set_file_extent_compression(leaf, fi, compression); 1477 btrfs_set_file_extent_compression(leaf, fi, compression);
1476 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1478 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1477 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1479 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1480
1481 btrfs_unlock_up_safe(path, 1);
1482 btrfs_set_lock_blocking(leaf);
1483
1478 btrfs_mark_buffer_dirty(leaf); 1484 btrfs_mark_buffer_dirty(leaf);
1479 1485
1480 inode_add_bytes(inode, num_bytes); 1486 inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1493,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1487 root->root_key.objectid, 1493 root->root_key.objectid,
1488 trans->transid, inode->i_ino, &ins); 1494 trans->transid, inode->i_ino, &ins);
1489 BUG_ON(ret); 1495 BUG_ON(ret);
1490
1491 btrfs_free_path(path); 1496 btrfs_free_path(path);
1497
1492 return 0; 1498 return 0;
1493} 1499}
1494 1500
1501/*
1502 * helper function for btrfs_finish_ordered_io, this
1503 * just reads in some of the csum leaves to prime them into ram
1504 * before we start the transaction. It limits the amount of btree
1505 * reads required while inside the transaction.
1506 */
1507static noinline void reada_csum(struct btrfs_root *root,
1508 struct btrfs_path *path,
1509 struct btrfs_ordered_extent *ordered_extent)
1510{
1511 struct btrfs_ordered_sum *sum;
1512 u64 bytenr;
1513
1514 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1515 list);
1516 bytenr = sum->sums[0].bytenr;
1517
1518 /*
 1519 * we don't care about the results; the point of this search is
1520 * just to get the btree leaves into ram
1521 */
1522 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1523}
1524
1495/* as ordered data IO finishes, this gets called so we can finish 1525/* as ordered data IO finishes, this gets called so we can finish
1496 * an ordered extent if the range of bytes in the file it covers are 1526 * an ordered extent if the range of bytes in the file it covers are
1497 * fully written. 1527 * fully written.
@@ -1500,8 +1530,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1500{ 1530{
1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1531 struct btrfs_root *root = BTRFS_I(inode)->root;
1502 struct btrfs_trans_handle *trans; 1532 struct btrfs_trans_handle *trans;
1503 struct btrfs_ordered_extent *ordered_extent; 1533 struct btrfs_ordered_extent *ordered_extent = NULL;
1504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1534 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1535 struct btrfs_path *path;
1505 int compressed = 0; 1536 int compressed = 0;
1506 int ret; 1537 int ret;
1507 1538
@@ -1509,9 +1540,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1509 if (!ret) 1540 if (!ret)
1510 return 0; 1541 return 0;
1511 1542
1543 /*
1544 * before we join the transaction, try to do some of our IO.
1545 * This will limit the amount of IO that we have to do with
1546 * the transaction running. We're unlikely to need to do any
 1547 * IO if the file extents are new; the disk_i_size check
 1548 * covers the most common case.
1549 */
1550 if (start < BTRFS_I(inode)->disk_i_size) {
1551 path = btrfs_alloc_path();
1552 if (path) {
1553 ret = btrfs_lookup_file_extent(NULL, root, path,
1554 inode->i_ino,
1555 start, 0);
1556 ordered_extent = btrfs_lookup_ordered_extent(inode,
1557 start);
1558 if (!list_empty(&ordered_extent->list)) {
1559 btrfs_release_path(root, path);
1560 reada_csum(root, path, ordered_extent);
1561 }
1562 btrfs_free_path(path);
1563 }
1564 }
1565
1512 trans = btrfs_join_transaction(root, 1); 1566 trans = btrfs_join_transaction(root, 1);
1513 1567
1514 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1568 if (!ordered_extent)
1569 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1515 BUG_ON(!ordered_extent); 1570 BUG_ON(!ordered_extent);
1516 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1571 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1517 goto nocow; 1572 goto nocow;
@@ -1536,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1536 ordered_extent->disk_len, 1591 ordered_extent->disk_len,
1537 ordered_extent->len, 1592 ordered_extent->len,
1538 ordered_extent->len, 1593 ordered_extent->len,
1594 ordered_extent->file_offset +
1595 ordered_extent->len,
1539 compressed, 0, 0, 1596 compressed, 0, 0,
1540 BTRFS_FILE_EXTENT_REG); 1597 BTRFS_FILE_EXTENT_REG);
1541 BUG_ON(ret); 1598 BUG_ON(ret);
@@ -1765,10 +1822,12 @@ good:
1765 return 0; 1822 return 0;
1766 1823
1767zeroit: 1824zeroit:
1768 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 1825 if (printk_ratelimit()) {
1769 "private %llu\n", page->mapping->host->i_ino, 1826 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 (unsigned long long)start, csum, 1827 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)private); 1828 (unsigned long long)start, csum,
1829 (unsigned long long)private);
1830 }
1772 memset(kaddr + offset, 1, end - start + 1); 1831 memset(kaddr + offset, 1, end - start + 1);
1773 flush_dcache_page(page); 1832 flush_dcache_page(page);
1774 kunmap_atomic(kaddr, KM_USER0); 1833 kunmap_atomic(kaddr, KM_USER0);
@@ -1957,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1957} 2016}
1958 2017
1959/* 2018/*
2019 * very simple check to peek ahead in the leaf looking for xattrs. If we
2020 * don't find any xattrs, we know there can't be any acls.
2021 *
2022 * slot is the slot the inode is in, objectid is the objectid of the inode
2023 */
2024static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2025 int slot, u64 objectid)
2026{
2027 u32 nritems = btrfs_header_nritems(leaf);
2028 struct btrfs_key found_key;
2029 int scanned = 0;
2030
2031 slot++;
2032 while (slot < nritems) {
2033 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2034
2035 /* we found a different objectid, there must not be acls */
2036 if (found_key.objectid != objectid)
2037 return 0;
2038
2039 /* we found an xattr, assume we've got an acl */
2040 if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2041 return 1;
2042
2043 /*
 2044 * we found a key greater than an xattr key, so there can't
2045 * be any acls later on
2046 */
2047 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2048 return 0;
2049
2050 slot++;
2051 scanned++;
2052
2053 /*
2054 * it goes inode, inode backrefs, xattrs, extents,
2055 * so if there are a ton of hard links to an inode there can
 2056 * be a lot of backrefs. Don't waste time searching too hard;
 2057 * this is just an optimization
2058 */
2059 if (scanned >= 8)
2060 break;
2061 }
2062 /* we hit the end of the leaf before we found an xattr or
2063 * something larger than an xattr. We have to assume the inode
2064 * has acls
2065 */
2066 return 1;
2067}
2068
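
acls_after_inode_item leans on btree key ordering inside a leaf: one inode's items are sorted by (objectid, type), so a short forward scan can prove "no xattrs, hence no ACLs" without ever touching the xattr code. A userspace sketch over a sorted key array (the key-type constant is assumed to match BTRFS_XATTR_ITEM_KEY; the bounded-scan cutoff of 8 follows the code above):

    #include <assert.h>
    #include <stdint.h>

    struct key { uint64_t objectid; uint8_t type; };

    #define XATTR_ITEM_KEY 24       /* assumed BTRFS_XATTR_ITEM_KEY value */

    /* 0: provably no xattrs for this inode; 1: found one or gave up */
    static int may_have_acls(const struct key *k, int nritems,
                             int slot, uint64_t objectid)
    {
            int scanned = 0;

            for (slot++; slot < nritems && scanned < 8; slot++, scanned++) {
                    if (k[slot].objectid != objectid)
                            return 0;       /* next inode's items start here */
                    if (k[slot].type == XATTR_ITEM_KEY)
                            return 1;       /* found an xattr: ACLs possible */
                    if (k[slot].type > XATTR_ITEM_KEY)
                            return 0;       /* sorted past where xattrs live */
            }
            return 1;                       /* ran off the leaf: assume ACLs */
    }

    int main(void)
    {
            struct key leaf[] = { { 5, 1 }, { 5, 12 }, { 6, 1 } };

            /* inode 5's items end before any xattr key appears */
            assert(may_have_acls(leaf, 3, 0, 5) == 0);
            return 0;
    }
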
2069/*
1960 * read an inode from the btree into the in-memory inode 2070 * read an inode from the btree into the in-memory inode
1961 */ 2071 */
1962void btrfs_read_locked_inode(struct inode *inode) 2072void btrfs_read_locked_inode(struct inode *inode)
@@ -1967,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
1967 struct btrfs_timespec *tspec; 2077 struct btrfs_timespec *tspec;
1968 struct btrfs_root *root = BTRFS_I(inode)->root; 2078 struct btrfs_root *root = BTRFS_I(inode)->root;
1969 struct btrfs_key location; 2079 struct btrfs_key location;
2080 int maybe_acls;
1970 u64 alloc_group_block; 2081 u64 alloc_group_block;
1971 u32 rdev; 2082 u32 rdev;
1972 int ret; 2083 int ret;
@@ -2013,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 2124
2014 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2125 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2015 2126
2127 /*
2128 * try to precache a NULL acl entry for files that don't have
2129 * any xattrs or acls
2130 */
2131 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2132 if (!maybe_acls) {
2133 BTRFS_I(inode)->i_acl = NULL;
2134 BTRFS_I(inode)->i_default_acl = NULL;
2135 }
2136
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2137 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2138 alloc_group_block, 0);
2018 btrfs_free_path(path); 2139 btrfs_free_path(path);
@@ -2101,6 +2222,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2101 2222
2102 path = btrfs_alloc_path(); 2223 path = btrfs_alloc_path();
2103 BUG_ON(!path); 2224 BUG_ON(!path);
2225 path->leave_spinning = 1;
2104 ret = btrfs_lookup_inode(trans, root, path, 2226 ret = btrfs_lookup_inode(trans, root, path,
2105 &BTRFS_I(inode)->location, 1); 2227 &BTRFS_I(inode)->location, 1);
2106 if (ret) { 2228 if (ret) {
@@ -2147,6 +2269,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2147 goto err; 2269 goto err;
2148 } 2270 }
2149 2271
2272 path->leave_spinning = 1;
2150 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2273 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2151 name, name_len, -1); 2274 name, name_len, -1);
2152 if (IS_ERR(di)) { 2275 if (IS_ERR(di)) {
@@ -2190,8 +2313,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2190 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2313 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2191 inode, dir->i_ino); 2314 inode, dir->i_ino);
2192 BUG_ON(ret != 0 && ret != -ENOENT); 2315 BUG_ON(ret != 0 && ret != -ENOENT);
2193 if (ret != -ENOENT)
2194 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2195 2316
2196 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2317 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2197 dir, index); 2318 dir, index);
@@ -2224,6 +2345,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2224 trans = btrfs_start_transaction(root, 1); 2345 trans = btrfs_start_transaction(root, 1);
2225 2346
2226 btrfs_set_trans_block_group(trans, dir); 2347 btrfs_set_trans_block_group(trans, dir);
2348
2349 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2350
2227 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2351 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2228 dentry->d_name.name, dentry->d_name.len); 2352 dentry->d_name.name, dentry->d_name.len);
2229 2353
@@ -2498,6 +2622,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2498 key.type = (u8)-1; 2622 key.type = (u8)-1;
2499 2623
2500search_again: 2624search_again:
2625 path->leave_spinning = 1;
2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2626 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2502 if (ret < 0) 2627 if (ret < 0)
2503 goto error; 2628 goto error;
@@ -2644,6 +2769,7 @@ delete:
2644 break; 2769 break;
2645 } 2770 }
2646 if (found_extent) { 2771 if (found_extent) {
2772 btrfs_set_path_blocking(path);
2647 ret = btrfs_free_extent(trans, root, extent_start, 2773 ret = btrfs_free_extent(trans, root, extent_start,
2648 extent_num_bytes, 2774 extent_num_bytes,
2649 leaf->start, root_owner, 2775 leaf->start, root_owner,
@@ -2818,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2818 err = btrfs_drop_extents(trans, root, inode, 2944 err = btrfs_drop_extents(trans, root, inode,
2819 cur_offset, 2945 cur_offset,
2820 cur_offset + hole_size, 2946 cur_offset + hole_size,
2947 block_end,
2821 cur_offset, &hint_byte); 2948 cur_offset, &hint_byte);
2822 if (err) 2949 if (err)
2823 break; 2950 break;
@@ -2848,11 +2975,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2848 if (err) 2975 if (err)
2849 return err; 2976 return err;
2850 2977
2851 if (S_ISREG(inode->i_mode) && 2978 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2852 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2979 if (attr->ia_size > inode->i_size) {
2853 err = btrfs_cont_expand(inode, attr->ia_size); 2980 err = btrfs_cont_expand(inode, attr->ia_size);
2854 if (err) 2981 if (err)
2855 return err; 2982 return err;
2983 } else if (inode->i_size > 0 &&
2984 attr->ia_size == 0) {
2985
2986 /* we're truncating a file that used to have good
2987 * data down to zero. Make sure it gets into
2988 * the ordered flush list so that any new writes
2989 * get down to disk quickly.
2990 */
2991 BTRFS_I(inode)->ordered_data_close = 1;
2992 }
2856 } 2993 }
2857 2994
2858 err = inode_setattr(inode, attr); 2995 err = inode_setattr(inode, attr);
@@ -2972,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode)
2972{ 3109{
2973 struct btrfs_inode *bi = BTRFS_I(inode); 3110 struct btrfs_inode *bi = BTRFS_I(inode);
2974 3111
2975 bi->i_acl = NULL; 3112 bi->i_acl = BTRFS_ACL_NOT_CACHED;
2976 bi->i_default_acl = NULL; 3113 bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
2977 3114
2978 bi->generation = 0; 3115 bi->generation = 0;
2979 bi->sequence = 0; 3116 bi->sequence = 0;
@@ -2984,13 +3121,15 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3121 bi->disk_i_size = 0;
2985 bi->flags = 0; 3122 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3123 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3124 bi->last_unlink_trans = 0;
3125 bi->ordered_data_close = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3126 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3127 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3128 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3129 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3130 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3134 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3135 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3411,8 +3550,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3411 3550
3412 if (dir) { 3551 if (dir) {
3413 ret = btrfs_set_inode_index(dir, index); 3552 ret = btrfs_set_inode_index(dir, index);
3414 if (ret) 3553 if (ret) {
3554 iput(inode);
3415 return ERR_PTR(ret); 3555 return ERR_PTR(ret);
3556 }
3416 } 3557 }
3417 /* 3558 /*
3418 * index_cnt is ignored for everything but a dir, 3559 * index_cnt is ignored for everything but a dir,
@@ -3449,6 +3590,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3590 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3591 sizes[1] = name_len + sizeof(*ref);
3451 3592
3593 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3594 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3595 if (ret != 0)
3454 goto fail; 3596 goto fail;
@@ -3494,6 +3636,7 @@ fail:
3494 if (dir) 3636 if (dir)
3495 BTRFS_I(dir)->index_cnt--; 3637 BTRFS_I(dir)->index_cnt--;
3496 btrfs_free_path(path); 3638 btrfs_free_path(path);
3639 iput(inode);
3497 return ERR_PTR(ret); 3640 return ERR_PTR(ret);
3498} 3641}
3499 3642
@@ -3727,6 +3870,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3870 drop_inode = 1;
3728 3871
3729 nr = trans->blocks_used; 3872 nr = trans->blocks_used;
3873
3874 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3875 btrfs_end_transaction_throttle(trans, root);
3731fail: 3876fail:
3732 if (drop_inode) { 3877 if (drop_inode) {
@@ -4151,7 +4296,6 @@ out:
4151 } 4296 }
4152 if (err) { 4297 if (err) {
4153 free_extent_map(em); 4298 free_extent_map(em);
4154 WARN_ON(1);
4155 return ERR_PTR(err); 4299 return ERR_PTR(err);
4156 } 4300 }
4157 return em; 4301 return em;
@@ -4292,8 +4436,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4292 * beyond EOF, then the page is guaranteed safe against truncation until we 4436 * beyond EOF, then the page is guaranteed safe against truncation until we
4293 * unlock the page. 4437 * unlock the page.
4294 */ 4438 */
4295int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4439int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4296{ 4440{
4441 struct page *page = vmf->page;
4297 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4442 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4298 struct btrfs_root *root = BTRFS_I(inode)->root; 4443 struct btrfs_root *root = BTRFS_I(inode)->root;
4299 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4444 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4451,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4306 u64 page_end; 4451 u64 page_end;
4307 4452
4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4453 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4309 if (ret) 4454 if (ret) {
4455 if (ret == -ENOMEM)
4456 ret = VM_FAULT_OOM;
4457 else /* -ENOSPC, -EIO, etc */
4458 ret = VM_FAULT_SIGBUS;
4310 goto out; 4459 goto out;
4460 }
4311 4461
4312 ret = -EINVAL; 4462 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4313again: 4463again:
4314 lock_page(page); 4464 lock_page(page);
4315 size = i_size_read(inode); 4465 size = i_size_read(inode);
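
This hunk exists because ->page_mkwrite handlers report results as VM_FAULT_* codes, not errnos: -ENOMEM becomes VM_FAULT_OOM, any other failure becomes VM_FAULT_SIGBUS, and the default VM_FAULT_NOPAGE tells the VM to retry the fault. A sketch of the translation (userspace; the constants are defined locally so it compiles, with values that are illustrative stand-ins):

    #include <assert.h>
    #include <errno.h>

    /* illustrative stand-ins for the kernel's VM_FAULT_* bits */
    #define VM_FAULT_OOM    0x0001
    #define VM_FAULT_SIGBUS 0x0002
    #define VM_FAULT_NOPAGE 0x0100

    /* fault handlers must speak VM_FAULT codes, not errnos */
    static int errno_to_fault(int err)
    {
            if (err == -ENOMEM)
                    return VM_FAULT_OOM;     /* let the OOM machinery decide */
            if (err)
                    return VM_FAULT_SIGBUS;  /* -ENOSPC, -EIO, ...: signal the writer */
            return VM_FAULT_NOPAGE;          /* success path: have the VM retry */
    }

    int main(void)
    {
            assert(errno_to_fault(-ENOMEM) == VM_FAULT_OOM);
            assert(errno_to_fault(-ENOSPC) == VM_FAULT_SIGBUS);
            return 0;
    }
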
@@ -4357,6 +4507,8 @@ again:
4357 } 4507 }
4358 ClearPageChecked(page); 4508 ClearPageChecked(page);
4359 set_page_dirty(page); 4509 set_page_dirty(page);
4510
4511 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4512 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4361 4513
4362out_unlock: 4514out_unlock:
@@ -4382,6 +4534,27 @@ static void btrfs_truncate(struct inode *inode)
4382 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4534 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4383 4535
4384 trans = btrfs_start_transaction(root, 1); 4536 trans = btrfs_start_transaction(root, 1);
4537
4538 /*
4539 * setattr is responsible for setting the ordered_data_close flag,
4540 * but that is only tested during the last file release. That
4541 * could happen well after the next commit, leaving a great big
4542 * window where new writes may get lost if someone chooses to write
4543 * to this file after truncating to zero
4544 *
4545 * The inode doesn't have any dirty data here, and so if we commit
4546 * this is a noop. If someone immediately starts writing to the inode
4547 * it is very likely we'll catch some of their writes in this
4548 * transaction, and the commit will find this file on the ordered
4549 * data list with good things to send down.
4550 *
 4551 * This is a best-effort solution; there is still a window where
4552 * using truncate to replace the contents of the file will
4553 * end up with a zero length file after a crash.
4554 */
4555 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4556 btrfs_add_ordered_operation(trans, root, inode);
4557
4385 btrfs_set_trans_block_group(trans, inode); 4558 btrfs_set_trans_block_group(trans, inode);
4386 btrfs_i_size_write(inode, inode->i_size); 4559 btrfs_i_size_write(inode, inode->i_size);
4387 4560
@@ -4458,12 +4631,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4458 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4631 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4459 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4632 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4460 INIT_LIST_HEAD(&ei->i_orphan); 4633 INIT_LIST_HEAD(&ei->i_orphan);
4634 INIT_LIST_HEAD(&ei->ordered_operations);
4461 return &ei->vfs_inode; 4635 return &ei->vfs_inode;
4462} 4636}
4463 4637
4464void btrfs_destroy_inode(struct inode *inode) 4638void btrfs_destroy_inode(struct inode *inode)
4465{ 4639{
4466 struct btrfs_ordered_extent *ordered; 4640 struct btrfs_ordered_extent *ordered;
4641 struct btrfs_root *root = BTRFS_I(inode)->root;
4642
4467 WARN_ON(!list_empty(&inode->i_dentry)); 4643 WARN_ON(!list_empty(&inode->i_dentry));
4468 WARN_ON(inode->i_data.nrpages); 4644 WARN_ON(inode->i_data.nrpages);
4469 4645
@@ -4474,13 +4650,24 @@ void btrfs_destroy_inode(struct inode *inode)
4474 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4650 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4475 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4651 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4476 4652
4477 spin_lock(&BTRFS_I(inode)->root->list_lock); 4653 /*
4654 * Make sure we're properly removed from the ordered operation
4655 * lists.
4656 */
4657 smp_mb();
4658 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4659 spin_lock(&root->fs_info->ordered_extent_lock);
4660 list_del_init(&BTRFS_I(inode)->ordered_operations);
4661 spin_unlock(&root->fs_info->ordered_extent_lock);
4662 }
4663
4664 spin_lock(&root->list_lock);
4478 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4665 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4479 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4666 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4480 " list\n", inode->i_ino); 4667 " list\n", inode->i_ino);
4481 dump_stack(); 4668 dump_stack();
4482 } 4669 }
4483 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4670 spin_unlock(&root->list_lock);
4484 4671
4485 while (1) { 4672 while (1) {
4486 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4673 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4515,47 +4702,36 @@ void btrfs_destroy_cachep(void)
4515 kmem_cache_destroy(btrfs_trans_handle_cachep); 4702 kmem_cache_destroy(btrfs_trans_handle_cachep);
4516 if (btrfs_transaction_cachep) 4703 if (btrfs_transaction_cachep)
4517 kmem_cache_destroy(btrfs_transaction_cachep); 4704 kmem_cache_destroy(btrfs_transaction_cachep);
4518 if (btrfs_bit_radix_cachep)
4519 kmem_cache_destroy(btrfs_bit_radix_cachep);
4520 if (btrfs_path_cachep) 4705 if (btrfs_path_cachep)
4521 kmem_cache_destroy(btrfs_path_cachep); 4706 kmem_cache_destroy(btrfs_path_cachep);
4522} 4707}
4523 4708
4524struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4525 unsigned long extra_flags,
4526 void (*ctor)(void *))
4527{
4528 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4529 SLAB_MEM_SPREAD | extra_flags), ctor);
4530}
4531
4532int btrfs_init_cachep(void) 4709int btrfs_init_cachep(void)
4533{ 4710{
4534 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", 4711 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
4535 sizeof(struct btrfs_inode), 4712 sizeof(struct btrfs_inode), 0,
4536 0, init_once); 4713 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
4537 if (!btrfs_inode_cachep) 4714 if (!btrfs_inode_cachep)
4538 goto fail; 4715 goto fail;
4539 btrfs_trans_handle_cachep = 4716
4540 btrfs_cache_create("btrfs_trans_handle_cache", 4717 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
4541 sizeof(struct btrfs_trans_handle), 4718 sizeof(struct btrfs_trans_handle), 0,
4542 0, NULL); 4719 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4543 if (!btrfs_trans_handle_cachep) 4720 if (!btrfs_trans_handle_cachep)
4544 goto fail; 4721 goto fail;
4545 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", 4722
4546 sizeof(struct btrfs_transaction), 4723 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
4547 0, NULL); 4724 sizeof(struct btrfs_transaction), 0,
4725 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4548 if (!btrfs_transaction_cachep) 4726 if (!btrfs_transaction_cachep)
4549 goto fail; 4727 goto fail;
4550 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", 4728
4551 sizeof(struct btrfs_path), 4729 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
4552 0, NULL); 4730 sizeof(struct btrfs_path), 0,
4731 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4553 if (!btrfs_path_cachep) 4732 if (!btrfs_path_cachep)
4554 goto fail; 4733 goto fail;
4555 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, 4734
4556 SLAB_DESTROY_BY_RCU, NULL);
4557 if (!btrfs_bit_radix_cachep)
4558 goto fail;
4559 return 0; 4735 return 0;
4560fail: 4736fail:
4561 btrfs_destroy_cachep(); 4737 btrfs_destroy_cachep();
@@ -4605,8 +4781,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4605 if (ret) 4781 if (ret)
4606 goto out_unlock; 4782 goto out_unlock;
4607 4783
4784 /*
 4785 * we're using rename to replace one file with another,
 4786 * and the replacement file is large. Start IO on it now so
4787 * we don't add too much work to the end of the transaction
4788 */
4789 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4790 new_inode->i_size &&
4791 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4792 filemap_flush(old_inode->i_mapping);
4793
4608 trans = btrfs_start_transaction(root, 1); 4794 trans = btrfs_start_transaction(root, 1);
4609 4795
4796 /*
4797 * make sure the inode gets flushed if it is replacing
4798 * something.
4799 */
4800 if (new_inode && new_inode->i_size &&
4801 old_inode && S_ISREG(old_inode->i_mode)) {
4802 btrfs_add_ordered_operation(trans, root, old_inode);
4803 }
4804
4805 /*
4806 * this is an ugly little race, but the rename is required to make
4807 * sure that if we crash, the inode is either at the old name
4808 * or the new one. pinning the log transaction lets us make sure
4809 * we don't allow a log commit to come in after we unlink the
4810 * name but before we add the new name back in.
4811 */
4812 btrfs_pin_log_trans(root);
4813
4610 btrfs_set_trans_block_group(trans, new_dir); 4814 btrfs_set_trans_block_group(trans, new_dir);
4611 4815
4612 btrfs_inc_nlink(old_dentry->d_inode); 4816 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4818,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4614 new_dir->i_ctime = new_dir->i_mtime = ctime; 4818 new_dir->i_ctime = new_dir->i_mtime = ctime;
4615 old_inode->i_ctime = ctime; 4819 old_inode->i_ctime = ctime;
4616 4820
4821 if (old_dentry->d_parent != new_dentry->d_parent)
4822 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4823
4617 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4824 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4618 old_dentry->d_name.name, 4825 old_dentry->d_name.name,
4619 old_dentry->d_name.len); 4826 old_dentry->d_name.len);
@@ -4645,7 +4852,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4645 if (ret) 4852 if (ret)
4646 goto out_fail; 4853 goto out_fail;
4647 4854
4855 btrfs_log_new_name(trans, old_inode, old_dir,
4856 new_dentry->d_parent);
4648out_fail: 4857out_fail:
4858
4859 /* this btrfs_end_log_trans just allows the current
4860 * log-sub transaction to complete
4861 */
4862 btrfs_end_log_trans(root);
4649 btrfs_end_transaction_throttle(trans, root); 4863 btrfs_end_transaction_throttle(trans, root);
4650out_unlock: 4864out_unlock:
4651 return ret; 4865 return ret;
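Both additions above serve the replace-by-rename pattern that editors and package managers rely on. A minimal sketch of that pattern from userspace (file names are illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* write the new contents to a temp file, then atomically swap it in;
     * after the rename, a crash must leave either the old or the new data
     */
    static int replace_by_rename(const char *tmp, const char *target,
                                 const char *buf)
    {
            int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);

            if (fd < 0)
                    return -1;
            if (write(fd, buf, strlen(buf)) < 0 || fsync(fd) != 0) {
                    close(fd);
                    return -1;
            }
            close(fd);
            return rename(tmp, target);
    }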
@@ -4813,10 +5027,10 @@ out_fail:
4813 return err; 5027 return err;
4814} 5028}
4815 5029
4816static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 5030static int prealloc_file_range(struct btrfs_trans_handle *trans,
4817 u64 alloc_hint, int mode) 5031 struct inode *inode, u64 start, u64 end,
5032 u64 locked_end, u64 alloc_hint, int mode)
4818{ 5033{
4819 struct btrfs_trans_handle *trans;
4820 struct btrfs_root *root = BTRFS_I(inode)->root; 5034 struct btrfs_root *root = BTRFS_I(inode)->root;
4821 struct btrfs_key ins; 5035 struct btrfs_key ins;
4822 u64 alloc_size; 5036 u64 alloc_size;
@@ -4824,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4824 u64 num_bytes = end - start; 5038 u64 num_bytes = end - start;
4825 int ret = 0; 5039 int ret = 0;
4826 5040
4827 trans = btrfs_join_transaction(root, 1);
4828 BUG_ON(!trans);
4829 btrfs_set_trans_block_group(trans, inode);
4830
4831 while (num_bytes > 0) { 5041 while (num_bytes > 0) {
4832 alloc_size = min(num_bytes, root->fs_info->max_extent); 5042 alloc_size = min(num_bytes, root->fs_info->max_extent);
4833 ret = btrfs_reserve_extent(trans, root, alloc_size, 5043 ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -4840,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4840 ret = insert_reserved_file_extent(trans, inode, 5050 ret = insert_reserved_file_extent(trans, inode,
4841 cur_offset, ins.objectid, 5051 cur_offset, ins.objectid,
4842 ins.offset, ins.offset, 5052 ins.offset, ins.offset,
4843 ins.offset, 0, 0, 0, 5053 ins.offset, locked_end,
5054 0, 0, 0,
4844 BTRFS_FILE_EXTENT_PREALLOC); 5055 BTRFS_FILE_EXTENT_PREALLOC);
4845 BUG_ON(ret); 5056 BUG_ON(ret);
4846 num_bytes -= ins.offset; 5057 num_bytes -= ins.offset;
@@ -4858,7 +5069,6 @@ out:
4858 BUG_ON(ret); 5069 BUG_ON(ret);
4859 } 5070 }
4860 5071
4861 btrfs_end_transaction(trans, root);
4862 return ret; 5072 return ret;
4863} 5073}
4864 5074
@@ -4870,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4870 u64 alloc_start; 5080 u64 alloc_start;
4871 u64 alloc_end; 5081 u64 alloc_end;
4872 u64 alloc_hint = 0; 5082 u64 alloc_hint = 0;
5083 u64 locked_end;
4873 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5084 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4874 struct extent_map *em; 5085 struct extent_map *em;
5086 struct btrfs_trans_handle *trans;
4875 int ret; 5087 int ret;
4876 5088
4877 alloc_start = offset & ~mask; 5089 alloc_start = offset & ~mask;
4878 alloc_end = (offset + len + mask) & ~mask; 5090 alloc_end = (offset + len + mask) & ~mask;
4879 5091
5092 /*
5093 * wait for ordered IO before we have any locks. We'll loop again
5094 * below with the locks held.
5095 */
5096 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
5097
4880 mutex_lock(&inode->i_mutex); 5098 mutex_lock(&inode->i_mutex);
4881 if (alloc_start > inode->i_size) { 5099 if (alloc_start > inode->i_size) {
4882 ret = btrfs_cont_expand(inode, alloc_start); 5100 ret = btrfs_cont_expand(inode, alloc_start);
@@ -4884,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4884 goto out; 5102 goto out;
4885 } 5103 }
4886 5104
5105 locked_end = alloc_end - 1;
4887 while (1) { 5106 while (1) {
4888 struct btrfs_ordered_extent *ordered; 5107 struct btrfs_ordered_extent *ordered;
4889 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, 5108
4890 alloc_end - 1, GFP_NOFS); 5109 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5110 if (!trans) {
5111 ret = -EIO;
5112 goto out;
5113 }
5114
5115 /* the extent lock is ordered inside the running
5116 * transaction
5117 */
5118 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5119 GFP_NOFS);
4891 ordered = btrfs_lookup_first_ordered_extent(inode, 5120 ordered = btrfs_lookup_first_ordered_extent(inode,
4892 alloc_end - 1); 5121 alloc_end - 1);
4893 if (ordered && 5122 if (ordered &&
@@ -4895,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4895 ordered->file_offset < alloc_end) { 5124 ordered->file_offset < alloc_end) {
4896 btrfs_put_ordered_extent(ordered); 5125 btrfs_put_ordered_extent(ordered);
4897 unlock_extent(&BTRFS_I(inode)->io_tree, 5126 unlock_extent(&BTRFS_I(inode)->io_tree,
4898 alloc_start, alloc_end - 1, GFP_NOFS); 5127 alloc_start, locked_end, GFP_NOFS);
5128 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5129
5130 /*
5131 * we can't wait on the range with the transaction
5132 * running or with the extent lock held
5133 */
4899 btrfs_wait_ordered_range(inode, alloc_start, 5134 btrfs_wait_ordered_range(inode, alloc_start,
4900 alloc_end - alloc_start); 5135 alloc_end - alloc_start);
4901 } else { 5136 } else {
@@ -4913,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4913 last_byte = min(extent_map_end(em), alloc_end); 5148 last_byte = min(extent_map_end(em), alloc_end);
4914 last_byte = (last_byte + mask) & ~mask; 5149 last_byte = (last_byte + mask) & ~mask;
4915 if (em->block_start == EXTENT_MAP_HOLE) { 5150 if (em->block_start == EXTENT_MAP_HOLE) {
4916 ret = prealloc_file_range(inode, cur_offset, 5151 ret = prealloc_file_range(trans, inode, cur_offset,
4917 last_byte, alloc_hint, mode); 5152 last_byte, locked_end + 1,
5153 alloc_hint, mode);
4918 if (ret < 0) { 5154 if (ret < 0) {
4919 free_extent_map(em); 5155 free_extent_map(em);
4920 break; 5156 break;
@@ -4930,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4930 break; 5166 break;
4931 } 5167 }
4932 } 5168 }
4933 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, 5169 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
4934 GFP_NOFS); 5170 GFP_NOFS);
5171
5172 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
4935out: 5173out:
4936 mutex_unlock(&inode->i_mutex); 5174 mutex_unlock(&inode->i_mutex);
4937 return ret; 5175 return ret;
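With prealloc_file_range() now running inside the caller's transaction, the whole fallocate path sits under one extent-lock/transaction pairing. A userspace trigger for this path, assuming an illustrative mount point:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* preallocate 16MiB; holes in the range are filled by
             * prealloc_file_range() as BTRFS_FILE_EXTENT_PREALLOC extents
             */
            int fd = open("/mnt/btrfs/prealloc-test", O_RDWR | O_CREAT, 0644);

            if (fd < 0 || fallocate(fd, 0, 0, 16 * 1024 * 1024) != 0) {
                    perror("fallocate");
                    return 1;
            }
            close(fd);
            return 0;
    }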
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..2624b53ea783 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
@@ -437,10 +437,6 @@ out_unlock:
437 return 0; 437 return 0;
438} 438}
439 439
440/*
441 * Called inside transaction, so use GFP_NOFS
442 */
443
444static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) 440static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
445{ 441{
446 u64 new_size; 442 u64 new_size;
@@ -461,15 +457,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
461 if (!capable(CAP_SYS_ADMIN)) 457 if (!capable(CAP_SYS_ADMIN))
462 return -EPERM; 458 return -EPERM;
463 459
464 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 460 vol_args = memdup_user(arg, sizeof(*vol_args));
465 461 if (IS_ERR(vol_args))
466 if (!vol_args) 462 return PTR_ERR(vol_args);
467 return -ENOMEM;
468
469 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
470 ret = -EFAULT;
471 goto out;
472 }
473 463
474 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 464 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
475 namelen = strlen(vol_args->name); 465 namelen = strlen(vol_args->name);
@@ -483,11 +473,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
483 *devstr = '\0'; 473 *devstr = '\0';
484 devstr = vol_args->name; 474 devstr = vol_args->name;
485 devid = simple_strtoull(devstr, &end, 10); 475 devid = simple_strtoull(devstr, &end, 10);
486 printk(KERN_INFO "resizing devid %llu\n", devid); 476 printk(KERN_INFO "resizing devid %llu\n",
477 (unsigned long long)devid);
487 } 478 }
488 device = btrfs_find_device(root, devid, NULL, NULL); 479 device = btrfs_find_device(root, devid, NULL, NULL);
489 if (!device) { 480 if (!device) {
490 printk(KERN_INFO "resizer unable to find device %llu\n", devid); 481 printk(KERN_INFO "resizer unable to find device %llu\n",
482 (unsigned long long)devid);
491 ret = -EINVAL; 483 ret = -EINVAL;
492 goto out_unlock; 484 goto out_unlock;
493 } 485 }
@@ -545,7 +537,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
545 537
546out_unlock: 538out_unlock:
547 mutex_unlock(&root->fs_info->volume_mutex); 539 mutex_unlock(&root->fs_info->volume_mutex);
548out:
549 kfree(vol_args); 540 kfree(vol_args);
550 return ret; 541 return ret;
551} 542}
@@ -565,15 +556,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
565 if (root->fs_info->sb->s_flags & MS_RDONLY) 556 if (root->fs_info->sb->s_flags & MS_RDONLY)
566 return -EROFS; 557 return -EROFS;
567 558
568 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 559 vol_args = memdup_user(arg, sizeof(*vol_args));
569 560 if (IS_ERR(vol_args))
570 if (!vol_args) 561 return PTR_ERR(vol_args);
571 return -ENOMEM;
572
573 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
574 ret = -EFAULT;
575 goto out;
576 }
577 562
578 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 563 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
579 namelen = strlen(vol_args->name); 564 namelen = strlen(vol_args->name);
@@ -675,19 +660,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
675 if (!capable(CAP_SYS_ADMIN)) 660 if (!capable(CAP_SYS_ADMIN))
676 return -EPERM; 661 return -EPERM;
677 662
678 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 663 vol_args = memdup_user(arg, sizeof(*vol_args));
679 664 if (IS_ERR(vol_args))
680 if (!vol_args) 665 return PTR_ERR(vol_args);
681 return -ENOMEM;
682 666
683 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
684 ret = -EFAULT;
685 goto out;
686 }
687 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 667 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
688 ret = btrfs_init_new_device(root, vol_args->name); 668 ret = btrfs_init_new_device(root, vol_args->name);
689 669
690out:
691 kfree(vol_args); 670 kfree(vol_args);
692 return ret; 671 return ret;
693} 672}
@@ -703,19 +682,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
703 if (root->fs_info->sb->s_flags & MS_RDONLY) 682 if (root->fs_info->sb->s_flags & MS_RDONLY)
704 return -EROFS; 683 return -EROFS;
705 684
706 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 685 vol_args = memdup_user(arg, sizeof(*vol_args));
707 686 if (IS_ERR(vol_args))
708 if (!vol_args) 687 return PTR_ERR(vol_args);
709 return -ENOMEM;
710 688
711 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
712 ret = -EFAULT;
713 goto out;
714 }
715 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 689 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
716 ret = btrfs_rm_device(root, vol_args->name); 690 ret = btrfs_rm_device(root, vol_args->name);
717 691
718out:
719 kfree(vol_args); 692 kfree(vol_args);
720 return ret; 693 return ret;
721} 694}
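The ioctl conversions above all collapse the open-coded kmalloc()/copy_from_user() pair into memdup_user(), which also explains the dropped "use GFP_NOFS" comment: memdup_user() allocates with GFP_KERNEL. The resulting shape, sketched in isolation (the function name is illustrative):

    /* shape shared by the converted ioctls above */
    static long example_ioctl(struct btrfs_root *root, void __user *arg)
    {
            struct btrfs_ioctl_vol_args *vol_args;
            long ret;

            vol_args = memdup_user(arg, sizeof(*vol_args));
            if (IS_ERR(vol_args))
                    return PTR_ERR(vol_args);

            vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
            ret = 0; /* ... act on vol_args->name ... */

            kfree(vol_args);
            return ret;
    }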
@@ -830,7 +803,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
830 BUG_ON(!trans); 803 BUG_ON(!trans);
831 804
832 /* punch hole in destination first */ 805 /* punch hole in destination first */
833 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); 806 btrfs_drop_extents(trans, root, inode, off, off + len,
807 off + len, 0, &hint_byte);
834 808
835 /* clone data */ 809 /* clone data */
836 key.objectid = src->i_ino; 810 key.objectid = src->i_ino;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
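The reordered cpu_relax() calls and the new pre-spin in btrfs_tree_lock() all implement the same adaptive shape: spin a bounded number of times while the holder is likely to finish soon, then fall back to sleeping. A distilled userspace analog of that shape (not the btrfs code itself):

    #include <sched.h>
    #include <stdatomic.h>

    /* spin a bounded number of times on a "blocking" flag; return 1 if
     * the flag cleared (take the fast path), 0 if the caller should sleep
     */
    static int spin_on_flag(const atomic_int *blocking)
    {
            for (int i = 0; i < 512; i++) {
                    if (!atomic_load(blocking))
                            return 1;
                    sched_yield();  /* userspace stand-in for cpu_relax() */
            }
            return 0;
    }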
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
 387 * we have two modes here: one is to just start the IO via filemap_flush
 388 * and the other is to wait for all the IO. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
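The two modes map directly onto how the commit path (see the transaction.c hunks later in this diff) drives the function:

    /* condensed from btrfs_commit_transaction() below */
    btrfs_run_ordered_operations(root, 0);  /* early: just start the IO */
    /* ... transaction is marked blocked, no new entries can appear ... */
    btrfs_run_ordered_operations(root, 1);  /* later: wait for all of it */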
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -417,7 +489,7 @@ again:
417 /* start IO across the range first to instantiate any delalloc 489 /* start IO across the range first to instantiate any delalloc
418 * extents 490 * extents
419 */ 491 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
421 493
422 /* The compression code will leave pages locked but return from 494 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again 495 * writepage without setting the page writeback. Starting again
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
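The call sites added in this merge follow one shape; condensed from the btrfs_rename() hunk earlier in this diff:

    trans = btrfs_start_transaction(root, 1);
    /* make sure a replaced regular file is flushed at commit time */
    if (new_inode && new_inode->i_size &&
        old_inode && S_ISREG(old_inode->i_mode))
            btrfs_add_ordered_operation(trans, root, old_inode);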
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..2ff7cd2db25f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_ratio, Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,9 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
90 {Opt_ratio, "metadata_ratio=%d"},
86 {Opt_err, NULL}, 91 {Opt_err, NULL},
87}; 92};
88 93
@@ -191,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
191 info->max_extent = max_t(u64, 196 info->max_extent = max_t(u64,
192 info->max_extent, root->sectorsize); 197 info->max_extent, root->sectorsize);
193 printk(KERN_INFO "btrfs: max_extent at %llu\n", 198 printk(KERN_INFO "btrfs: max_extent at %llu\n",
194 info->max_extent); 199 (unsigned long long)info->max_extent);
195 } 200 }
196 break; 201 break;
197 case Opt_max_inline: 202 case Opt_max_inline:
@@ -206,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206 root->sectorsize); 211 root->sectorsize);
207 } 212 }
208 printk(KERN_INFO "btrfs: max_inline at %llu\n", 213 printk(KERN_INFO "btrfs: max_inline at %llu\n",
209 info->max_inline); 214 (unsigned long long)info->max_inline);
210 } 215 }
211 break; 216 break;
212 case Opt_alloc_start: 217 case Opt_alloc_start:
@@ -216,12 +221,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
216 kfree(num); 221 kfree(num);
217 printk(KERN_INFO 222 printk(KERN_INFO
218 "btrfs: allocations start at %llu\n", 223 "btrfs: allocations start at %llu\n",
219 info->alloc_start); 224 (unsigned long long)info->alloc_start);
220 } 225 }
221 break; 226 break;
222 case Opt_noacl: 227 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 228 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 229 break;
230 case Opt_notreelog:
231 printk(KERN_INFO "btrfs: disabling tree log\n");
232 btrfs_set_opt(info->mount_opt, NOTREELOG);
233 break;
234 case Opt_flushoncommit:
235 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
236 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
237 break;
238 case Opt_ratio:
239 intarg = 0;
240 match_int(&args[0], &intarg);
241 if (intarg) {
242 info->metadata_ratio = intarg;
243 printk(KERN_INFO "btrfs: metadata ratio %d\n",
244 info->metadata_ratio);
245 }
246 break;
225 default: 247 default:
226 break; 248 break;
227 } 249 }
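In practice the three new options are passed as ordinary mount flags, e.g. mount -o notreelog,flushoncommit,metadata_ratio=4 <device> <mountpoint> (device and mount point illustrative): notreelog disables the tree log so fsync falls back to full transaction commits, flushoncommit pushes delalloc data down at every commit, and metadata_ratio tunes how much metadata chunk space is kept relative to data.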
@@ -363,9 +385,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 385int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 386{
365 struct btrfs_trans_handle *trans; 387 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 388 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 389 int ret;
368 root = btrfs_sb(sb);
369 390
370 if (sb->s_flags & MS_RDONLY) 391 if (sb->s_flags & MS_RDONLY)
371 return 0; 392 return 0;
@@ -385,6 +406,44 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 406 return ret;
386} 407}
387 408
409static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
410{
411 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
412 struct btrfs_fs_info *info = root->fs_info;
413
414 if (btrfs_test_opt(root, DEGRADED))
415 seq_puts(seq, ",degraded");
416 if (btrfs_test_opt(root, NODATASUM))
417 seq_puts(seq, ",nodatasum");
418 if (btrfs_test_opt(root, NODATACOW))
419 seq_puts(seq, ",nodatacow");
420 if (btrfs_test_opt(root, NOBARRIER))
421 seq_puts(seq, ",nobarrier");
422 if (info->max_extent != (u64)-1)
423 seq_printf(seq, ",max_extent=%llu",
424 (unsigned long long)info->max_extent);
425 if (info->max_inline != 8192 * 1024)
426 seq_printf(seq, ",max_inline=%llu",
427 (unsigned long long)info->max_inline);
428 if (info->alloc_start != 0)
429 seq_printf(seq, ",alloc_start=%llu",
430 (unsigned long long)info->alloc_start);
431 if (info->thread_pool_size != min_t(unsigned long,
432 num_online_cpus() + 2, 8))
433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
434 if (btrfs_test_opt(root, COMPRESS))
435 seq_puts(seq, ",compress");
436 if (btrfs_test_opt(root, SSD))
437 seq_puts(seq, ",ssd");
438 if (btrfs_test_opt(root, NOTREELOG))
439 seq_puts(seq, ",notreelog");
440 if (btrfs_test_opt(root, FLUSHONCOMMIT))
441 seq_puts(seq, ",flushoncommit");
442 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
443 seq_puts(seq, ",noacl");
444 return 0;
445}
446
388static void btrfs_write_super(struct super_block *sb) 447static void btrfs_write_super(struct super_block *sb)
389{ 448{
390 sb->s_dirt = 0; 449 sb->s_dirt = 0;
@@ -443,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
443 502
444 if (s->s_root) { 503 if (s->s_root) {
445 if ((flags ^ s->s_flags) & MS_RDONLY) { 504 if ((flags ^ s->s_flags) & MS_RDONLY) {
446 up_write(&s->s_umount); 505 deactivate_locked_super(s);
447 deactivate_super(s);
448 error = -EBUSY; 506 error = -EBUSY;
449 goto error_close_devices; 507 goto error_close_devices;
450 } 508 }
@@ -458,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
458 error = btrfs_fill_super(s, fs_devices, data, 516 error = btrfs_fill_super(s, fs_devices, data,
459 flags & MS_SILENT ? 1 : 0); 517 flags & MS_SILENT ? 1 : 0);
460 if (error) { 518 if (error) {
461 up_write(&s->s_umount); 519 deactivate_locked_super(s);
462 deactivate_super(s);
463 goto error_free_subvol_name; 520 goto error_free_subvol_name;
464 } 521 }
465 522
@@ -476,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
476 mutex_unlock(&s->s_root->d_inode->i_mutex); 533 mutex_unlock(&s->s_root->d_inode->i_mutex);
477 534
478 if (IS_ERR(root)) { 535 if (IS_ERR(root)) {
479 up_write(&s->s_umount); 536 deactivate_locked_super(s);
480 deactivate_super(s);
481 error = PTR_ERR(root); 537 error = PTR_ERR(root);
482 goto error_free_subvol_name; 538 goto error_free_subvol_name;
483 } 539 }
484 if (!root->d_inode) { 540 if (!root->d_inode) {
485 dput(root); 541 dput(root);
486 up_write(&s->s_umount); 542 deactivate_locked_super(s);
487 deactivate_super(s);
488 error = -ENXIO; 543 error = -ENXIO;
489 goto error_free_subvol_name; 544 goto error_free_subvol_name;
490 } 545 }
@@ -589,14 +644,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
589 if (!capable(CAP_SYS_ADMIN)) 644 if (!capable(CAP_SYS_ADMIN))
590 return -EPERM; 645 return -EPERM;
591 646
592 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 647 vol = memdup_user((void __user *)arg, sizeof(*vol));
593 if (!vol) 648 if (IS_ERR(vol))
594 return -ENOMEM; 649 return PTR_ERR(vol);
595
596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
597 ret = -EFAULT;
598 goto out;
599 }
600 650
601 switch (cmd) { 651 switch (cmd) {
602 case BTRFS_IOC_SCAN_DEV: 652 case BTRFS_IOC_SCAN_DEV:
@@ -604,7 +654,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
604 &btrfs_fs_type, &fs_devices); 654 &btrfs_fs_type, &fs_devices);
605 break; 655 break;
606 } 656 }
607out: 657
608 kfree(vol); 658 kfree(vol);
609 return ret; 659 return ret;
610} 660}
@@ -630,7 +680,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 680 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 681 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 682 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 683 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 684 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 685 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 686 .alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..01b143605ec1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 63 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 64 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 65 cur_trans->start_time = get_seconds();
66
67 cur_trans->delayed_refs.root.rb_node = NULL;
68 cur_trans->delayed_refs.num_entries = 0;
69 cur_trans->delayed_refs.num_heads_ready = 0;
70 cur_trans->delayed_refs.num_heads = 0;
71 cur_trans->delayed_refs.flushing = 0;
72 cur_trans->delayed_refs.run_delayed_start = 0;
73 spin_lock_init(&cur_trans->delayed_refs.lock);
74
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 77 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 189 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 190 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 191 h->alloc_exclude_start = 0;
192 h->delayed_ref_updates = 0;
193
185 root->fs_info->running_transaction->use_count++; 194 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 195 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 196 return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 280 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 281 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 282 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 283 throttle_on_drops(root);
276} 284}
277 285
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 288{
281 struct btrfs_transaction *cur_trans; 289 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 290 struct btrfs_fs_info *info = root->fs_info;
291 int count = 0;
292
293 while (count < 4) {
294 unsigned long cur = trans->delayed_ref_updates;
295 trans->delayed_ref_updates = 0;
296 if (cur &&
297 trans->transaction->delayed_refs.num_heads_ready > 64) {
298 trans->delayed_ref_updates = 0;
299
300 /*
301 * do a full flush if the transaction is trying
302 * to close
303 */
304 if (trans->transaction->delayed_refs.flushing)
305 cur = 0;
306 btrfs_run_delayed_refs(trans, root, cur);
307 } else {
308 break;
309 }
310 count++;
311 }
283 312
284 mutex_lock(&info->trans_mutex); 313 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 314 cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 453 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 454 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 455
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 456 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 457
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
430 460
431 while (1) { 461 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 462 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 468 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 469 btrfs_set_root_generation(&root->root_item, trans->transid);
440 470
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 471 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 472 &root->root_key,
445 &root->root_item); 473 &root->root_item);
446 BUG_ON(ret); 474 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 475 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 476
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret);
449 } 479 }
450 return 0; 480 return 0;
451} 481}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 489 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 490 struct list_head *next;
461 struct extent_buffer *eb; 491 struct extent_buffer *eb;
492 int ret;
462 493
463 btrfs_extent_post_op(trans, fs_info->tree_root); 494 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
495 BUG_ON(ret);
464 496
465 eb = btrfs_lock_root_node(fs_info->tree_root); 497 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 498 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 499 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 500 free_extent_buffer(eb);
469 501
470 btrfs_extent_post_op(trans, fs_info->tree_root); 502 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
503 BUG_ON(ret);
471 504
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 505 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 506 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 508 root = list_entry(next, struct btrfs_root, dirty_list);
476 509
477 update_cowonly_root(trans, root); 510 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
478 } 514 }
479 return 0; 515 return 0;
480} 516}
@@ -635,6 +671,37 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 671}
636 672
637/* 673/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current
676 * queue of delayed refs out.
677 *
678 * This is used by the drop snapshot code only
679 */
680static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
681{
682 DEFINE_WAIT(wait);
683
684 mutex_lock(&info->trans_mutex);
685 while (info->running_transaction &&
686 info->running_transaction->delayed_refs.flushing) {
687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex);
690
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
694 schedule();
695
696 atomic_inc(&info->throttles);
697 mutex_lock(&info->trans_mutex);
698 finish_wait(&info->transaction_wait, &wait);
699 }
700 mutex_unlock(&info->trans_mutex);
701 return 0;
702}
703
704/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 705 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 706 * all of them
640 */ 707 */
@@ -661,7 +728,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 728 atomic_inc(&root->fs_info->throttles);
662 729
663 while (1) { 730 while (1) {
731 /*
732 * we don't want to jump in and create a bunch of
733 * delayed refs if the transaction is starting to close
734 */
735 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 736 trans = btrfs_start_transaction(tree_root, 1);
737
738 /*
739 * we've joined a transaction, make sure it isn't
740 * closing right now
741 */
742 if (trans->transaction->delayed_refs.flushing) {
743 btrfs_end_transaction(trans, tree_root);
744 continue;
745 }
746
665 mutex_lock(&root->fs_info->drop_mutex); 747 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 748 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 749 if (ret != -EAGAIN)
@@ -766,7 +848,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 848 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 849
768 old = btrfs_lock_root_node(root); 850 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 851 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 852
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 853 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 854 btrfs_tree_unlock(old);
@@ -894,12 +976,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 976 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 977 DEFINE_WAIT(wait);
896 int ret; 978 int ret;
979 int should_grow = 0;
980 unsigned long now = get_seconds();
981 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
982
983 btrfs_run_ordered_operations(root, 0);
984
 985 /* make a pass through all the delayed refs we have so far;
 986 * any running procs may add more while we are here
987 */
988 ret = btrfs_run_delayed_refs(trans, root, 0);
989 BUG_ON(ret);
990
991 cur_trans = trans->transaction;
992 /*
993 * set the flushing flag so procs in this transaction have to
994 * start sending their work down.
995 */
996 cur_trans->delayed_refs.flushing = 1;
997
998 ret = btrfs_run_delayed_refs(trans, root, 0);
999 BUG_ON(ret);
897 1000
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 1001 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 1002 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 1003 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 1004 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 1005 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1006 btrfs_end_transaction(trans, root);
905 1007
@@ -922,7 +1024,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1024
923 trans->transaction->in_commit = 1; 1025 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1026 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1027 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1028 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1029 struct btrfs_transaction, list);
@@ -937,6 +1038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1038 }
938 } 1039 }
939 1040
1041 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1042 should_grow = 1;
1043
940 do { 1044 do {
941 int snap_pending = 0; 1045 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1046 joined = cur_trans->num_joined;
@@ -949,26 +1053,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1053
950 if (cur_trans->num_writers > 1) 1054 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1055 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1056 else if (should_grow)
953 timeout = 1; 1057 timeout = 1;
954 1058
955 mutex_unlock(&root->fs_info->trans_mutex); 1059 mutex_unlock(&root->fs_info->trans_mutex);
956 1060
957 if (snap_pending) { 1061 if (flush_on_commit || snap_pending) {
1062 if (flush_on_commit)
1063 btrfs_start_delalloc_inodes(root);
958 ret = btrfs_wait_ordered_extents(root, 1); 1064 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret); 1065 BUG_ON(ret);
960 } 1066 }
961 1067
962 schedule_timeout(timeout); 1068 /*
 1069 * rename doesn't use btrfs_join_transaction, so once we
1070 * set the transaction to blocked above, we aren't going
1071 * to get any new ordered operations. We can safely run
 1072 * it here and know for sure that nothing new will be added
1073 * to the list
1074 */
1075 btrfs_run_ordered_operations(root, 1);
1076
1077 smp_mb();
1078 if (cur_trans->num_writers > 1 || should_grow)
1079 schedule_timeout(timeout);
963 1080
964 mutex_lock(&root->fs_info->trans_mutex); 1081 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1082 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1083 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1084 (should_grow && cur_trans->num_joined != joined));
968 1085
969 ret = create_pending_snapshots(trans, root->fs_info); 1086 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1087 BUG_ON(ret);
971 1088
1089 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1090 BUG_ON(ret);
1091
972 WARN_ON(cur_trans != trans->transaction); 1092 WARN_ON(cur_trans != trans->transaction);
973 1093
974 /* btrfs_commit_tree_roots is responsible for getting the 1094 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1152 btrfs_copy_pinned(root, pinned_copy);
1033 1153
1034 trans->transaction->blocked = 0; 1154 trans->transaction->blocked = 0;
1155
1035 wake_up(&root->fs_info->transaction_throttle); 1156 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1157 wake_up(&root->fs_info->transaction_wait);
1037 1158
@@ -1058,6 +1179,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1179 mutex_lock(&root->fs_info->trans_mutex);
1059 1180
1060 cur_trans->commit_done = 1; 1181 cur_trans->commit_done = 1;
1182
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1183 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1184 wake_up(&cur_trans->commit_wait);
1063 1185
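Pulling the hunks above together, the commit now interleaves delayed-ref processing with the ordered-operation flush; the rough order of the added calls:

    /* rough sequence inside btrfs_commit_transaction() after this merge */
    btrfs_run_ordered_operations(root, 0);       /* start data IO early */
    btrfs_run_delayed_refs(trans, root, 0);      /* drain what exists */
    cur_trans->delayed_refs.flushing = 1;        /* make writers help out */
    btrfs_run_delayed_refs(trans, root, 0);
    /* ... wait for other writers, then ... */
    btrfs_run_ordered_operations(root, 1);       /* now wait for the IO */
    btrfs_run_delayed_refs(trans, root, (unsigned long)-1); /* finish all */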
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
 27 * total writers in this transaction; it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..db5e212e8445 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
 62 * 2a is actually the more important variant. Without the extra logging,
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
 73 * The directory foo was fully removed from the FS, but fsync was never
 74 * called on foo, only its parent dir f1. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
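Case 2 above is the one ordinary applications exercise; as a userspace sequence (paths illustrative):

    #include <fcntl.h>
    #include <unistd.h>

    /* a file already in the fsync log gains a new name; the new name
     * must survive a crash once the second fsync returns
     */
    static int new_name_must_survive(void)
    {
            int fd = open("/mnt/btrfs/foo/file", O_RDWR);

            if (fd < 0)
                    return -1;
            fsync(fd);          /* file is now in the fsync log */
            if (link("/mnt/btrfs/foo/file", "/mnt/btrfs/bar/file") != 0) {
                    close(fd);
                    return -1;
            }
            fsync(fd);          /* the new name must be logged too */
            return close(fd);
    }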
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
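btrfs_end_log_trans(), un-staticed just below, is the matching release; the rename path shown earlier in this diff pairs them like this, condensed:

    btrfs_pin_log_trans(root);           /* hold off any log commit */
    /* ... unlink the old name, insert the new one ... */
    btrfs_log_new_name(trans, old_inode, old_dir, new_dentry->d_parent);
    btrfs_end_log_trans(root);           /* let the log commit proceed */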
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -476,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
476 saved_nbytes = inode_get_bytes(inode); 536 saved_nbytes = inode_get_bytes(inode);
477 /* drop any overlapping extents */ 537 /* drop any overlapping extents */
478 ret = btrfs_drop_extents(trans, root, inode, 538 ret = btrfs_drop_extents(trans, root, inode,
479 start, extent_end, start, &alloc_hint); 539 start, extent_end, extent_end, start, &alloc_hint);
480 BUG_ON(ret); 540 BUG_ON(ret);
481 541
482 if (found_type == BTRFS_FILE_EXTENT_REG || 542 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highest possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
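
The key.offset = (u64)-1 restart is the point of this hunk: running one fixup can insert new fixup items at other offsets, so decrementing from the last position could walk past work that appeared behind the cursor. A runnable toy of the restart-from-the-top scan (array indices stand in for key offsets; illustrative only):

        #include <stdio.h>

        #define NKEYS 8
        static int pending[NKEYS];      /* pending[k] != 0: key k needs fixup */

        static int highest_pending(void)
        {
                for (int k = NKEYS - 1; k >= 0; k--)
                        if (pending[k])
                                return k;
                return -1;              /* nothing left */
        }

        int main(void)
        {
                int k;

                pending[5] = pending[6] = 1;
                /* each pass searches down from the top, like key.offset = (u64)-1 */
                while ((k = highest_pending()) >= 0) {
                        pending[k] = 0;          /* run the fixup for key k */
                        if (k == 5)
                                pending[2] = 1;  /* a fixup may queue new work */
                        printf("fixed up key %d\n", k);
                }
                return 0;
        }
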
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
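
The btrfs_inc_nlink() before iput() above is a lifetime trick: with i_nlink at zero, the final iput() would delete the inode before the later link-count fixup pass has a chance to settle the real count. The same keep-alive shape in plain C, with an explicit reference count standing in for the link count (illustrative only, not the kernel's inode lifetime rules):

        #include <stdio.h>
        #include <stdlib.h>

        struct obj {
                int refs;
        };

        static void put_obj(struct obj *o)      /* like iput() */
        {
                if (--o->refs == 0) {
                        printf("freed\n");
                        free(o);
                }
        }

        int main(void)
        {
                struct obj *o = malloc(sizeof(*o));

                o->refs = 1;
                o->refs++;      /* keep-alive, like btrfs_inc_nlink() */
                put_obj(o);     /* replay drops its reference: object survives */
                put_obj(o);     /* fixup pass finishes and really releases it */
                return 0;
        }
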
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
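
Both wait loops now re-test last_trans_log_full_commit inside the prepare_to_wait()/schedule()/finish_wait() sequence, so a transaction that has been flagged for a full commit stops blocking on log progress that may never come. In pthreads the same shape collapses to a condition wait, which drops and retakes the mutex exactly the way the kernel drops log_mutex around schedule() (a sketch, not kernel code):

        #include <pthread.h>

        static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  log_writer_wait = PTHREAD_COND_INITIALIZER;
        static int log_writers;
        static int full_commit_forced;  /* last_trans_log_full_commit == transid */

        /* illustrative equivalent of wait_for_writer() */
        static void wait_for_writer_model(void)
        {
                pthread_mutex_lock(&log_mutex);
                /* re-test after every wakeup; bail if a full commit was forced */
                while (log_writers > 0 && !full_commit_forced)
                        pthread_cond_wait(&log_writer_wait, &log_mutex);
                pthread_mutex_unlock(&log_mutex);
        }
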
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1976 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
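The contract spelled out in the comment above is the caller's problem: 0 means the tree log alone made the fsync durable, anything else (-EAGAIN for the unlink/rename corner cases) means only a full transaction commit is safe. A runnable sketch of a caller honoring that contract, with stubs standing in for btrfs_sync_log() and btrfs_commit_transaction():

        #include <errno.h>
        #include <stdio.h>

        /* stubs; the real calls take a trans handle and a root */
        static int sync_log(void) { return -EAGAIN; } /* pretend a rename forced it */
        static int commit_transaction(void) { return 0; }

        int main(void)
        {
                int ret = sync_log();

                if (ret == 0) {
                        printf("fsync satisfied by the tree log\n");
                } else {
                        printf("log bailed (%d), doing a full commit\n", ret);
                        ret = commit_transaction();
                }
                return ret;
        }
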
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for a regular file, if its inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
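check_parent_dirs_for_sync() walks the d_parent pointers, stamping logged_trans on the way up and bailing out the moment any ancestor saw an unlink after the last committed transaction. A compact model of that walk (field names mirror the kernel's, but the code is illustrative):

        #include <stdint.h>
        #include <stdio.h>

        struct dent {
                struct dent *parent;            /* root points to itself */
                uint64_t     logged_trans;
                uint64_t     last_unlink_trans;
        };

        static int parents_need_full_commit(struct dent *d, uint64_t transid,
                                            uint64_t last_committed)
        {
                for (; d; d = d->parent) {
                        d->logged_trans = transid;      /* new names get logged */
                        if (d->last_unlink_trans > last_committed)
                                return 1;               /* force a full commit */
                        if (d->parent == d)
                                break;
                }
                return 0;
        }

        int main(void)
        {
                struct dent root = { &root, 0, 0 };
                struct dent dir  = { &root, 0, 9 };     /* unlinked in trans 9 */

                printf("full commit needed: %d\n",
                       parents_need_full_commit(&dir, 10, 8));
                return 0;
        }
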
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
 2921 * for a regular file, if its inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
2771/* 2968/*
@@ -2884,3 +3081,94 @@ again:
2884 kfree(log_root_tree); 3081 kfree(log_root_tree);
2885 return 0; 3082 return 0;
2886} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
 3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to put this transid
3104 * into the file. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
 3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
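The recording rules above reduce to two early-outs plus one stamp, and only the cross-directory rename case records anything on the directory. A runnable distillation (simplified: the unconditional last_unlink_trans stamp for regular files is in the kernel code above and omitted here):

        #include <stdint.h>
        #include <stdio.h>

        struct inode_model {
                uint64_t logged_trans;
                uint64_t last_unlink_trans;
        };

        static void record_unlink(struct inode_model *dir,
                                  struct inode_model *victim,
                                  uint64_t transid, int for_rename)
        {
                if (dir->logged_trans == transid)
                        return;         /* dir logged: new names are captured */
                if (victim->logged_trans == transid)
                        return;         /* victim logged: replay handles it */
                if (for_rename)         /* only renames need the big hammer */
                        dir->last_unlink_trans = transid;
        }

        int main(void)
        {
                struct inode_model dir = { 0, 0 }, victim = { 0, 0 };

                record_unlink(&dir, &victim, 7, 1);
                printf("dir.last_unlink_trans = %llu\n",
                       (unsigned long long)dir.last_unlink_trans);
                return 0;
        }
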
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root * root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
 3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..a6d35b0054ca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -124,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
124 return NULL; 125 return NULL;
125} 126}
126 127
128static void requeue_list(struct btrfs_pending_bios *pending_bios,
129 struct bio *head, struct bio *tail)
130{
131
132 struct bio *old_head;
133
134 old_head = pending_bios->head;
135 pending_bios->head = head;
136 if (pending_bios->tail)
137 tail->bi_next = old_head;
138 else
139 pending_bios->tail = tail;
140}
141
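requeue_list() splices a partially processed bio chain back onto the front of a head/tail singly linked list, so the bios queued behind it keep their order. The same operation on a generic list, runnable standalone (the empty-list case is written a little more explicitly than the kernel version):

        #include <stdio.h>
        #include <stddef.h>

        struct node { struct node *next; int id; };
        struct list { struct node *head, *tail; };

        /* put the chain head..tail back on the front of the list */
        static void requeue(struct list *l, struct node *head, struct node *tail)
        {
                tail->next = l->head;   /* old contents go behind the chain */
                l->head = head;
                if (!l->tail)           /* list was empty: chain supplies the tail */
                        l->tail = tail;
        }

        int main(void)
        {
                struct node a = { NULL, 1 }, b = { NULL, 2 };
                struct list l = { NULL, NULL };

                requeue(&l, &a, &a);
                requeue(&l, &b, &b);    /* b runs before a again */
                for (struct node *n = l.head; n; n = n->next)
                        printf("bio %d\n", n->id);
                return 0;
        }
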
127/* 142/*
128 * we try to collect pending bios for a device so we don't get a large 143 * we try to collect pending bios for a device so we don't get a large
129 * number of procs sending bios down to the same device. This greatly 144 * number of procs sending bios down to the same device. This greatly
@@ -140,31 +155,44 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
140 struct bio *pending; 155 struct bio *pending;
141 struct backing_dev_info *bdi; 156 struct backing_dev_info *bdi;
142 struct btrfs_fs_info *fs_info; 157 struct btrfs_fs_info *fs_info;
158 struct btrfs_pending_bios *pending_bios;
143 struct bio *tail; 159 struct bio *tail;
144 struct bio *cur; 160 struct bio *cur;
145 int again = 0; 161 int again = 0;
146 unsigned long num_run = 0; 162 unsigned long num_run;
163 unsigned long num_sync_run;
147 unsigned long limit; 164 unsigned long limit;
165 unsigned long last_waited = 0;
148 166
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 167 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 168 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 169 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 170 limit = limit * 2 / 3;
153 171
172 /* we want to make sure that every time we switch from the sync
173 * list to the normal list, we unplug
174 */
175 num_sync_run = 0;
176
154loop: 177loop:
155 spin_lock(&device->io_lock); 178 spin_lock(&device->io_lock);
179 num_run = 0;
156 180
157loop_lock: 181loop_lock:
182
158 /* take all the bios off the list at once and process them 183 /* take all the bios off the list at once and process them
159 * later on (without the lock held). But, remember the 184 * later on (without the lock held). But, remember the
160 * tail and other pointers so the bios can be properly reinserted 185 * tail and other pointers so the bios can be properly reinserted
161 * into the list if we hit congestion 186 * into the list if we hit congestion
162 */ 187 */
163 pending = device->pending_bios; 188 if (device->pending_sync_bios.head)
164 tail = device->pending_bio_tail; 189 pending_bios = &device->pending_sync_bios;
190 else
191 pending_bios = &device->pending_bios;
192
193 pending = pending_bios->head;
194 tail = pending_bios->tail;
165 WARN_ON(pending && !tail); 195 WARN_ON(pending && !tail);
166 device->pending_bios = NULL;
167 device->pending_bio_tail = NULL;
168 196
169 /* 197 /*
170 * if pending was null this time around, no bios need processing 198 * if pending was null this time around, no bios need processing
@@ -174,16 +202,41 @@ loop_lock:
174 * device->running_pending is used to synchronize with the 202 * device->running_pending is used to synchronize with the
175 * schedule_bio code. 203 * schedule_bio code.
176 */ 204 */
177 if (pending) { 205 if (device->pending_sync_bios.head == NULL &&
178 again = 1; 206 device->pending_bios.head == NULL) {
179 device->running_pending = 1;
180 } else {
181 again = 0; 207 again = 0;
182 device->running_pending = 0; 208 device->running_pending = 0;
209 } else {
210 again = 1;
211 device->running_pending = 1;
183 } 212 }
213
214 pending_bios->head = NULL;
215 pending_bios->tail = NULL;
216
184 spin_unlock(&device->io_lock); 217 spin_unlock(&device->io_lock);
185 218
219 /*
220 * if we're doing the regular priority list, make sure we unplug
221 * for any high prio bios we've sent down
222 */
223 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
224 num_sync_run = 0;
225 blk_run_backing_dev(bdi, NULL);
226 }
227
186 while (pending) { 228 while (pending) {
229
230 rmb();
231 if (pending_bios != &device->pending_sync_bios &&
232 device->pending_sync_bios.head &&
233 num_run > 16) {
234 cond_resched();
235 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail);
237 goto loop_lock;
238 }
239
187 cur = pending; 240 cur = pending;
188 pending = pending->bi_next; 241 pending = pending->bi_next;
189 cur->bi_next = NULL; 242 cur->bi_next = NULL;
@@ -194,10 +247,18 @@ loop_lock:
194 wake_up(&fs_info->async_submit_wait); 247 wake_up(&fs_info->async_submit_wait);
195 248
196 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 249 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
197 bio_get(cur);
198 submit_bio(cur->bi_rw, cur); 250 submit_bio(cur->bi_rw, cur);
199 bio_put(cur);
200 num_run++; 251 num_run++;
252 if (bio_sync(cur))
253 num_sync_run++;
254
255 if (need_resched()) {
256 if (num_sync_run) {
257 blk_run_backing_dev(bdi, NULL);
258 num_sync_run = 0;
259 }
260 cond_resched();
261 }
201 262
202 /* 263 /*
203 * we made progress, there is more work to do and the bdi 264 * we made progress, there is more work to do and the bdi
@@ -206,17 +267,41 @@ loop_lock:
206 */ 267 */
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 268 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 269 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 270 struct io_context *ioc;
210 271
211 spin_lock(&device->io_lock); 272 ioc = current->io_context;
212
213 old_head = device->pending_bios;
214 device->pending_bios = pending;
215 if (device->pending_bio_tail)
216 tail->bi_next = old_head;
217 else
218 device->pending_bio_tail = tail;
219 273
274 /*
275 * the main goal here is that we don't want to
276 * block if we're going to be able to submit
277 * more requests without blocking.
278 *
279 * This code does two great things, it pokes into
280 * the elevator code from a filesystem _and_
281 * it makes assumptions about how batching works.
282 */
283 if (ioc && ioc->nr_batch_requests > 0 &&
284 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
285 (last_waited == 0 ||
286 ioc->last_waited == last_waited)) {
287 /*
288 * we want to go through our batch of
289 * requests and stop. So, we copy out
290 * the ioc->last_waited time and test
291 * against it before looping
292 */
293 last_waited = ioc->last_waited;
294 if (need_resched()) {
295 if (num_sync_run) {
296 blk_run_backing_dev(bdi, NULL);
297 num_sync_run = 0;
298 }
299 cond_resched();
300 }
301 continue;
302 }
303 spin_lock(&device->io_lock);
304 requeue_list(pending_bios, pending, tail);
220 device->running_pending = 1; 305 device->running_pending = 1;
221 306
222 spin_unlock(&device->io_lock); 307 spin_unlock(&device->io_lock);
@@ -224,13 +309,32 @@ loop_lock:
224 goto done; 309 goto done;
225 } 310 }
226 } 311 }
312
313 if (num_sync_run) {
314 num_sync_run = 0;
315 blk_run_backing_dev(bdi, NULL);
316 }
317
318 cond_resched();
227 if (again) 319 if (again)
228 goto loop; 320 goto loop;
229 321
230 spin_lock(&device->io_lock); 322 spin_lock(&device->io_lock);
231 if (device->pending_bios) 323 if (device->pending_bios.head || device->pending_sync_bios.head)
232 goto loop_lock; 324 goto loop_lock;
233 spin_unlock(&device->io_lock); 325 spin_unlock(&device->io_lock);
326
327 /*
328 * IO has already been through a long path to get here. Checksumming,
329 * async helper threads, perhaps compression. We've done a pretty
330 * good job of collecting a batch of IO and should just unplug
331 * the device right away.
332 *
 333 * This will help anyone who is waiting on the IO; they might have
334 * already unplugged, but managed to do so before the bio they
335 * cared about found its way down here.
336 */
337 blk_run_backing_dev(bdi, NULL);
234done: 338done:
235 return 0; 339 return 0;
236} 340}
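
The scheduling policy in run_scheduled_bios() boils down to: drain pending_sync_bios first, and while working the regular queue, requeue it and switch back whenever sync bios show up. A toy of just that policy, ignoring the num_run > 16 hysteresis, the unplugging, and the ioc batching heuristics (illustrative; in the kernel another thread keeps refilling the sync queue):

        #include <stdio.h>

        struct queue { int items; };

        static void drain(struct queue *sync_q, struct queue *regular_q)
        {
                while (sync_q->items || regular_q->items) {
                        struct queue *q = sync_q->items ? sync_q : regular_q;

                        while (q->items) {
                                q->items--;     /* submit one bio */
                                printf("%s bio\n", q == sync_q ? "sync" : "regular");
                                if (q == regular_q && sync_q->items)
                                        break;  /* requeue, go service sync */
                        }
                }
        }

        int main(void)
        {
                struct queue s = { 1 }, r = { 3 };

                drain(&s, &r);
                return 0;
        }
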
@@ -1336,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1336 device->io_align = root->sectorsize; 1440 device->io_align = root->sectorsize;
1337 device->sector_size = root->sectorsize; 1441 device->sector_size = root->sectorsize;
1338 device->total_bytes = i_size_read(bdev->bd_inode); 1442 device->total_bytes = i_size_read(bdev->bd_inode);
1443 device->disk_total_bytes = device->total_bytes;
1339 device->dev_root = root->fs_info->dev_root; 1444 device->dev_root = root->fs_info->dev_root;
1340 device->bdev = bdev; 1445 device->bdev = bdev;
1341 device->in_fs_metadata = 1; 1446 device->in_fs_metadata = 1;
@@ -1439,7 +1544,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1439 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1544 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1440 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1545 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1441 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1546 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1442 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1547 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1443 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1548 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1444 btrfs_mark_buffer_dirty(leaf); 1549 btrfs_mark_buffer_dirty(leaf);
1445 1550
@@ -1836,14 +1941,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1836 device->total_bytes = new_size; 1941 device->total_bytes = new_size;
1837 if (device->writeable) 1942 if (device->writeable)
1838 device->fs_devices->total_rw_bytes -= diff; 1943 device->fs_devices->total_rw_bytes -= diff;
1839 ret = btrfs_update_device(trans, device);
1840 if (ret) {
1841 unlock_chunks(root);
1842 btrfs_end_transaction(trans, root);
1843 goto done;
1844 }
1845 WARN_ON(diff > old_total);
1846 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1847 unlock_chunks(root); 1944 unlock_chunks(root);
1848 btrfs_end_transaction(trans, root); 1945 btrfs_end_transaction(trans, root);
1849 1946
@@ -1875,7 +1972,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1875 length = btrfs_dev_extent_length(l, dev_extent); 1972 length = btrfs_dev_extent_length(l, dev_extent);
1876 1973
1877 if (key.offset + length <= new_size) 1974 if (key.offset + length <= new_size)
1878 goto done; 1975 break;
1879 1976
1880 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 1977 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1881 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 1978 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1888,6 +1985,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1888 goto done; 1985 goto done;
1889 } 1986 }
1890 1987
1988 /* Shrinking succeeded, else we would be at "done". */
1989 trans = btrfs_start_transaction(root, 1);
1990 if (!trans) {
1991 ret = -ENOMEM;
1992 goto done;
1993 }
1994 lock_chunks(root);
1995
1996 device->disk_total_bytes = new_size;
1997 /* Now btrfs_update_device() will change the on-disk size. */
1998 ret = btrfs_update_device(trans, device);
1999 if (ret) {
2000 unlock_chunks(root);
2001 btrfs_end_transaction(trans, root);
2002 goto done;
2003 }
2004 WARN_ON(diff > old_total);
2005 btrfs_set_super_total_bytes(super_copy, old_total - diff);
2006 unlock_chunks(root);
2007 btrfs_end_transaction(trans, root);
1891done: 2008done:
1892 btrfs_free_path(path); 2009 btrfs_free_path(path);
1893 return ret; 2010 return ret;
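
The reshuffle above makes the on-disk size change contingent on the relocation succeeding: the in-memory total_bytes shrinks first, capping the allocator, and disk_total_bytes is only written once every chunk beyond new_size has been moved, so a crash mid-shrink never records a size the data has not yet been squeezed into. The ordering, reduced to a runnable sketch:

        #include <stdint.h>
        #include <stdio.h>

        static uint64_t mem_total;   /* device->total_bytes: allocator limit */
        static uint64_t disk_total;  /* device->disk_total_bytes: on-disk item */

        static int relocate_chunks_above(uint64_t new_size)
        {
                (void)new_size;      /* pretend the relocation worked */
                return 0;
        }

        static int shrink(uint64_t new_size)
        {
                mem_total = new_size;            /* stop allocations up high */
                if (relocate_chunks_above(new_size))
                        return -1;               /* disk size untouched */
                disk_total = new_size;           /* publish the new size last */
                return 0;
        }

        int main(void)
        {
                int ret;

                mem_total = disk_total = 100;
                ret = shrink(40);
                printf("shrink: %d, disk_total now %llu\n", ret,
                       (unsigned long long)disk_total);
                return 0;
        }
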
@@ -2458,7 +2575,7 @@ again:
2458 max_errors = 1; 2575 max_errors = 1;
2459 } 2576 }
2460 } 2577 }
2461 if (multi_ret && rw == WRITE && 2578 if (multi_ret && (rw & (1 << BIO_RW)) &&
2462 stripes_allocated < stripes_required) { 2579 stripes_allocated < stripes_required) {
2463 stripes_allocated = map->num_stripes; 2580 stripes_allocated = map->num_stripes;
2464 free_extent_map(em); 2581 free_extent_map(em);
@@ -2723,6 +2840,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2723 int rw, struct bio *bio) 2840 int rw, struct bio *bio)
2724{ 2841{
2725 int should_queue = 1; 2842 int should_queue = 1;
2843 struct btrfs_pending_bios *pending_bios;
2726 2844
2727 /* don't bother with additional async steps for reads, right now */ 2845 /* don't bother with additional async steps for reads, right now */
2728 if (!(rw & (1 << BIO_RW))) { 2846 if (!(rw & (1 << BIO_RW))) {
@@ -2744,13 +2862,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
2744 bio->bi_rw |= rw; 2862 bio->bi_rw |= rw;
2745 2863
2746 spin_lock(&device->io_lock); 2864 spin_lock(&device->io_lock);
2865 if (bio_sync(bio))
2866 pending_bios = &device->pending_sync_bios;
2867 else
2868 pending_bios = &device->pending_bios;
2747 2869
2748 if (device->pending_bio_tail) 2870 if (pending_bios->tail)
2749 device->pending_bio_tail->bi_next = bio; 2871 pending_bios->tail->bi_next = bio;
2750 2872
2751 device->pending_bio_tail = bio; 2873 pending_bios->tail = bio;
2752 if (!device->pending_bios) 2874 if (!pending_bios->head)
2753 device->pending_bios = bio; 2875 pending_bios->head = bio;
2754 if (device->running_pending) 2876 if (device->running_pending)
2755 should_queue = 0; 2877 should_queue = 0;
2756 2878
@@ -2967,7 +3089,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
2967 unsigned long ptr; 3089 unsigned long ptr;
2968 3090
2969 device->devid = btrfs_device_id(leaf, dev_item); 3091 device->devid = btrfs_device_id(leaf, dev_item);
2970 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3092 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3093 device->total_bytes = device->disk_total_bytes;
2971 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3094 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2972 device->type = btrfs_device_type(leaf, dev_item); 3095 device->type = btrfs_device_type(leaf, dev_item);
2973 device->io_align = btrfs_device_io_align(leaf, dev_item); 3096 device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..5c3ff6d02fd7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
23#include "async-thread.h" 23#include "async-thread.h"
24 24
25struct buffer_head; 25struct buffer_head;
26struct btrfs_pending_bios {
27 struct bio *head;
28 struct bio *tail;
29};
30
26struct btrfs_device { 31struct btrfs_device {
27 struct list_head dev_list; 32 struct list_head dev_list;
28 struct list_head dev_alloc_list; 33 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices; 34 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root; 35 struct btrfs_root *dev_root;
31 struct bio *pending_bios; 36
32 struct bio *pending_bio_tail; 37 /* regular prio bios */
38 struct btrfs_pending_bios pending_bios;
39 /* WRITE_SYNC bios */
40 struct btrfs_pending_bios pending_sync_bios;
41
33 int running_pending; 42 int running_pending;
34 u64 generation; 43 u64 generation;
35 44
@@ -52,6 +61,9 @@ struct btrfs_device {
52 /* size of the device */ 61 /* size of the device */
53 u64 total_bytes; 62 u64 total_bytes;
54 63
64 /* size of the disk */
65 u64 disk_total_bytes;
66
55 /* bytes used */ 67 /* bytes used */
56 u64 bytes_used; 68 u64 bytes_used;
57 69
@@ -76,7 +88,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 88struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 89 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 90
79 /* the device with this id has the most recent coyp of the super */ 91 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 92 u64 latest_devid;
81 u64 latest_trans; 93 u64 latest_trans;
82 u64 num_devices; 94 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97cb..49106127a4aa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
199 head = page_buffers(page); 199 head = page_buffers(page);
200 bh = head; 200 bh = head;
201 do { 201 do {
202 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
203 ret = bh; 205 ret = bh;
204 get_bh(bh); 206 get_bh(bh);
205 goto out_unlock; 207 goto out_unlock;
206 } 208 }
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 bh = bh->b_this_page; 209 bh = bh->b_this_page;
210 } while (bh != head); 210 } while (bh != head);
211 211
@@ -290,7 +290,7 @@ static void free_more_memory(void)
290 &zone); 290 &zone);
291 if (zone) 291 if (zone)
292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
293 GFP_NOFS); 293 GFP_NOFS, NULL);
294 } 294 }
295} 295}
296 296
@@ -360,7 +360,7 @@ still_busy:
360 * Completion handler for block_write_full_page() - pages which are unlocked 360 * Completion handler for block_write_full_page() - pages which are unlocked
361 * during I/O, and which have PageWriteback cleared upon I/O completion. 361 * during I/O, and which have PageWriteback cleared upon I/O completion.
362 */ 362 */
363static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 363void end_buffer_async_write(struct buffer_head *bh, int uptodate)
364{ 364{
365 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
366 unsigned long flags; 366 unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 438 set_buffer_async_read(bh);
439} 439}
440 440
441void mark_buffer_async_write(struct buffer_head *bh) 441void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler)
442{ 443{
443 bh->b_end_io = end_buffer_async_write; 444 bh->b_end_io = handler;
444 set_buffer_async_write(bh); 445 set_buffer_async_write(bh);
445} 446}
447
448void mark_buffer_async_write(struct buffer_head *bh)
449{
450 mark_buffer_async_write_endio(bh, end_buffer_async_write);
451}
446EXPORT_SYMBOL(mark_buffer_async_write); 452EXPORT_SYMBOL(mark_buffer_async_write);
447 453
448 454
@@ -547,6 +553,46 @@ repeat:
547 return err; 553 return err;
548} 554}
549 555
556void do_thaw_all(struct work_struct *work)
557{
558 struct super_block *sb;
559 char b[BDEVNAME_SIZE];
560
561 spin_lock(&sb_lock);
562restart:
563 list_for_each_entry(sb, &super_blocks, s_list) {
564 sb->s_count++;
565 spin_unlock(&sb_lock);
566 down_read(&sb->s_umount);
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570 up_read(&sb->s_umount);
571 spin_lock(&sb_lock);
572 if (__put_super_and_need_restart(sb))
573 goto restart;
574 }
575 spin_unlock(&sb_lock);
576 kfree(work);
577 printk(KERN_WARNING "Emergency Thaw complete\n");
578}
579
580/**
581 * emergency_thaw_all -- forcibly thaw every frozen filesystem
582 *
583 * Used for emergency unfreeze of all filesystems via SysRq
584 */
585void emergency_thaw_all(void)
586{
587 struct work_struct *work;
588
589 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590 if (work) {
591 INIT_WORK(work, do_thaw_all);
592 schedule_work(work);
593 }
594}
595
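emergency_thaw_all() runs from SysRq context, so all it may do is a GFP_ATOMIC allocation and a schedule_work(); do_thaw_all() then takes the blocking locks and kfree()s the work item when it is done. The same fire-and-forget ownership pattern in userspace, with a thread standing in for the workqueue (illustrative only):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct work {
                void (*fn)(struct work *);
        };

        static void do_thaw_model(struct work *w)
        {
                printf("thawing everything...\n");  /* the slow, blocking part */
                free(w);                            /* the handler owns the item */
        }

        static void *worker(void *arg)              /* stands in for the workqueue */
        {
                struct work *w = arg;

                w->fn(w);
                return NULL;
        }

        int main(void)
        {
                struct work *w = malloc(sizeof(*w)); /* like kmalloc(GFP_ATOMIC) */
                pthread_t t;

                if (!w)
                        return 0;                    /* best effort, as in the kernel */
                w->fn = do_thaw_model;
                pthread_create(&t, NULL, worker, w); /* like schedule_work() */
                pthread_join(t, NULL);
                return 0;
        }
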
550/** 596/**
551 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 597 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
552 * @mapping: the mapping which wants those buffers written 598 * @mapping: the mapping which wants those buffers written
@@ -621,14 +667,7 @@ static void __set_page_dirty(struct page *page,
621 spin_lock_irq(&mapping->tree_lock); 667 spin_lock_irq(&mapping->tree_lock);
622 if (page->mapping) { /* Race with truncate? */ 668 if (page->mapping) { /* Race with truncate? */
623 WARN_ON_ONCE(warn && !PageUptodate(page)); 669 WARN_ON_ONCE(warn && !PageUptodate(page));
624 670 account_page_dirtied(page, mapping);
625 if (mapping_cap_account_dirty(mapping)) {
626 __inc_zone_page_state(page, NR_FILE_DIRTY);
627 __inc_bdi_stat(mapping->backing_dev_info,
628 BDI_RECLAIMABLE);
629 task_dirty_inc(current);
630 task_io_account_write(PAGE_CACHE_SIZE);
631 }
632 radix_tree_tag_set(&mapping->page_tree, 671 radix_tree_tag_set(&mapping->page_tree,
633 page_index(page), PAGECACHE_TAG_DIRTY); 672 page_index(page), PAGECACHE_TAG_DIRTY);
634 } 673 }
@@ -711,7 +750,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
711{ 750{
712 struct buffer_head *bh; 751 struct buffer_head *bh;
713 struct list_head tmp; 752 struct list_head tmp;
714 struct address_space *mapping; 753 struct address_space *mapping, *prev_mapping = NULL;
715 int err = 0, err2; 754 int err = 0, err2;
716 755
717 INIT_LIST_HEAD(&tmp); 756 INIT_LIST_HEAD(&tmp);
@@ -736,7 +775,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
736 * contents - it is a noop if I/O is still in 775 * contents - it is a noop if I/O is still in
737 * flight on potentially older contents. 776 * flight on potentially older contents.
738 */ 777 */
739 ll_rw_block(SWRITE_SYNC, 1, &bh); 778 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
779
780 /*
781 * Kick off IO for the previous mapping. Note
782 * that we will not run the very last mapping,
783 * wait_on_buffer() will do that for us
784 * through sync_buffer().
785 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
740 brelse(bh); 790 brelse(bh);
741 spin_lock(lock); 791 spin_lock(lock);
742 } 792 }
@@ -1559,9 +1609,20 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1559 * locked buffer. This only can happen if someone has written the buffer 1609 * locked buffer. This only can happen if someone has written the buffer
1560 * directly, with submit_bh(). At the address_space level PageWriteback 1610 * directly, with submit_bh(). At the address_space level PageWriteback
1561 * prevents this contention from occurring. 1611 * prevents this contention from occurring.
1612 *
1613 * If block_write_full_page() is called with wbc->sync_mode ==
1614 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1615 * causes the writes to be flagged as synchronous writes, but the
1616 * block device queue will NOT be unplugged, since usually many pages
 1617 * will be pushed out before the higher-level caller actually
1618 * waits for the writes to be completed. The various wait functions,
1619 * such as wait_on_writeback_range() will ultimately call sync_page()
1620 * which will ultimately call blk_run_backing_dev(), which will end up
1621 * unplugging the device queue.
1562 */ 1622 */
1563static int __block_write_full_page(struct inode *inode, struct page *page, 1623static int __block_write_full_page(struct inode *inode, struct page *page,
1564 get_block_t *get_block, struct writeback_control *wbc) 1624 get_block_t *get_block, struct writeback_control *wbc,
1625 bh_end_io_t *handler)
1565{ 1626{
1566 int err; 1627 int err;
1567 sector_t block; 1628 sector_t block;
@@ -1569,6 +1630,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1569 struct buffer_head *bh, *head; 1630 struct buffer_head *bh, *head;
1570 const unsigned blocksize = 1 << inode->i_blkbits; 1631 const unsigned blocksize = 1 << inode->i_blkbits;
1571 int nr_underway = 0; 1632 int nr_underway = 0;
1633 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1634 WRITE_SYNC_PLUG : WRITE);
1572 1635
1573 BUG_ON(!PageLocked(page)); 1636 BUG_ON(!PageLocked(page));
1574 1637
@@ -1644,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1644 continue; 1707 continue;
1645 } 1708 }
1646 if (test_clear_buffer_dirty(bh)) { 1709 if (test_clear_buffer_dirty(bh)) {
1647 mark_buffer_async_write(bh); 1710 mark_buffer_async_write_endio(bh, handler);
1648 } else { 1711 } else {
1649 unlock_buffer(bh); 1712 unlock_buffer(bh);
1650 } 1713 }
@@ -1660,7 +1723,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1660 do { 1723 do {
1661 struct buffer_head *next = bh->b_this_page; 1724 struct buffer_head *next = bh->b_this_page;
1662 if (buffer_async_write(bh)) { 1725 if (buffer_async_write(bh)) {
1663 submit_bh(WRITE, bh); 1726 submit_bh(write_op, bh);
1664 nr_underway++; 1727 nr_underway++;
1665 } 1728 }
1666 bh = next; 1729 bh = next;
@@ -1697,7 +1760,7 @@ recover:
1697 if (buffer_mapped(bh) && buffer_dirty(bh) && 1760 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1698 !buffer_delay(bh)) { 1761 !buffer_delay(bh)) {
1699 lock_buffer(bh); 1762 lock_buffer(bh);
1700 mark_buffer_async_write(bh); 1763 mark_buffer_async_write_endio(bh, handler);
1701 } else { 1764 } else {
1702 /* 1765 /*
1703 * The buffer may have been set dirty during 1766 * The buffer may have been set dirty during
@@ -1714,7 +1777,7 @@ recover:
1714 struct buffer_head *next = bh->b_this_page; 1777 struct buffer_head *next = bh->b_this_page;
1715 if (buffer_async_write(bh)) { 1778 if (buffer_async_write(bh)) {
1716 clear_buffer_dirty(bh); 1779 clear_buffer_dirty(bh);
1717 submit_bh(WRITE, bh); 1780 submit_bh(write_op, bh);
1718 nr_underway++; 1781 nr_underway++;
1719 } 1782 }
1720 bh = next; 1783 bh = next;
@@ -2320,20 +2383,22 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2320 * unlock the page. 2383 * unlock the page.
2321 */ 2384 */
2322int 2385int
2323block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2386block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2324 get_block_t get_block) 2387 get_block_t get_block)
2325{ 2388{
2389 struct page *page = vmf->page;
2326 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2390 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2327 unsigned long end; 2391 unsigned long end;
2328 loff_t size; 2392 loff_t size;
2329 int ret = -EINVAL; 2393 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2330 2394
2331 lock_page(page); 2395 lock_page(page);
2332 size = i_size_read(inode); 2396 size = i_size_read(inode);
2333 if ((page->mapping != inode->i_mapping) || 2397 if ((page->mapping != inode->i_mapping) ||
2334 (page_offset(page) > size)) { 2398 (page_offset(page) > size)) {
2335 /* page got truncated out from underneath us */ 2399 /* page got truncated out from underneath us */
2336 goto out_unlock; 2400 unlock_page(page);
2401 goto out;
2337 } 2402 }
2338 2403
2339 /* page is wholly or partially inside EOF */ 2404 /* page is wholly or partially inside EOF */
@@ -2346,8 +2411,16 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2346 if (!ret) 2411 if (!ret)
2347 ret = block_commit_write(page, 0, end); 2412 ret = block_commit_write(page, 0, end);
2348 2413
2349out_unlock: 2414 if (unlikely(ret)) {
2350 unlock_page(page); 2415 unlock_page(page);
2416 if (ret == -ENOMEM)
2417 ret = VM_FAULT_OOM;
2418 else /* -ENOSPC, -EIO, etc */
2419 ret = VM_FAULT_SIGBUS;
2420 } else
2421 ret = VM_FAULT_LOCKED;
2422
2423out:
2351 return ret; 2424 return ret;
2352} 2425}
2353 2426
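The block_page_mkwrite() change stops leaking raw errnos to the fault path: the function now speaks VM_FAULT_* exclusively, returning VM_FAULT_LOCKED with the page still held locked on success. The mapping as a small runnable function (the FAULT_* values below are stand-ins, not the kernel constants):

        #include <errno.h>

        enum { FAULT_LOCKED = 1, FAULT_NOPAGE = 2, FAULT_OOM = 4, FAULT_SIGBUS = 8 };

        static int mkwrite_result(int err)
        {
                if (err == 0)
                        return FAULT_LOCKED;    /* page stays locked for caller */
                if (err == -ENOMEM)
                        return FAULT_OOM;
                return FAULT_SIGBUS;            /* -ENOSPC, -EIO, ... */
        }

        int main(void)
        {
                return mkwrite_result(-ENOSPC) == FAULT_SIGBUS ? 0 : 1;
        }
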
@@ -2615,7 +2688,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
2615out: 2688out:
2616 ret = mpage_writepage(page, get_block, wbc); 2689 ret = mpage_writepage(page, get_block, wbc);
2617 if (ret == -EAGAIN) 2690 if (ret == -EAGAIN)
2618 ret = __block_write_full_page(inode, page, get_block, wbc); 2691 ret = __block_write_full_page(inode, page, get_block, wbc,
2692 end_buffer_async_write);
2619 return ret; 2693 return ret;
2620} 2694}
2621EXPORT_SYMBOL(nobh_writepage); 2695EXPORT_SYMBOL(nobh_writepage);
@@ -2662,6 +2736,8 @@ has_buffers:
2662 pos += blocksize; 2736 pos += blocksize;
2663 } 2737 }
2664 2738
2739 map_bh.b_size = blocksize;
2740 map_bh.b_state = 0;
2665 err = get_block(inode, iblock, &map_bh, 0); 2741 err = get_block(inode, iblock, &map_bh, 0);
2666 if (err) 2742 if (err)
2667 goto unlock; 2743 goto unlock;
@@ -2773,9 +2849,10 @@ out:
2773 2849
2774/* 2850/*
2775 * The generic ->writepage function for buffer-backed address_spaces 2851 * The generic ->writepage function for buffer-backed address_spaces
2852 * this form passes in the end_io handler used to finish the IO.
2776 */ 2853 */
2777int block_write_full_page(struct page *page, get_block_t *get_block, 2854int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2778 struct writeback_control *wbc) 2855 struct writeback_control *wbc, bh_end_io_t *handler)
2779{ 2856{
2780 struct inode * const inode = page->mapping->host; 2857 struct inode * const inode = page->mapping->host;
2781 loff_t i_size = i_size_read(inode); 2858 loff_t i_size = i_size_read(inode);
@@ -2784,7 +2861,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2784 2861
2785 /* Is the page fully inside i_size? */ 2862 /* Is the page fully inside i_size? */
2786 if (page->index < end_index) 2863 if (page->index < end_index)
2787 return __block_write_full_page(inode, page, get_block, wbc); 2864 return __block_write_full_page(inode, page, get_block, wbc,
2865 handler);
2788 2866
2789 /* Is the page fully outside i_size? (truncate in progress) */ 2867 /* Is the page fully outside i_size? (truncate in progress) */
2790 offset = i_size & (PAGE_CACHE_SIZE-1); 2868 offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2807,9 +2885,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2807 * writes to that region are not written out to the file." 2885 * writes to that region are not written out to the file."
2808 */ 2886 */
2809 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2887 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2810 return __block_write_full_page(inode, page, get_block, wbc); 2888 return __block_write_full_page(inode, page, get_block, wbc, handler);
2889}
2890
2891/*
2892 * The generic ->writepage function for buffer-backed address_spaces
2893 */
2894int block_write_full_page(struct page *page, get_block_t *get_block,
2895 struct writeback_control *wbc)
2896{
2897 return block_write_full_page_endio(page, get_block, wbc,
2898 end_buffer_async_write);
2811} 2899}
2812 2900
2901
2813sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2902sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2814 get_block_t *get_block) 2903 get_block_t *get_block)
2815{ 2904{
@@ -2922,12 +3011,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2922 for (i = 0; i < nr; i++) { 3011 for (i = 0; i < nr; i++) {
2923 struct buffer_head *bh = bhs[i]; 3012 struct buffer_head *bh = bhs[i];
2924 3013
2925 if (rw == SWRITE || rw == SWRITE_SYNC) 3014 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2926 lock_buffer(bh); 3015 lock_buffer(bh);
2927 else if (!trylock_buffer(bh)) 3016 else if (!trylock_buffer(bh))
2928 continue; 3017 continue;
2929 3018
2930 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 3019 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3020 rw == SWRITE_SYNC_PLUG) {
2931 if (test_clear_buffer_dirty(bh)) { 3021 if (test_clear_buffer_dirty(bh)) {
2932 bh->b_end_io = end_buffer_write_sync; 3022 bh->b_end_io = end_buffer_write_sync;
2933 get_bh(bh); 3023 get_bh(bh);
@@ -2963,7 +3053,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2963 if (test_clear_buffer_dirty(bh)) { 3053 if (test_clear_buffer_dirty(bh)) {
2964 get_bh(bh); 3054 get_bh(bh);
2965 bh->b_end_io = end_buffer_write_sync; 3055 bh->b_end_io = end_buffer_write_sync;
2966 ret = submit_bh(WRITE, bh); 3056 ret = submit_bh(WRITE_SYNC, bh);
2967 wait_on_buffer(bh); 3057 wait_on_buffer(bh);
2968 if (buffer_eopnotsupp(bh)) { 3058 if (buffer_eopnotsupp(bh)) {
2969 clear_buffer_eopnotsupp(bh); 3059 clear_buffer_eopnotsupp(bh);
@@ -3277,11 +3367,12 @@ EXPORT_SYMBOL(block_read_full_page);
3277EXPORT_SYMBOL(block_sync_page); 3367EXPORT_SYMBOL(block_sync_page);
3278EXPORT_SYMBOL(block_truncate_page); 3368EXPORT_SYMBOL(block_truncate_page);
3279EXPORT_SYMBOL(block_write_full_page); 3369EXPORT_SYMBOL(block_write_full_page);
3370EXPORT_SYMBOL(block_write_full_page_endio);
3280EXPORT_SYMBOL(cont_write_begin); 3371EXPORT_SYMBOL(cont_write_begin);
3281EXPORT_SYMBOL(end_buffer_read_sync); 3372EXPORT_SYMBOL(end_buffer_read_sync);
3282EXPORT_SYMBOL(end_buffer_write_sync); 3373EXPORT_SYMBOL(end_buffer_write_sync);
3374EXPORT_SYMBOL(end_buffer_async_write);
3283EXPORT_SYMBOL(file_fsync); 3375EXPORT_SYMBOL(file_fsync);
3284EXPORT_SYMBOL(fsync_bdev);
3285EXPORT_SYMBOL(generic_block_bmap); 3376EXPORT_SYMBOL(generic_block_bmap);
3286EXPORT_SYMBOL(generic_cont_expand_simple); 3377EXPORT_SYMBOL(generic_cont_expand_simple);
3287EXPORT_SYMBOL(init_buffer); 3378EXPORT_SYMBOL(init_buffer);
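With block_page_mkwrite() now taking the struct vm_fault and returning VM_FAULT_* codes itself (see the hunks above), a filesystem's ->page_mkwrite handler can simply forward the result. A minimal hypothetical caller, for illustration only (myfs_page_mkwrite and myfs_get_block are invented names, not part of this patch):

	static int myfs_page_mkwrite(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
	{
		/* locks the page, maps and commits the buffers, and maps
		 * -ENOMEM/-ENOSPC/-EIO onto VM_FAULT_OOM/VM_FAULT_SIGBUS
		 * as shown above; VM_FAULT_LOCKED on success */
		return block_page_mkwrite(vma, vmf, myfs_get_block);
	}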
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7	  filesystems - primarily networking filesystems - thus allowing a
8	  fast local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
19	  enabled by setting bits in /sys/module/cachefiles/parameters/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
27 This option causes latency information to be gathered on CacheFiles
28	  operations and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146	 * the security ID of the files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
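The percent-to-absolute conversion in cachefiles_daemon_add_cache() is easiest to check with concrete numbers. A worked example under an assumed filesystem size (the default percentages shown are the ones set up in daemon.c below):

	/*
	 * Backing fs with f_files = 1,000,000 inodes and the default
	 * limits fstop=1%, fcull=5%, frun=7%:
	 *
	 *	do_div(stats.f_files, 100)	-> 10,000  (one percent)
	 *	fstop = 10,000 * 1 = 10,000	-> below 10,000 free files,
	 *					   refuse new objects (-ENOBUFS)
	 *	fcull = 10,000 * 5 = 50,000	-> below 50,000 free, cull
	 *	frun  = 10,000 * 7 = 70,000	-> above 70,000 free, stop
	 *					   culling
	 *
	 * The block limits are derived the same way from f_blocks, after
	 * shifting by bshift to convert filesystem blocks into pages.
	 */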
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223	if (datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388	if (fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460	if (bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
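Taken together, the open/read/write/poll handlers above define a small text protocol on the control file. A sketch of the userspace side in the spirit of cachefilesd, under the assumption that the character device is exposed as /dev/cachefiles (its registration lives in main.c, outside this hunk); error handling is minimal:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int cf_command(int fd, const char *cmd)
	{
		/* one command per write(); success echoes back the length */
		if (write(fd, cmd, strlen(cmd)) < 0) {
			perror(cmd);
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		struct pollfd p;
		char state[256];
		ssize_t n;
		int fd;

		fd = open("/dev/cachefiles", O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* configure, honouring stop < cull < run, then bind;
		 * releasing the fd later unbinds the cache */
		cf_command(fd, "dir /var/cache/fscache");
		cf_command(fd, "tag mycache");
		cf_command(fd, "brun 10%");
		cf_command(fd, "bcull 7%");
		cf_command(fd, "bstop 3%");
		cf_command(fd, "bind");

		p.fd = fd;
		p.events = POLLIN | POLLOUT;
		for (;;) {
			if (poll(&p, 1, -1) < 0)
				break;
			if (p.revents & POLLIN) {
				/* state changed: re-read the summary line */
				n = read(fd, state, sizeof(state) - 1);
				if (n > 0) {
					state[n] = '\0';
					printf("state: %s\n", state);
				}
			}
			if (p.revents & POLLOUT) {
				/* kernel wants space: chdir() into the cache
				 * and issue "cull <name>" / "inuse <name>" */
			}
		}
		return 0;
	}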
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
78	/* turn the raw key into something that can be used as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
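/* For reference: before being handed to cachefiles_cook_key() and then
 * reused as the auxiliary-data xattr, the temporary buffer built above
 * is laid out as:
 *
 *	bytes 0-1	uint16_t key length (asserted < 512)
 *	bytes 2 on	raw key from cookie->def->get_key()
 *	+3 zero bytes	padding so the 3-to-4 base64 grouping in key.c can
 *			safely read past the end of a short tail
 */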
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
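/* Rough object lifecycle as wired up by this table (summarised from the
 * functions above): alloc_object builds the record and cooks the key;
 * lookup_object walks to or creates the backing file; lookup_complete
 * discards the lookup data; grab_object takes an extra usage reference;
 * drop_object retires or deactivates the backing dentry; put_object drops
 * a reference and frees the record when the last one goes. */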
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..f7c255f9c624
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
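/* the bits OR together: e.g. a mask of 7, set with the daemon's
 * "debug 7" command (see daemon.c above) or via the module's debug
 * parameter, enables all three classes of message */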
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
67	unsigned			bshift;		/* ilog2(PAGE_SIZE / bsize); 0 if bsize >= PAGE_SIZE */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
192/*
193 * rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
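
The three tiers above trade cost for flexibility: with __KDEBUG every call traces unconditionally, with CONFIG_CACHEFILES_DEBUG each call is gated at runtime on bits in cachefiles_debug (exposed as the module's "debug" parameter, see main.c below), and by default the calls compile away entirely while _dbprintk keeps the format strings type-checked. A minimal userspace sketch of the mask-gated tier, using illustrative bit values rather than the kernel's CACHEFILES_DEBUG_* constants:

/* Userspace sketch of mask-gated function tracing; the bit values are
 * illustrative only. Build with: cc -o trace trace.c */
#include <stdio.h>

#define DBG_KENTER 0x1
#define DBG_KLEAVE 0x2

static unsigned debug_mask = DBG_KENTER | DBG_KLEAVE;

#define _enter(FMT, ...) \
do { \
	if (debug_mask & DBG_KENTER) \
		printf("==> %s(" FMT ")\n", __func__, ##__VA_ARGS__); \
} while (0)

#define _leave(FMT, ...) \
do { \
	if (debug_mask & DBG_KLEAVE) \
		printf("<== %s()" FMT "\n", __func__, ##__VA_ARGS__); \
} while (0)

static int double_it(int x)
{
	_enter("%d", x);
	_leave(" = %d", x * 2);
	return x * 2;
}

int main(void)
{
	double_it(21);
	debug_mask = 0;		/* silence tracing without recompiling */
	double_it(21);
	return 0;
}

Clearing a bit in the mask silences the corresponding tracepoints at runtime, which is exactly what the CONFIG_CACHEFILES_DEBUG variant buys over the always-on __KDEBUG build.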
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102		for (loop = keylen - 2; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
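
For keys that fail the printable-character test, the encoder packs each group of three raw bytes little-endian into an accumulator and emits four 6-bit indices into cachefiles_charmap. A standalone sketch of just that step (the checksum prefix, segment dividers and type suffix handled above are omitted):

/* Userspace sketch of the 3-raw-bytes -> 4-cooked-chars mapping used
 * for non-printable keys; same charmap, same bit order as above. */
#include <stdio.h>

static const char charmap[64] =
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"_-";

int main(void)
{
	const unsigned char raw[3] = { 0xde, 0xad, 0xbe };
	unsigned int acc = raw[0] | raw[1] << 8 | raw[2] << 16;
	char out[5];
	int i;

	for (i = 0; i < 4; i++) {	/* low six bits first */
		out[i] = charmap[acc & 63];
		acc >>= 6;
	}
	out[4] = '\0';
	printf("%02x%02x%02x -> %s\n", raw[0], raw[1], raw[2], out);
	return 0;
}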
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
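
Registering a dynamic miscdevice named "cachefiles" makes the control interface appear as /dev/cachefiles; the command protocol spoken over it is implemented by cachefiles_daemon_fops in daemon.c, which is not part of this excerpt. A minimal, hypothetical userspace probe (the device path is implied by the miscdevice name; any actual daemon commands are defined elsewhere in this series):

/* Hypothetical sketch: verify the CacheFiles control device can be
 * opened; real daemon traffic goes through cachefiles_daemon_fops. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/cachefiles", O_RDWR);

	if (fd < 0) {
		perror("open /dev/cachefiles");
		return 1;
	}
	printf("CacheFiles control device is present\n");
	close(fd);
	return 0;
}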
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file backed objects are unlinked
122 * - directory backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}
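
The grave name is sixteen hex digits - wall-clock seconds plus a global counter - and a collision with an existing grave just provokes another lap of the try_again loop. A userspace analogue of that retry discipline, assuming glibc 2.28+ for renameat2(); the lock_rename()/trap deadlock-avoidance checks have no userspace counterpart and are left out:

/* Userspace analogue of the graveyard move: generate a
 * "<seconds><counter>" grave name, refuse to clobber an existing
 * grave, and retry on collision like the kernel loop above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <time.h>
#include <errno.h>
#include <stdint.h>

static int bury(const char *victim, const char *graveyard)
{
	static uint32_t counter;
	char grave[256];

	for (;;) {
		snprintf(grave, sizeof(grave), "%s/%08x%08x", graveyard,
			 (uint32_t) time(NULL), ++counter);
		if (renameat2(AT_FDCWD, victim, AT_FDCWD, grave,
			      RENAME_NOREPLACE) == 0)
			return 0;
		if (errno != EEXIST)
			return -1;
	}
}

int main(void)
{
	return bury("stale-object", "graveyard") ? 1 : 0;
}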
239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290		_leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
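
The advance loop above consumes the cooked key as a list of NUL-separated path components terminated by a double NUL (key.c appends the two trailing NULs). A sketch of walking a key in that format; the component strings here are invented for illustration:

/* Sketch of the key-walking convention: NUL-separated components,
 * list terminated by a double NUL (the string literal's implicit
 * terminator supplies the second one). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char key[] = "@5c\0Iparent\0Dleaf\0";
	const char *p;

	for (p = key; *p; p += strlen(p) + 1)
		printf("component: '%s'\n", p);
	return 0;
}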
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
601
602/*
603 * find out if an object is in use or not
604 * - if the object is found and is not in use:
605 * - returns the victim dentry with a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
753 * - returns -EBUSY or 0 to indicate whether an object is in use or not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
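
Both cachefiles_mark_object_active() and cachefiles_check_active() treat cache->active_nodes as an ordered set keyed by nothing more than the numeric value of the dentry pointer. A userspace sketch of the same idea, with tsearch(3) standing in for the kernel rbtree; only a consistent total order matters, which is why comparing raw pointers suffices:

/* Userspace sketch of an "active set" ordered by raw pointer value,
 * mirroring the dentry-pointer comparisons in the rbtree walks above. */
#include <stdio.h>
#include <search.h>

static int ptr_cmp(const void *a, const void *b)
{
	return (a > b) - (a < b);
}

int main(void)
{
	void *root = NULL;
	int obj_a, obj_b;	/* stand-ins for two dentries */

	tsearch(&obj_a, &root, ptr_cmp);
	tsearch(&obj_b, &root, ptr_cmp);

	printf("obj_a active? %s\n", tfind(&obj_a, &root, ptr_cmp) ? "yes" : "no");
	tdelete(&obj_a, &root, ptr_cmp);
	printf("obj_a active? %s\n", tfind(&obj_a, &root, ptr_cmp) ? "yes" : "no");
	return 0;
}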
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56 if ((unsigned long long)*_pos >= HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
88 * open "/proc/fs/cachefiles/XXX", which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
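
The iterator maps seq positions 1 and 2 to the two header lines and position n+3 to jiffy bucket n, so the proc file reads as one row per latency bucket that recorded any events. Dumping it is just a read of the file created by cachefiles_proc_init():

/* Sketch: dump /proc/fs/cachefiles/histogram, whose rows come from
 * cachefiles_histogram_show() (bucket index = seq position - 3). */
#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/fs/cachefiles/histogram", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}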
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
134 * read the backing file page corresponding to the given netfs page
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if we ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343 * - TODO: don't use bmap() for this, as it is _not_ actually good
344 * enough: it doesn't indicate errors, but it's all we've
345 * got for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
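
The bmap probe depends on the shift computed above: a page index becomes a backing-filesystem block number by shifting left by PAGE_SHIFT minus the superblock's block-size bits. A worked example under assumed sizes - 4096-byte pages over 1024-byte blocks gives shift = 2, so page 5 starts at block 20:

/* Worked example of the page-index -> block-number shift, assuming
 * 4096-byte pages (PAGE_SHIFT = 12) and 1024-byte blocks (10 bits). */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;
	unsigned int blocksize_bits = 10;
	unsigned int shift = page_shift - blocksize_bits;
	unsigned long long index = 5;	/* page->index */

	printf("page %llu -> first block %llu\n", index, index << shift);
	return 0;
}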
372
373/*
374 * read the corresponding pages to the given set from the backing file
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654 * - TODO: don't use bmap() for this, as it is _not_ actually
655 * good enough: it doesn't indicate errors, but
656 * it's all we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
706 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
746 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
51
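The cred struct built here is not used directly; it is swapped in around each cache operation via the kernel's credential-override API. cachefiles_begin_secure()/cachefiles_end_secure(), seen below, are presumably thin wrappers over this pattern (a sketch; the helper name is illustrative):

    #include <linux/cred.h>

    /* sketch: run a VFS operation under the cache's identity */
    static void do_as_cache(struct cachefiles_cache *cache)
    {
            const struct cred *saved;

            saved = override_creds(cache->cache_cred);  /* swap in */
            /* ... vfs_mkdir()/vfs_create()/etc. on the backing fs ... */
            revert_creds(saved);                        /* swap back */
    }
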
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63 " Security denies permission to make dirs: error %d",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71 " Security denies permission to create files: error %d",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
99 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
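Because the label is an ordinary user.* xattr on the backing file, it can be examined from userspace when debugging a cache. A hypothetical inspection tool using getxattr(2) (the cache path is supplied by the user; older C libraries export the call from <attr/xattr.h> instead):

    #include <stdio.h>
    #include <sys/xattr.h>

    /* hex-dump the CacheFiles xattr of a backing-store object */
    int main(int argc, char **argv)
    {
            unsigned char buf[515];
            ssize_t i, n;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <cache object>\n", argv[0]);
                    return 1;
            }

            n = getxattr(argv[1], "user.CacheFiles.cache",
                         buf, sizeof(buf));
            if (n < 0) {
                    perror("getxattr");
                    return 1;
            }

            /* the leading bytes carry the type label, e.g. "C3" */
            for (i = 0; i < n; i++)
                    printf("%02x", buf[i]);
            putchar('\n');
            return 0;
    }
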
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
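
The fscache_check_aux() consultation above is where a netfs gets to veto a cache object. On the netfs side that means supplying a check_aux callback in its cookie definition; a hypothetical one comparing a stored version counter might look like this (struct and field names beyond the fscache API are invented for illustration):

    #include <linux/fscache.h>

    struct myfs_aux {
            __u64 data_version;     /* hypothetical coherency datum */
    };

    /* decide whether the on-disk copy is still usable */
    static enum fscache_checkaux
    myfs_check_aux(void *cookie_netfs_data, const void *data,
                   uint16_t datalen)
    {
            const struct myfs_aux *disk = data;
            const struct myfs_aux *live = cookie_netfs_data;

            if (datalen != sizeof(*disk))
                    return FSCACHE_CHECKAUX_OBSOLETE;   /* cull it */

            if (disk->data_version != live->data_version)
                    return FSCACHE_CHECKAUX_OBSOLETE;   /* stale */

            return FSCACHE_CHECKAUX_OKAY;
    }

Returning FSCACHE_CHECKAUX_NEEDS_UPDATE instead would take the XATTR_REPLACE branch above, refreshing the on-disk label rather than discarding the object.
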
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 65984006192c..f20c4069c220 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,16 @@
1Version 1.58
2------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
4when the UTF-8 string is composed of unusually long (more than 4 byte) converted
5characters. Add support for mounting the root of a share which redirects
6immediately to a DFS target. Convert the Unicode string conversion functions to
7measure string length more accurately before allocating memory (which may help the
8rare cases where a UTF-8 string is much larger than the UCS2 string that
9we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to the same server from different
11userids. Raw NTLMSSP fixed (it requires the /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security).
13
1Version 1.57 14Version 1.57
2------------ 15------------
3Improve support for multiple security contexts to the same server. We 16Improve support for multiple security contexts to the same server. We
@@ -15,7 +28,8 @@ Posix file open support added (turned off after one attempt if server
15fails to support it properly, as with Samba server versions prior to 3.3.2) 28fails to support it properly, as with Samba server versions prior to 3.3.2)
16Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too 29Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
17little memory for the "nativeFileSystem" field returned by the server 30little memory for the "nativeFileSystem" field returned by the server
18during mount). 31during mount). Endian convert inode numbers if necessary (makes it easier
32to compare inode numbers on network files from big endian systems).
19 33
20Version 1.56 34Version 1.56
21------------ 35------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 07434181623b..db208ddb9899 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -651,7 +651,15 @@ Experimental When set to 1 used to enable certain experimental
651 signing turned on in case buffer was modified 651 signing turned on in case buffer was modified
652 just before it was sent, also this flag will 652 just before it was sent, also this flag will
653 be used to use the new experimental directory change 653 be used to use the new experimental directory change
654 notification code). 654 notification code). When set to 2, it enables
655 an additional experimental feature, "raw ntlmssp"
656 session establishment support (which allows
657 specifying "sec=ntlmssp" on mount). The Linux cifs
658 module will use ntlmv2 authentication encapsulated
659 in "raw ntlmssp" (not using SPNEGO) when
660 "sec=ntlmssp" is specified on mount.
661 This support also requires building cifs with
662 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
655 663
656These experimental features and tracing can be enabled by changing flags in 664These experimental features and tracing can be enabled by changing flags in
657/proc/fs/cifs (after the cifs module has been installed or built into the 665/proc/fs/cifs (after the cifs module has been installed or built into the
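Condensing the README text above into code: raise the experimental flag to 2, then mount with sec=ntlmssp. A hypothetical sketch via mount(2) (server, share, mountpoint and user are placeholders, and a real mount also needs credentials via mount.cifs or a password= option):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            FILE *f = fopen("/proc/fs/cifs/experimental", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("2\n", f);        /* enable the second-tier features */
            fclose(f);

            /* raw-ntlmssp session establishment, per the README */
            if (mount("//server/share", "/mnt/cifs", "cifs", 0,
                      "sec=ntlmssp,user=someuser")) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
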
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 5fdbf8a14472..83d62759c7c7 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -340,28 +340,24 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
341 341
342 for (i = 0; i < num_referrals; i++) { 342 for (i = 0; i < num_referrals; i++) {
343 int len;
343 dump_referral(referrals+i); 344 dump_referral(referrals+i);
344 /* connect to a storage node */ 345 /* connect to a node */
345 if (referrals[i].flags & DFSREF_STORAGE_SERVER) { 346 len = strlen(referrals[i].node_name);
346 int len; 347 if (len < 2) {
347 len = strlen(referrals[i].node_name); 348 cERROR(1, ("%s: Net Address path too short: %s",
348 if (len < 2) {
349 cERROR(1, ("%s: Net Address path too short: %s",
350 __func__, referrals[i].node_name)); 349 __func__, referrals[i].node_name));
351 rc = -EINVAL; 350 rc = -EINVAL;
352 goto out_err; 351 goto out_err;
353 } 352 }
354 mnt = cifs_dfs_do_refmount(nd->path.mnt, 353 mnt = cifs_dfs_do_refmount(nd->path.mnt,
355 nd->path.dentry, 354 nd->path.dentry, referrals + i);
356 referrals + i); 355 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
357 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
358 __func__,
359 referrals[i].node_name, mnt)); 356 referrals[i].node_name, mnt));
360 357
361 /* complete mount procedure if we acquired a submount */ 358
362 if (!IS_ERR(mnt)) 359 if (!IS_ERR(mnt))
363 break; 360 break;
364 }
365 } 361 }
366 362
367 /* we need it because the for() above could exit without a valid submount */ 363
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 3fd3a9df043a..67bf93a40d2e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
41 41
42 /* attach the data */ 42 /* attach the data */
43 memcpy(payload, data, datalen); 43 memcpy(payload, data, datalen);
44 rcu_assign_pointer(key->payload.data, payload); 44 key->payload.data = payload;
45 ret = 0; 45 ret = 0;
46 46
47error: 47error:
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7d75272a6b3f..60e3c4253de0 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifs_unicode.c 2 * fs/cifs/cifs_unicode.c
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2000,2005 4 * Copyright (c) International Business Machines Corp., 2000,2009
5 * Modified by Steve French (sfrench@us.ibm.com) 5 * Modified by Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -26,31 +26,157 @@
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27 27
28/* 28/*
29 * NAME: cifs_strfromUCS() 29 * cifs_ucs2_bytes - how long will a string be after conversion?
30 * 30 * @ucs - pointer to input string
31 * FUNCTION: Convert little-endian unicode string to character string 31 * @maxbytes - don't go past this many bytes of input string
32 * @codepage - destination codepage
32 * 33 *
34 * Walk a ucs2le string and return the number of bytes that the string will
35 * be after being converted to the given charset, not including any null
36 * termination required. Don't walk past maxbytes in the source buffer.
33 */ 37 */
34int 38int
35cifs_strfromUCS_le(char *to, const __le16 *from, 39cifs_ucs2_bytes(const __le16 *from, int maxbytes,
36 int len, const struct nls_table *codepage) 40 const struct nls_table *codepage)
37{ 41{
38 int i; 42 int i;
39 int outlen = 0; 43 int charlen, outlen = 0;
44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE];
40 46
41 for (i = 0; (i < len) && from[i]; i++) { 47 for (i = 0; from[i] && i < maxwords; i++) {
42 int charlen; 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
43 /* 2.4.0 kernel or greater */ 49 NLS_MAX_CHARSET_SIZE);
44 charlen = 50 if (charlen > 0)
45 codepage->uni2char(le16_to_cpu(from[i]), &to[outlen],
46 NLS_MAX_CHARSET_SIZE);
47 if (charlen > 0) {
48 outlen += charlen; 51 outlen += charlen;
49 } else { 52 else
50 to[outlen++] = '?'; 53 outlen++;
54 }
55
56 return outlen;
57}
58
59/*
60 * cifs_mapchar - convert a little-endian char to proper char in codepage
61 * @target - where converted character should be copied
62 * @src_char - 2 byte little-endian source character
63 * @cp - codepage to which character should be converted
64 * @mapchar - should character be mapped according to mapchars mount option?
65 *
66 * This function handles the conversion of a single character. It is the
67 * responsibility of the caller to ensure that the target buffer is large
68 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
69 */
70static int
71cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
72 bool mapchar)
73{
74 int len = 1;
75
76 if (!mapchar)
77 goto cp_convert;
78
79 /*
80 * BB: Cannot handle remapping UNI_SLASH until all the calls to
81 * build_path_from_dentry are modified, as they use slash as
82 * separator.
83 */
84 switch (le16_to_cpu(src_char)) {
85 case UNI_COLON:
86 *target = ':';
87 break;
88 case UNI_ASTERIK:
89 *target = '*';
90 break;
91 case UNI_QUESTION:
92 *target = '?';
93 break;
94 case UNI_PIPE:
95 *target = '|';
96 break;
97 case UNI_GRTRTHAN:
98 *target = '>';
99 break;
100 case UNI_LESSTHAN:
101 *target = '<';
102 break;
103 default:
104 goto cp_convert;
105 }
106
107out:
108 return len;
109
110cp_convert:
111 len = cp->uni2char(le16_to_cpu(src_char), target,
112 NLS_MAX_CHARSET_SIZE);
113 if (len <= 0) {
114 *target = '?';
115 len = 1;
116 }
117 goto out;
118}
119
120/*
121 * cifs_from_ucs2 - convert utf16le string to local charset
122 * @to - destination buffer
123 * @from - source buffer
124 * @tolen - destination buffer size (in bytes)
125 * @fromlen - source buffer size (in bytes)
126 * @codepage - codepage to which characters should be converted
127 * @mapchar - should characters be remapped according to the mapchars option?
128 *
129 * Convert a little-endian ucs2le string (as sent by the server) to a string
130 * in the provided codepage. The tolen and fromlen parameters are to ensure
131 * that the code doesn't walk off of the end of the buffer (which is always
132 * a danger if the alignment of the source buffer is off). The destination
133 * string is always properly null terminated and fits in the destination
134 * buffer. Returns the length of the destination string in bytes (including
135 * null terminator).
136 *
137 * Note that some Windows versions actually send multiword UTF-16 characters
138 * instead of straight UCS-2. The Linux NLS routines, however, aren't able to
139 * deal with those characters properly. In the event that we get some of
140 * those characters, they won't be translated properly.
141 */
142int
143cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
144 const struct nls_table *codepage, bool mapchar)
145{
146 int i, charlen, safelen;
147 int outlen = 0;
148 int nullsize = nls_nullsize(codepage);
149 int fromwords = fromlen / 2;
150 char tmp[NLS_MAX_CHARSET_SIZE];
151
152 /*
153 * because the chars can be of varying widths, we need to take care
154 * not to overflow the destination buffer when we get close to the
155 * end of it. Until we get to this offset, we don't need to check
156 * for overflow however.
157 */
158 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
159
160 for (i = 0; i < fromwords && from[i]; i++) {
161 /*
162 * check to see if converting this character might make the
163 * conversion bleed into the null terminator
164 */
165 if (outlen >= safelen) {
166 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
167 if ((outlen + charlen) > (tolen - nullsize))
168 break;
51 } 169 }
170
171 /* put converted char into 'to' buffer */
172 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
173 outlen += charlen;
52 } 174 }
53 to[outlen] = 0; 175
176 /* properly null-terminate string */
177 for (i = 0; i < nullsize; i++)
178 to[outlen++] = 0;
179
54 return outlen; 180 return outlen;
55} 181}
56 182
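The safelen guard in cifs_from_ucs2() above is easiest to see with numbers. NLS_MAX_CHARSET_SIZE is 6 in <linux/nls.h>, so with a 64-byte destination and a 1-byte nul, safelen = 64 - (6 + 1) = 57: any character committed while outlen is below 57 cannot overrun even in the worst case, and from 57 onward each candidate is first converted into the scratch buffer to prove it fits under tolen - nullsize. A self-checking restatement of that arithmetic:

    #include <assert.h>

    int main(void)
    {
            int tolen = 64, nullsize = 1, max_char = 6;
            int safelen = tolen - (max_char + nullsize);    /* 57 */

            /* worst case below the threshold: outlen 56 + 6 = 62,
             * still <= tolen - nullsize = 63 */
            assert(safelen == 57);
            assert((safelen - 1) + max_char <= tolen - nullsize);
            return 0;
    }
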
@@ -88,3 +214,41 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
88 return i; 214 return i;
89} 215}
90 216
217/*
218 * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
219 * @src - source string
220 * @maxlen - don't walk past this many bytes in the source string
221 * @is_unicode - is this a unicode string?
222 * @codepage - destination codepage
223 *
224 * Take a string given by the server, convert it to the local codepage and
225 * put it in a new buffer. Returns a pointer to the new string or NULL on
226 * error.
227 */
228char *
229cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
230 const struct nls_table *codepage)
231{
232 int len;
233 char *dst;
234
235 if (is_unicode) {
236 len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
237 len += nls_nullsize(codepage);
238 dst = kmalloc(len, GFP_KERNEL);
239 if (!dst)
240 return NULL;
241 cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
242 false);
243 } else {
244 len = strnlen(src, maxlen);
245 len++;
246 dst = kmalloc(len, GFP_KERNEL);
247 if (!dst)
248 return NULL;
249 strlcpy(dst, src, len);
250 }
251
252 return dst;
253}
254
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 14eb9a2395d3..650638275a6f 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -5,7 +5,7 @@
5 * Convert a unicode character to upper or lower case using 5 * Convert a unicode character to upper or lower case using
6 * compressed tables. 6 * compressed tables.
7 * 7 *
8 * Copyright (c) International Business Machines Corp., 2000,2007 8 * Copyright (c) International Business Machines Corp., 2000,2009
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by 11 * it under the terms of the GNU General Public License as published by
@@ -37,6 +37,19 @@
37 37
38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ 38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
39 39
40/*
41 * Windows maps these to the user-defined 16-bit Unicode range since they are
42 * reserved symbols (along with \ and /), otherwise illegal to store
43 * in filenames in NTFS
44 */
45#define UNI_ASTERIK (__u16) ('*' + 0xF000)
46#define UNI_QUESTION (__u16) ('?' + 0xF000)
47#define UNI_COLON (__u16) (':' + 0xF000)
48#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
49#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
50#define UNI_PIPE (__u16) ('|' + 0xF000)
51#define UNI_SLASH (__u16) ('\\' + 0xF000)
52
40/* Just define what we want from uniupr.h. We don't want to define the tables 53/* Just define what we want from uniupr.h. We don't want to define the tables
41 * in each source file. 54 * in each source file.
42 */ 55 */
@@ -59,8 +72,14 @@ extern struct UniCaseRange UniLowerRange[];
59#endif /* UNIUPR_NOLOWER */ 72#endif /* UNIUPR_NOLOWER */
60 73
61#ifdef __KERNEL__ 74#ifdef __KERNEL__
62int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *); 75int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
76 const struct nls_table *codepage, bool mapchar);
77int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
78 const struct nls_table *codepage);
63int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); 79int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
80char *cifs_strndup_from_ucs(const char *src, const int maxlen,
81 const bool is_unicode,
82 const struct nls_table *codepage);
64#endif 83#endif
65 84
66/* 85/*
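
The UNI_* constants above encode Windows' convention of parking the reserved filename characters in the private-use area at 0xF000 plus the ASCII code, which is what cifs_mapchar() undoes when the mapchars mount option is set. The mapping rule in isolation (a sketch; slash is excluded for the reason given in the BB comment):

    /* sketch: map a private-use-area code point back to ASCII */
    static char reserved_unmap(unsigned short uc)
    {
            switch (uc) {
            case '*' + 0xF000: return '*';
            case '?' + 0xF000: return '?';
            case ':' + 0xF000: return ':';
            case '>' + 0xF000: return '>';
            case '<' + 0xF000: return '<';
            case '|' + 0xF000: return '|';
            default:           return 0;   /* not a remapped char */
            }
    }
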
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38491fd3871d..5e6d35804d73 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,6 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h>
38#include "cifsfs.h" 39#include "cifsfs.h"
39#include "cifspdu.h" 40#include "cifspdu.h"
40#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -66,9 +67,6 @@ unsigned int sign_CIFS_PDUs = 1;
66extern struct task_struct *oplockThread; /* remove sparse warning */ 67extern struct task_struct *oplockThread; /* remove sparse warning */
67struct task_struct *oplockThread = NULL; 68struct task_struct *oplockThread = NULL;
68/* extern struct task_struct * dnotifyThread; remove sparse warning */ 69/* extern struct task_struct * dnotifyThread; remove sparse warning */
69#ifdef CONFIG_CIFS_EXPERIMENTAL
70static struct task_struct *dnotifyThread = NULL;
71#endif
72static const struct super_operations cifs_super_ops; 70static const struct super_operations cifs_super_ops;
73unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
74module_param(CIFSMaxBufSize, int, 0); 72module_param(CIFSMaxBufSize, int, 0);
@@ -316,6 +314,7 @@ cifs_alloc_inode(struct super_block *sb)
316 cifs_inode->clientCanCacheAll = false; 314 cifs_inode->clientCanCacheAll = false;
317 cifs_inode->delete_pending = false; 315 cifs_inode->delete_pending = false;
318 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
317 cifs_inode->server_eof = 0;
319 318
320 /* Can not set i_flags here - they get immediately overwritten 319 /* Can not set i_flags here - they get immediately overwritten
321 to zero by the VFS */ 320 to zero by the VFS */
@@ -532,6 +531,7 @@ static void cifs_umount_begin(struct super_block *sb)
532 if (tcon == NULL) 531 if (tcon == NULL)
533 return; 532 return;
534 533
534 lock_kernel();
535 read_lock(&cifs_tcp_ses_lock); 535 read_lock(&cifs_tcp_ses_lock);
536 if (tcon->tc_count == 1) 536 if (tcon->tc_count == 1)
537 tcon->tidStatus = CifsExiting; 537 tcon->tidStatus = CifsExiting;
@@ -550,6 +550,7 @@ static void cifs_umount_begin(struct super_block *sb)
550 } 550 }
551/* BB FIXME - finish add checks for tidStatus BB */ 551/* BB FIXME - finish add checks for tidStatus BB */
552 552
553 unlock_kernel();
553 return; 554 return;
554} 555}
555 556
@@ -601,8 +602,7 @@ cifs_get_sb(struct file_system_type *fs_type,
601 602
602 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 603 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
603 if (rc) { 604 if (rc) {
604 up_write(&sb->s_umount); 605 deactivate_locked_super(sb);
605 deactivate_super(sb);
606 return rc; 606 return rc;
607 } 607 }
608 sb->s_flags |= MS_ACTIVE; 608 sb->s_flags |= MS_ACTIVE;
@@ -1040,34 +1040,6 @@ static int cifs_oplock_thread(void *dummyarg)
1040 return 0; 1040 return 0;
1041} 1041}
1042 1042
1043#ifdef CONFIG_CIFS_EXPERIMENTAL
1044static int cifs_dnotify_thread(void *dummyarg)
1045{
1046 struct list_head *tmp;
1047 struct TCP_Server_Info *server;
1048
1049 do {
1050 if (try_to_freeze())
1051 continue;
1052 set_current_state(TASK_INTERRUPTIBLE);
1053 schedule_timeout(15*HZ);
1054 /* check if any stuck requests that need
1055 to be woken up and wakeq so the
1056 thread can wake up and error out */
1057 read_lock(&cifs_tcp_ses_lock);
1058 list_for_each(tmp, &cifs_tcp_ses_list) {
1059 server = list_entry(tmp, struct TCP_Server_Info,
1060 tcp_ses_list);
1061 if (atomic_read(&server->inFlight))
1062 wake_up_all(&server->response_q);
1063 }
1064 read_unlock(&cifs_tcp_ses_lock);
1065 } while (!kthread_should_stop());
1066
1067 return 0;
1068}
1069#endif
1070
1071static int __init 1043static int __init
1072init_cifs(void) 1044init_cifs(void)
1073{ 1045{
@@ -1144,21 +1116,8 @@ init_cifs(void)
1144 goto out_unregister_dfs_key_type; 1116 goto out_unregister_dfs_key_type;
1145 } 1117 }
1146 1118
1147#ifdef CONFIG_CIFS_EXPERIMENTAL
1148 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
1149 if (IS_ERR(dnotifyThread)) {
1150 rc = PTR_ERR(dnotifyThread);
1151 cERROR(1, ("error %d create dnotify thread", rc));
1152 goto out_stop_oplock_thread;
1153 }
1154#endif
1155
1156 return 0; 1119 return 0;
1157 1120
1158#ifdef CONFIG_CIFS_EXPERIMENTAL
1159 out_stop_oplock_thread:
1160#endif
1161 kthread_stop(oplockThread);
1162 out_unregister_dfs_key_type: 1121 out_unregister_dfs_key_type:
1163#ifdef CONFIG_CIFS_DFS_UPCALL 1122#ifdef CONFIG_CIFS_DFS_UPCALL
1164 unregister_key_type(&key_type_dns_resolver); 1123 unregister_key_type(&key_type_dns_resolver);
@@ -1196,9 +1155,6 @@ exit_cifs(void)
1196 cifs_destroy_inodecache(); 1155 cifs_destroy_inodecache();
1197 cifs_destroy_mids(); 1156 cifs_destroy_mids();
1198 cifs_destroy_request_bufs(); 1157 cifs_destroy_request_bufs();
1199#ifdef CONFIG_CIFS_EXPERIMENTAL
1200 kthread_stop(dnotifyThread);
1201#endif
1202 kthread_stop(oplockThread); 1158 kthread_stop(oplockThread);
1203} 1159}
1204 1160
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 77e190dc2883..051b71cfdea9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.57" 103#define CIFS_VERSION "1.58"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9fbf4dff5da6..a61ab772c6f6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -82,8 +82,8 @@ enum securityEnum {
82 LANMAN, /* Legacy LANMAN auth */ 82 LANMAN, /* Legacy LANMAN auth */
83 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 83 NTLM, /* Legacy NTLM012 auth with NTLM hash */
84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
85 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
86 NTLMSSP, /* NTLMSSP via SPNEGO */ 86 NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */
87 Kerberos, /* Kerberos via SPNEGO */ 87 Kerberos, /* Kerberos via SPNEGO */
88 MSKerberos, /* MS Kerberos via SPNEGO */ 88 MSKerberos, /* MS Kerberos via SPNEGO */
89}; 89};
@@ -350,7 +350,7 @@ struct cifsFileInfo {
350 bool invalidHandle:1; /* file closed via session abend */ 350 bool invalidHandle:1; /* file closed via session abend */
351 bool messageMode:1; /* for pipes: message vs byte mode */ 351 bool messageMode:1; /* for pipes: message vs byte mode */
352 atomic_t wrtPending; /* handle in use - defer close */ 352 atomic_t wrtPending; /* handle in use - defer close */
353 struct semaphore fh_sem; /* prevents reopen race after dead ses*/ 353 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
354 struct cifs_search_info srch_inf; 354 struct cifs_search_info srch_inf;
355}; 355};
356 356
@@ -370,6 +370,7 @@ struct cifsInodeInfo {
370 bool clientCanCacheAll:1; /* read and writebehind oplock */ 370 bool clientCanCacheAll:1; /* read and writebehind oplock */
371 bool oplockPending:1; 371 bool oplockPending:1;
372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
373 u64 server_eof; /* current file size on server */
373 struct inode vfs_inode; 374 struct inode vfs_inode;
374}; 375};
375 376
@@ -530,6 +531,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
530#define CIFSSEC_MAY_PLNTXT 0 531#define CIFSSEC_MAY_PLNTXT 0
531#endif /* weak passwords */ 532#endif /* weak passwords */
532#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ 533#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
534#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */
533 535
534#define CIFSSEC_MUST_SIGN 0x01001 536#define CIFSSEC_MUST_SIGN 0x01001
535/* note that only one of the following can be set so the 537/* note that only one of the following can be set so the
@@ -542,22 +544,23 @@ require use of the stronger protocol */
542#define CIFSSEC_MUST_LANMAN 0x10010 544#define CIFSSEC_MUST_LANMAN 0x10010
543#define CIFSSEC_MUST_PLNTXT 0x20020 545#define CIFSSEC_MUST_PLNTXT 0x20020
544#ifdef CONFIG_CIFS_UPCALL 546#ifdef CONFIG_CIFS_UPCALL
545#define CIFSSEC_MASK 0x3F03F /* allows weak security but also krb5 */ 547#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */
546#else 548#else
547#define CIFSSEC_MASK 0x37037 /* current flags supported if weak */ 549#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */
548#endif /* UPCALL */ 550#endif /* UPCALL */
549#else /* do not allow weak pw hash */ 551#else /* do not allow weak pw hash */
550#ifdef CONFIG_CIFS_UPCALL 552#ifdef CONFIG_CIFS_UPCALL
551#define CIFSSEC_MASK 0x0F00F /* flags supported if no weak allowed */ 553#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
552#else 554#else
553#define CIFSSEC_MASK 0x07007 /* flags supported if no weak allowed */ 555#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
554#endif /* UPCALL */ 556#endif /* UPCALL */
555#endif /* WEAK_PW_HASH */ 557#endif /* WEAK_PW_HASH */
556#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 558#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
559#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
557 560
558#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) 561#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
559#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 562#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
560#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) 563#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
561/* 564/*
562 ***************************************************************** 565 *****************************************************************
563 * All constants go here 566 * All constants go here
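
A pattern worth noting in the flag changes above: each mechanism's MAY flag occupies the low 12 bits and its MUST counterpart is simply MAY | (MAY << 12), so CIFSSEC_MUST_NTLMSSP (0x80080) follows mechanically from CIFSSEC_MAY_NTLMSSP (0x00080), just as 0x01001 follows from 0x00001 for signing. A quick check of that observed relation:

    #include <stdio.h>

    #define MAY_NTLMSSP     0x00080
    #define MUST_OF(may)    ((may) | ((may) << 12)) /* observed rule */

    int main(void)
    {
            /* prints 0x80080, matching CIFSSEC_MUST_NTLMSSP */
            printf("0x%05x\n", MUST_OF(MAY_NTLMSSP));
            return 0;
    }
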
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b370489c8da5..a785f69dbc9f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2163,7 +2163,7 @@ typedef struct {
2163 __le32 Type; 2163 __le32 Type;
2164 __le64 DevMajor; 2164 __le64 DevMajor;
2165 __le64 DevMinor; 2165 __le64 DevMinor;
2166 __u64 UniqueId; 2166 __le64 UniqueId;
2167 __le64 Permissions; 2167 __le64 Permissions;
2168 __le64 Nlinks; 2168 __le64 Nlinks;
2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ 2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */
@@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */
2308} __attribute__((packed)); 2308} __attribute__((packed));
2309 2309
2310struct file_internal_info { 2310struct file_internal_info {
2311 __u64 UniqueId; /* inode number */ 2311 __le64 UniqueId; /* inode number */
2312} __attribute__((packed)); /* level 0x3ee */ 2312} __attribute__((packed)); /* level 0x3ee */
2313 2313
2314struct file_mode_info { 2314struct file_mode_info {
@@ -2338,7 +2338,7 @@ typedef struct {
2338 __le32 Type; 2338 __le32 Type;
2339 __le64 DevMajor; 2339 __le64 DevMajor;
2340 __le64 DevMinor; 2340 __le64 DevMinor;
2341 __u64 UniqueId; 2341 __le64 UniqueId;
2342 __le64 Permissions; 2342 __le64 Permissions;
2343 __le64 Nlinks; 2343 __le64 Nlinks;
2344 char FileName[1]; 2344 char FileName[1];
@@ -2386,7 +2386,7 @@ typedef struct {
2386 __le32 FileNameLength; 2386 __le32 FileNameLength;
2387 __le32 EaSize; /* EA size */ 2387 __le32 EaSize; /* EA size */
2388 __le32 Reserved; 2388 __le32 Reserved;
2389 __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ 2389 __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
2390 char FileName[1]; 2390 char FileName[1];
2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ 2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
2392 2392
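The __u64 to __le64 change is not cosmetic: it tells sparse that UniqueId arrives in wire byte order, so every use must pass through le64_to_cpu(), as the GetInodeNumberRetry hunk further down now does. Without the conversion a big-endian host would see byte-swapped inode numbers. A one-line illustration:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /* sketch: the server sends UniqueId little-endian on the wire */
    static u64 unique_id_to_host(__le64 wire)
    {
            /* a wire value of 2 would read as 0x0200000000000000
             * on big-endian hardware without this conversion */
            return le64_to_cpu(wire);
    }
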
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 4167716d32f2..fae083930eee 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -260,8 +260,7 @@ extern int CIFSUnixCreateSymLink(const int xid,
260 const struct nls_table *nls_codepage); 260 const struct nls_table *nls_codepage);
261extern int CIFSSMBUnixQuerySymLink(const int xid, 261extern int CIFSSMBUnixQuerySymLink(const int xid,
262 struct cifsTconInfo *tcon, 262 struct cifsTconInfo *tcon,
263 const unsigned char *searchName, 263 const unsigned char *searchName, char **syminfo,
264 char *syminfo, const int buflen,
265 const struct nls_table *nls_codepage); 264 const struct nls_table *nls_codepage);
266extern int CIFSSMBQueryReparseLinkInfo(const int xid, 265extern int CIFSSMBQueryReparseLinkInfo(const int xid,
267 struct cifsTconInfo *tcon, 266 struct cifsTconInfo *tcon,
@@ -307,8 +306,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
307 const unsigned char *searchName, __u64 *inode_number, 306 const unsigned char *searchName, __u64 *inode_number,
308 const struct nls_table *nls_codepage, 307 const struct nls_table *nls_codepage,
309 int remap_special_chars); 308 int remap_special_chars);
310extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
311 const struct nls_table *codepage);
312extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 309extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
313 const struct nls_table *cp, int mapChars); 310 const struct nls_table *cp, int mapChars);
314 311
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bc09c998631f..d06260251c30 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -81,41 +81,6 @@ static struct {
81#endif /* CONFIG_CIFS_WEAK_PW_HASH */ 81#endif /* CONFIG_CIFS_WEAK_PW_HASH */
82#endif /* CIFS_POSIX */ 82#endif /* CIFS_POSIX */
83 83
84/* Allocates buffer into dst and copies smb string from src to it.
85 * caller is responsible for freeing dst if function returned 0.
86 * returns:
87 * on success - 0
88 * on failure - errno
89 */
90static int
91cifs_strncpy_to_host(char **dst, const char *src, const int maxlen,
92 const bool is_unicode, const struct nls_table *nls_codepage)
93{
94 int plen;
95
96 if (is_unicode) {
97 plen = UniStrnlen((wchar_t *)src, maxlen);
98 *dst = kmalloc(plen + 2, GFP_KERNEL);
99 if (!*dst)
100 goto cifs_strncpy_to_host_ErrExit;
101 cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage);
102 } else {
103 plen = strnlen(src, maxlen);
104 *dst = kmalloc(plen + 2, GFP_KERNEL);
105 if (!*dst)
106 goto cifs_strncpy_to_host_ErrExit;
107 strncpy(*dst, src, plen);
108 }
109 (*dst)[plen] = 0;
110 (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */
111 return 0;
112
113cifs_strncpy_to_host_ErrExit:
114 cERROR(1, ("Failed to allocate buffer for string\n"));
115 return -ENOMEM;
116}
117
118
119/* Mark all open files on tree connections as invalid, since they 84
120 were closed when the session to the server was lost */ 85
121static void mark_open_files_invalid(struct cifsTconInfo *pTcon) 86static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
@@ -484,6 +449,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
484 cFYI(1, ("Kerberos only mechanism, enable extended security")); 449 cFYI(1, ("Kerberos only mechanism, enable extended security"));
485 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 450 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
486 } 451 }
452#ifdef CONFIG_CIFS_EXPERIMENTAL
453 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
454 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
455 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
456 cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
457 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
458 }
459#endif
487 460
488 count = 0; 461 count = 0;
489 for (i = 0; i < CIFS_NUM_PROT; i++) { 462 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -620,6 +593,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 server->secType = NTLMv2; 593 server->secType = NTLMv2;
621 else if (secFlags & CIFSSEC_MAY_KRB5) 594 else if (secFlags & CIFSSEC_MAY_KRB5)
622 server->secType = Kerberos; 595 server->secType = Kerberos;
596 else if (secFlags & CIFSSEC_MAY_NTLMSSP)
597 server->secType = NTLMSSP;
623 else if (secFlags & CIFSSEC_MAY_LANMAN) 598 else if (secFlags & CIFSSEC_MAY_LANMAN)
624 server->secType = LANMAN; 599 server->secType = LANMAN;
625/* #ifdef CONFIG_CIFS_EXPERIMENTAL 600/* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -1626,6 +1601,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1626 int smb_hdr_len; 1601 int smb_hdr_len;
1627 int resp_buf_type = 0; 1602 int resp_buf_type = 0;
1628 1603
1604 *nbytes = 0;
1605
1629 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1606 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
1630 1607
1631 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1608 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -1682,11 +1659,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1682 cifs_stats_inc(&tcon->num_writes); 1659 cifs_stats_inc(&tcon->num_writes);
1683 if (rc) { 1660 if (rc) {
1684 cFYI(1, ("Send error Write2 = %d", rc)); 1661 cFYI(1, ("Send error Write2 = %d", rc));
1685 *nbytes = 0;
1686 } else if (resp_buf_type == 0) { 1662 } else if (resp_buf_type == 0) {
1687 /* presumably this can not happen, but best to be safe */ 1663 /* presumably this can not happen, but best to be safe */
1688 rc = -EIO; 1664 rc = -EIO;
1689 *nbytes = 0;
1690 } else { 1665 } else {
1691 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; 1666 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
1692 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1667 *nbytes = le16_to_cpu(pSMBr->CountHigh);
@@ -2417,8 +2392,7 @@ winCreateHardLinkRetry:
2417 2392
2418int 2393int
2419CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, 2394CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2420 const unsigned char *searchName, 2395 const unsigned char *searchName, char **symlinkinfo,
2421 char *symlinkinfo, const int buflen,
2422 const struct nls_table *nls_codepage) 2396 const struct nls_table *nls_codepage)
2423{ 2397{
2424/* SMB_QUERY_FILE_UNIX_LINK */ 2398/* SMB_QUERY_FILE_UNIX_LINK */
@@ -2428,6 +2402,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2428 int bytes_returned; 2402 int bytes_returned;
2429 int name_len; 2403 int name_len;
2430 __u16 params, byte_count; 2404 __u16 params, byte_count;
2405 char *data_start;
2431 2406
2432 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2407 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
2433 2408
@@ -2482,30 +2457,26 @@ querySymLinkRetry:
2482 /* decode response */ 2457 /* decode response */
2483 2458
2484 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 2459 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
2485 if (rc || (pSMBr->ByteCount < 2))
2486 /* BB also check enough total bytes returned */ 2460 /* BB also check enough total bytes returned */
2487 rc = -EIO; /* bad smb */ 2461 if (rc || (pSMBr->ByteCount < 2))
2462 rc = -EIO;
2488 else { 2463 else {
2489 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 2464 bool is_unicode;
2490 __u16 count = le16_to_cpu(pSMBr->t2.DataCount); 2465 u16 count = le16_to_cpu(pSMBr->t2.DataCount);
2466
2467 data_start = ((char *) &pSMBr->hdr.Protocol) +
2468 le16_to_cpu(pSMBr->t2.DataOffset);
2469
2470 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
2471 is_unicode = true;
2472 else
2473 is_unicode = false;
2491 2474
2492 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2493 name_len = UniStrnlen((wchar_t *) ((char *)
2494 &pSMBr->hdr.Protocol + data_offset),
2495 min_t(const int, buflen, count) / 2);
2496 /* BB FIXME investigate remapping reserved chars here */ 2475 /* BB FIXME investigate remapping reserved chars here */
2497 cifs_strfromUCS_le(symlinkinfo, 2476 *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
2498 (__le16 *) ((char *)&pSMBr->hdr.Protocol 2477 is_unicode, nls_codepage);
2499 + data_offset), 2478 if (!*symlinkinfo)
2500 name_len, nls_codepage); 2479 rc = -ENOMEM;
2501 } else {
2502 strncpy(symlinkinfo,
2503 (char *) &pSMBr->hdr.Protocol +
2504 data_offset,
2505 min_t(const int, buflen, count));
2506 }
2507 symlinkinfo[buflen] = 0;
2508 /* just in case so calling code does not go off the end of buffer */
2509 } 2480 }
2510 } 2481 }
2511 cifs_buf_release(pSMB); 2482 cifs_buf_release(pSMB);
@@ -2603,7 +2574,6 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2603 *pparmlen = parm_count; 2574 *pparmlen = parm_count;
2604 return 0; 2575 return 0;
2605} 2576}
2606#endif /* CIFS_EXPERIMENTAL */
2607 2577
2608int 2578int
2609CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2579CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
@@ -2613,7 +2583,6 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2613{ 2583{
2614 int rc = 0; 2584 int rc = 0;
2615 int bytes_returned; 2585 int bytes_returned;
2616 int name_len;
2617 struct smb_com_transaction_ioctl_req *pSMB; 2586 struct smb_com_transaction_ioctl_req *pSMB;
2618 struct smb_com_transaction_ioctl_rsp *pSMBr; 2587 struct smb_com_transaction_ioctl_rsp *pSMBr;
2619 2588
@@ -2650,59 +2619,55 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2650 } else { /* decode response */ 2619 } else { /* decode response */
2651 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2620 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2652 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2621 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
2653 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) 2622 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
2654 /* BB also check enough total bytes returned */ 2623 /* BB also check enough total bytes returned */
2655 rc = -EIO; /* bad smb */ 2624 rc = -EIO; /* bad smb */
2656 else { 2625 goto qreparse_out;
2657 if (data_count && (data_count < 2048)) { 2626 }
2658 char *end_of_smb = 2 /* sizeof byte count */ + 2627 if (data_count && (data_count < 2048)) {
2659 pSMBr->ByteCount + 2628 char *end_of_smb = 2 /* sizeof byte count */ +
2660 (char *)&pSMBr->ByteCount; 2629 pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
2661 2630
2662 struct reparse_data *reparse_buf = 2631 struct reparse_data *reparse_buf =
2663 (struct reparse_data *) 2632 (struct reparse_data *)
2664 ((char *)&pSMBr->hdr.Protocol 2633 ((char *)&pSMBr->hdr.Protocol
2665 + data_offset); 2634 + data_offset);
2666 if ((char *)reparse_buf >= end_of_smb) { 2635 if ((char *)reparse_buf >= end_of_smb) {
2667 rc = -EIO; 2636 rc = -EIO;
2668 goto qreparse_out; 2637 goto qreparse_out;
2669 } 2638 }
2670 if ((reparse_buf->LinkNamesBuf + 2639 if ((reparse_buf->LinkNamesBuf +
2671 reparse_buf->TargetNameOffset + 2640 reparse_buf->TargetNameOffset +
2672 reparse_buf->TargetNameLen) > 2641 reparse_buf->TargetNameLen) > end_of_smb) {
2673 end_of_smb) { 2642 cFYI(1, ("reparse buf beyond SMB"));
2674 cFYI(1, ("reparse buf beyond SMB")); 2643 rc = -EIO;
2675 rc = -EIO; 2644 goto qreparse_out;
2676 goto qreparse_out; 2645 }
2677 }
2678 2646
2679 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { 2647 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2680 name_len = UniStrnlen((wchar_t *) 2648 cifs_from_ucs2(symlinkinfo, (__le16 *)
2681 (reparse_buf->LinkNamesBuf + 2649 (reparse_buf->LinkNamesBuf +
2682 reparse_buf->TargetNameOffset), 2650 reparse_buf->TargetNameOffset),
2683 min(buflen/2, 2651 buflen,
2684 reparse_buf->TargetNameLen / 2)); 2652 reparse_buf->TargetNameLen,
2685 cifs_strfromUCS_le(symlinkinfo, 2653 nls_codepage, 0);
2686 (__le16 *) (reparse_buf->LinkNamesBuf + 2654 } else { /* ASCII names */
2687 reparse_buf->TargetNameOffset), 2655 strncpy(symlinkinfo,
2688 name_len, nls_codepage); 2656 reparse_buf->LinkNamesBuf +
2689 } else { /* ASCII names */ 2657 reparse_buf->TargetNameOffset,
2690 strncpy(symlinkinfo, 2658 min_t(const int, buflen,
2691 reparse_buf->LinkNamesBuf + 2659 reparse_buf->TargetNameLen));
2692 reparse_buf->TargetNameOffset,
2693 min_t(const int, buflen,
2694 reparse_buf->TargetNameLen));
2695 }
2696 } else {
2697 rc = -EIO;
2698 cFYI(1, ("Invalid return data count on "
2699 "get reparse info ioctl"));
2700 } 2660 }
2701 symlinkinfo[buflen] = 0; /* just in case so the caller 2661 } else {
2702 does not go off the end of the buffer */ 2662 rc = -EIO;
2703 cFYI(1, ("readlink result - %s", symlinkinfo)); 2663 cFYI(1, ("Invalid return data count on "
2664 "get reparse info ioctl"));
2704 } 2665 }
2666 symlinkinfo[buflen] = 0; /* just in case so the caller
2667 does not go off the end of the buffer */
2668 cFYI(1, ("readlink result - %s", symlinkinfo));
2705 } 2669 }
2670
2706qreparse_out: 2671qreparse_out:
2707 cifs_buf_release(pSMB); 2672 cifs_buf_release(pSMB);
2708 2673
@@ -2711,6 +2676,7 @@ qreparse_out:
2711 2676
2712 return rc; 2677 return rc;
2713} 2678}
2679#endif /* CIFS_EXPERIMENTAL */
2714 2680
2715#ifdef CONFIG_CIFS_POSIX 2681#ifdef CONFIG_CIFS_POSIX
2716 2682
@@ -3918,7 +3884,7 @@ GetInodeNumberRetry:
3918 } 3884 }
3919 pfinfo = (struct file_internal_info *) 3885 pfinfo = (struct file_internal_info *)
3920 (data_offset + (char *) &pSMBr->hdr.Protocol); 3886 (data_offset + (char *) &pSMBr->hdr.Protocol);
3921 *inode_number = pfinfo->UniqueId; 3887 *inode_number = le64_to_cpu(pfinfo->UniqueId);
3922 } 3888 }
3923 } 3889 }
3924GetInodeNumOut: 3890GetInodeNumOut:
@@ -3928,27 +3894,6 @@ GetInodeNumOut:
3928 return rc; 3894 return rc;
3929} 3895}
3930 3896
3931/* computes length of UCS string converted to host codepage
3932 * @src: UCS string
3933 * @maxlen: length of the input string in UCS characters
3934 * (not in bytes)
3935 *
3936 * return: size of input string in host codepage
3937 */
3938static int hostlen_fromUCS(const __le16 *src, const int maxlen,
3939 const struct nls_table *nls_codepage) {
3940 int i;
3941 int hostlen = 0;
3942 char to[4];
3943 int charlen;
3944 for (i = 0; (i < maxlen) && src[i]; ++i) {
3945 charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
3946 to, NLS_MAX_CHARSET_SIZE);
3947 hostlen += charlen > 0 ? charlen : 1;
3948 }
3949 return hostlen;
3950}
3951
3952/* parses DFS referral V3 structure 3897
3953 * caller is responsible for freeing target_nodes 3898 * caller is responsible for freeing target_nodes
3954 * returns: 3899 * returns:
@@ -3994,7 +3939,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3994 3939
3995 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 3940 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
3996 *num_of_nodes, 3941 *num_of_nodes,
3997 le16_to_cpu(pSMBr->DFSFlags))); 3942 le32_to_cpu(pSMBr->DFSFlags)));
3998 3943
3999 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 3944 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4000 *num_of_nodes, GFP_KERNEL); 3945 *num_of_nodes, GFP_KERNEL);
@@ -4010,14 +3955,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4010 int max_len; 3955 int max_len;
4011 struct dfs_info3_param *node = (*target_nodes)+i; 3956 struct dfs_info3_param *node = (*target_nodes)+i;
4012 3957
4013 node->flags = le16_to_cpu(pSMBr->DFSFlags); 3958 node->flags = le32_to_cpu(pSMBr->DFSFlags);
4014 if (is_unicode) { 3959 if (is_unicode) {
4015 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3960 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
4016 GFP_KERNEL); 3961 GFP_KERNEL);
4017 cifsConvertToUCS((__le16 *) tmp, searchName, 3962 cifsConvertToUCS((__le16 *) tmp, searchName,
4018 PATH_MAX, nls_codepage, remap); 3963 PATH_MAX, nls_codepage, remap);
4019 node->path_consumed = hostlen_fromUCS(tmp, 3964 node->path_consumed = cifs_ucs2_bytes(tmp,
4020 le16_to_cpu(pSMBr->PathConsumed)/2, 3965 le16_to_cpu(pSMBr->PathConsumed),
4021 nls_codepage); 3966 nls_codepage);
4022 kfree(tmp); 3967 kfree(tmp);
4023 } else 3968 } else
@@ -4029,20 +3974,20 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4029 /* copy DfsPath */ 3974 /* copy DfsPath */
4030 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); 3975 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
4031 max_len = data_end - temp; 3976 max_len = data_end - temp;
4032 rc = cifs_strncpy_to_host(&(node->path_name), temp, 3977 node->path_name = cifs_strndup_from_ucs(temp, max_len,
4033 max_len, is_unicode, nls_codepage); 3978 is_unicode, nls_codepage);
4034 if (rc) 3979 if (!node->path_name) {
3980 rc = -ENOMEM;
4035 goto parse_DFS_referrals_exit; 3981 goto parse_DFS_referrals_exit;
3982 }
4036 3983
4037 /* copy link target UNC */ 3984 /* copy link target UNC */
4038 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); 3985 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
4039 max_len = data_end - temp; 3986 max_len = data_end - temp;
4040 rc = cifs_strncpy_to_host(&(node->node_name), temp, 3987 node->node_name = cifs_strndup_from_ucs(temp, max_len,
4041 max_len, is_unicode, nls_codepage); 3988 is_unicode, nls_codepage);
4042 if (rc) 3989 if (!node->node_name)
4043 goto parse_DFS_referrals_exit; 3990 rc = -ENOMEM;
4044
4045 ref += le16_to_cpu(ref->Size);
4046 } 3991 }
4047 3992
4048parse_DFS_referrals_exit: 3993parse_DFS_referrals_exit:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0de3b5615a22..4aa81a507b74 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/connect.c 2 * fs/cifs/connect.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/pagevec.h>
 #include <linux/freezer.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <net/ipv6.h>
@@ -978,6 +979,13 @@ cifs_parse_mount_options(char *options, const char *devname,
 			return 1;
 		} else if (strnicmp(value, "krb5", 4) == 0) {
 			vol->secFlg |= CIFSSEC_MAY_KRB5;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
+			vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
+				CIFSSEC_MUST_SIGN;
+		} else if (strnicmp(value, "ntlmssp", 7) == 0) {
+			vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+#endif
 		} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
 			vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
 				CIFSSEC_MUST_SIGN;
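Editor's note: the ordering of the two new arms matters because the option parser matches by prefix; "ntlmsspi" must be tested before "ntlmssp", or the shorter prefix would swallow the signing variant. A minimal userspace sketch of the same pattern (strncasecmp standing in for the kernel's strnicmp; return codes are illustrative):

	#include <strings.h>

	/* longer token first, otherwise "ntlmsspi" matches the
	 * "ntlmssp" arm and the signing flag is never set */
	static int parse_sec_token(const char *value)
	{
		if (strncasecmp(value, "ntlmsspi", 8) == 0)
			return 2;	/* NTLMSSP + mandatory signing */
		if (strncasecmp(value, "ntlmssp", 7) == 0)
			return 1;	/* NTLMSSP */
		return 0;
	}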
@@ -2214,9 +2222,58 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
 	return rc;
 }
 
+static void
+cleanup_volume_info(struct smb_vol **pvolume_info)
+{
+	struct smb_vol *volume_info;
+
+	if (!pvolume_info && !*pvolume_info)
+		return;
+
+	volume_info = *pvolume_info;
+	kzfree(volume_info->password);
+	kfree(volume_info->UNC);
+	kfree(volume_info->prepath);
+	kfree(volume_info);
+	*pvolume_info = NULL;
+	return;
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+/* build_path_to_root returns full path to root when
+ * we do not have an exiting connection (tcon) */
+static char *
+build_unc_path_to_root(const struct smb_vol *volume_info,
+		const struct cifs_sb_info *cifs_sb)
+{
+	char *full_path;
+
+	int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
+	full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
+	if (full_path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	strncpy(full_path, volume_info->UNC, unc_len);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+		int i;
+		for (i = 0; i < unc_len; i++) {
+			if (full_path[i] == '\\')
+				full_path[i] = '/';
+		}
+	}
+
+	if (cifs_sb->prepathlen)
+		strncpy(full_path + unc_len, cifs_sb->prepath,
+			cifs_sb->prepathlen);
+
+	full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
+	return full_path;
+}
+#endif
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
-	   char *mount_data, const char *devname)
+	   char *mount_data_global, const char *devname)
 {
 	int rc = 0;
 	int xid;
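Editor's note on cleanup_volume_info() as added above: the early-return guard uses &&, so with a NULL pvolume_info the second operand dereferences a NULL pointer, and with a NULL *pvolume_info the function falls through to volume_info->password on a NULL object; the intent is clearly ||, which short-circuits safely in both cases. A hedged sketch of the defensive form (not the code as merged):

	static void cleanup_volume_info(struct smb_vol **pvolume_info)
	{
		struct smb_vol *volume_info;

		/* '||' short-circuits: a NULL handle never reaches the
		 * deref, a NULL object never reaches the frees below */
		if (pvolume_info == NULL || *pvolume_info == NULL)
			return;

		volume_info = *pvolume_info;
		kzfree(volume_info->password);	/* zeroes before freeing */
		kfree(volume_info->UNC);
		kfree(volume_info->prepath);
		kfree(volume_info);
		*pvolume_info = NULL;
	}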
@@ -2225,6 +2282,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
 	char *full_path;
+	char *mount_data = mount_data_global;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct dfs_info3_param *referrals = NULL;
+	unsigned int num_referrals = 0;
+	int referral_walks_count = 0;
+try_mount_again:
+#endif
+	full_path = NULL;
 
 	xid = GetXid();
 
@@ -2371,11 +2436,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	/* check for null share name ie connect to dfs root */
 	if ((strchr(volume_info->UNC + 3, '\\') == NULL)
 	    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-		/* rc = connect_to_dfs_path(...) */
-		cFYI(1, ("DFS root not supported"));
+		cERROR(1, ("Missing share name"));
 		rc = -ENODEV;
 		goto mount_fail_check;
 	} else {
@@ -2392,7 +2455,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 	if (rc)
-		goto mount_fail_check;
+		goto remote_path_check;
 	tcon->seal = volume_info->seal;
 	write_lock(&cifs_tcp_ses_lock);
 	list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
@@ -2417,19 +2480,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-mount_fail_check:
-	/* on error free sesinfo and tcon struct if needed */
-	if (rc) {
-		/* If find_unc succeeded then rc == 0 so we can not end */
-		/* up accidently freeing someone elses tcon struct */
-		if (tcon)
-			cifs_put_tcon(tcon);
-		else if (pSesInfo)
-			cifs_put_smb_ses(pSesInfo);
-		else
-			cifs_put_tcp_session(srvTcp);
-		goto out;
-	}
+	if (rc)
+		goto remote_path_check;
+
 	cifs_sb->tcon = tcon;
 
 	/* do not care if following two calls succeed - informational */
@@ -2461,7 +2514,9 @@ mount_fail_check:
 	cifs_sb->rsize = min(cifs_sb->rsize,
 			(tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
-	if (!rc && cifs_sb->prepathlen) {
+remote_path_check:
+	/* check if a whole path (including prepath) is not remote */
+	if (!rc && cifs_sb->prepathlen && tcon) {
 		/* build_path_to_root works only when we have a valid tcon */
 		full_path = cifs_build_path_to_root(cifs_sb);
 		if (full_path == NULL) {
@@ -2469,1079 +2524,91 @@ mount_fail_check:
 			goto mount_fail_check;
 		}
 		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-		if (rc) {
-			cERROR(1, ("Path %s in not accessible: %d",
-				full_path, rc));
+		if (rc != -EREMOTE) {
 			kfree(full_path);
 			goto mount_fail_check;
 		}
 		kfree(full_path);
 	}
 
-	/* volume_info->password is freed above when existing session found
-	(in which case it is not needed anymore) but when new sesion is created
-	the password ptr is put in the new session structure (in which case the
-	password will be freed at unmount time) */
-out:
-	/* zero out password before freeing */
-	if (volume_info) {
-		if (volume_info->password != NULL) {
-			memset(volume_info->password, 0,
-				strlen(volume_info->password));
-			kfree(volume_info->password);
-		}
-		kfree(volume_info->UNC);
-		kfree(volume_info->prepath);
-		kfree(volume_info);
-	}
-	FreeXid(xid);
-	return rc;
-}
-
-static int
-CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-	      char session_key[CIFS_SESS_KEY_SIZE],
-	      const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *user;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	__u32 capabilities;
-	__u16 count;
-
-	cFYI(1, ("In sesssetup"));
-	if (ses == NULL)
-		return -EINVAL;
-	user = ses->userName;
-	domain = ses->domainName;
-	smb_buffer = cifs_buf_get();
-
-	if (smb_buffer == NULL)
-		return -ENOMEM;
-
-	smb_buffer_response = smb_buffer;
-	pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 13 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req_no_secext.AndXCommand = 0xFF;
-	pSMB->req_no_secext.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req_no_secext.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	if (ses->server->secMode &
-			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-		CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-
-	pSMB->req_no_secext.CaseInsensitivePasswordLength =
-		cpu_to_le16(CIFS_SESS_KEY_SIZE);
-
-	pSMB->req_no_secext.CaseSensitivePasswordLength =
-		cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	bcc_ptr = pByteArea(smb_buffer);
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
-
-	if (ses->capabilities & CAP_UNICODE) {
-		if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		if (user == NULL)
-			bytes_returned = 0; /* skip null user */
-		else
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
-					nls_codepage);
-		/* convert number of 16 bit words to bytes */
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* trailing null */
-		if (domain == NULL)
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr,
-					"CIFS_LINUX_DOM", 32, nls_codepage);
-		else
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-					nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release,
-				32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-	} else {
-		if (user != NULL) {
-			strncpy(bcc_ptr, user, 200);
-			bcc_ptr += strnlen(user, 200);
-		}
-		*bcc_ptr = 0;
-		bcc_ptr++;
-		if (domain == NULL) {
-			strcpy(bcc_ptr, "CIFS_LINUX_DOM");
-			bcc_ptr += strlen("CIFS_LINUX_DOM") + 1;
-		} else {
-			strncpy(bcc_ptr, domain, 64);
-			bcc_ptr += strnlen(domain, 64);
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-	}
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req_no_secext.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
-	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */
-	} else if ((smb_buffer_response->WordCount == 3)
-		   || (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
-		ses->Suid = smb_buffer_response->Uid; /* UID left in wire format
-							 (little endian) */
-		cFYI(1, ("UID = %d ", ses->Suid));
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-		bcc_ptr = pByteArea(smb_buffer_response);
-		if ((pSMBr->resp.hdr.WordCount == 3)
-		    || ((pSMBr->resp.hdr.WordCount == 4)
-			&& (blob_len < pSMBr->resp.ByteCount))) {
-			if (pSMBr->resp.hdr.WordCount == 4)
-				bcc_ptr += blob_len;
-
-			if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-				if ((long) (bcc_ptr) % 2) {
-					remaining_words =
-					    (BCC(smb_buffer_response) - 1) / 2;
-					/* Unicode strings must be word
-					   aligned */
-					bcc_ptr++;
-				} else {
-					remaining_words =
-					    BCC(smb_buffer_response) / 2;
-				}
-				len = UniStrnlen((wchar_t *) bcc_ptr,
-						 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-				if (ses->serverOS)
-					kfree(ses->serverOS);
-				ses->serverOS = kzalloc(2 * (len + 1),
-							GFP_KERNEL);
-				if (ses->serverOS == NULL)
-					goto sesssetup_nomem;
-				cifs_strfromUCS_le(ses->serverOS,
-						   (__le16 *)bcc_ptr,
-						   len, nls_codepage);
-				bcc_ptr += 2 * (len + 1);
-				remaining_words -= len + 1;
-				ses->serverOS[2 * len] = 0;
-				ses->serverOS[1 + (2 * len)] = 0;
-				if (remaining_words > 0) {
-					len = UniStrnlen((wchar_t *)bcc_ptr,
-							 remaining_words-1);
-					kfree(ses->serverNOS);
-					ses->serverNOS = kzalloc(2 * (len + 1),
-								 GFP_KERNEL);
-					if (ses->serverNOS == NULL)
-						goto sesssetup_nomem;
-					cifs_strfromUCS_le(ses->serverNOS,
-							   (__le16 *)bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					ses->serverNOS[2 * len] = 0;
-					ses->serverNOS[1 + (2 * len)] = 0;
-					if (strncmp(ses->serverNOS,
-						"NT LAN Manager 4", 16) == 0) {
-						cFYI(1, ("NT4 server"));
-						ses->flags |= CIFS_SES_NT4;
-					}
-					remaining_words -= len + 1;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-						/* last string is not always null terminated
-						   (for e.g. for Windows XP & 2000) */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(2*(len+1),
-							    GFP_KERNEL);
-						if (ses->serverDomain == NULL)
-							goto sesssetup_nomem;
-						cifs_strfromUCS_le(ses->serverDomain,
-							(__le16 *)bcc_ptr,
-							len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverDomain[2*len] = 0;
-						ses->serverDomain[1+(2*len)] = 0;
-					} else { /* else no more room so create
-						    dummy domain string */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-							kzalloc(2, GFP_KERNEL);
-					}
-				} else { /* no room so create dummy domain
-					    and NOS string */
-
-					/* if these kcallocs fail not much we
-					   can do, but better to not fail the
-					   sesssetup itself */
-					kfree(ses->serverDomain);
-					ses->serverDomain =
-						kzalloc(2, GFP_KERNEL);
-					kfree(ses->serverNOS);
-					ses->serverNOS =
-						kzalloc(2, GFP_KERNEL);
-				}
-			} else {	/* ASCII */
-				len = strnlen(bcc_ptr, 1024);
-				if (((long) bcc_ptr + len) - (long)
-				    pByteArea(smb_buffer_response)
-				    <= BCC(smb_buffer_response)) {
-					kfree(ses->serverOS);
-					ses->serverOS = kzalloc(len + 1,
-								GFP_KERNEL);
-					if (ses->serverOS == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverOS, bcc_ptr, len);
-
-					bcc_ptr += len;
-					/* null terminate the string */
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-
-					len = strnlen(bcc_ptr, 1024);
-					kfree(ses->serverNOS);
-					ses->serverNOS = kzalloc(len + 1,
-								 GFP_KERNEL);
-					if (ses->serverNOS == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverNOS, bcc_ptr, len);
-					bcc_ptr += len;
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-
-					len = strnlen(bcc_ptr, 1024);
-					if (ses->serverDomain)
-						kfree(ses->serverDomain);
-					ses->serverDomain = kzalloc(len + 1,
-								    GFP_KERNEL);
-					if (ses->serverDomain == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverDomain, bcc_ptr,
-						len);
-					bcc_ptr += len;
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-				} else
-					cFYI(1,
-					     ("Variable field of length %d "
-					      "extends beyond end of smb ",
-					      len));
-			}
-		} else {
-			cERROR(1, ("Security Blob Length extends beyond "
-				   "end of SMB"));
+	/* get referral if needed */
+	if (rc == -EREMOTE) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (referral_walks_count > MAX_NESTED_LINKS) {
+			/*
+			 * BB: when we implement proper loop detection,
+			 *     we will remove this check. But now we need it
+			 *     to prevent an indefinite loop if 'DFS tree' is
+			 *     misconfigured (i.e. has loops).
+			 */
+			rc = -ELOOP;
+			goto mount_fail_check;
 		}
-	} else {
-		cERROR(1, ("Invalid Word count %d: ",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
-	}
-sesssetup_nomem:	/* do not return an error on nomem for the info
-			   strings, since that could make reconnection harder,
-			   and reconnection might be needed to free memory */
-	cifs_buf_release(smb_buffer);
-
-	return rc;
-}
-
-static int
-CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
-			      struct cifsSesInfo *ses, bool *pNTLMv2_flag,
-			      const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	int SecurityBlobLength = sizeof(NEGOTIATE_MESSAGE);
-	PNEGOTIATE_MESSAGE SecurityBlob;
-	PCHALLENGE_MESSAGE SecurityBlob2;
-	__u32 negotiate_flags, capabilities;
-	__u16 count;
-
-	cFYI(1, ("In NTLMSSP sesssetup (negotiate)"));
-	if (ses == NULL)
-		return -EINVAL;
-	domain = ses->domainName;
-	*pNTLMv2_flag = false;
-	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
-		return -ENOMEM;
-	}
-	smb_buffer_response = smb_buffer;
-	pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-	pSMBr = (SESSION_SETUP_ANDX *) smb_buffer_response;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 12 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
-
-	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-	    CAP_EXTENDED_SECURITY;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req.Capabilities = cpu_to_le32(capabilities);
-
-	bcc_ptr = (char *) &pSMB->req.SecurityBlob;
-	SecurityBlob = (PNEGOTIATE_MESSAGE) bcc_ptr;
-	strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
-	SecurityBlob->MessageType = NtLmNegotiate;
-	negotiate_flags =
-	    NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_OEM |
-	    NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM |
-	    NTLMSSP_NEGOTIATE_56 |
-	    /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128;
-	if (sign_CIFS_PDUs)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN;
-/*	if (ntlmv2_support)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/
-	/* setup pointers to domain name and workstation name */
-	bcc_ptr += SecurityBlobLength;
-
-	SecurityBlob->WorkstationName.Buffer = 0;
-	SecurityBlob->WorkstationName.Length = 0;
-	SecurityBlob->WorkstationName.MaximumLength = 0;
-
-	/* Domain not sent on first Sesssetup in NTLMSSP, instead it is sent
-	   along with username on auth request (ie the response to challenge) */
-	SecurityBlob->DomainName.Buffer = 0;
-	SecurityBlob->DomainName.Length = 0;
-	SecurityBlob->DomainName.MaximumLength = 0;
-	if (ses->capabilities & CAP_UNICODE) {
-		if ((long) bcc_ptr % 2) {
-			*bcc_ptr = 0;
-			bcc_ptr++;
+		/* convert forward to back slashes in prepath here if needed */
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+			convert_delimiter(cifs_sb->prepath,
+					CIFS_DIR_SEP(cifs_sb));
+		full_path = build_unc_path_to_root(volume_info, cifs_sb);
+		if (IS_ERR(full_path)) {
+			rc = PTR_ERR(full_path);
+			goto mount_fail_check;
 		}
 
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				  32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
-				  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* null terminate Linux version */
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null terminate network opsys string */
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null domain */
-	} else {		/* ASCII */
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-		bcc_ptr++;	/* empty domain field */
-		*bcc_ptr = 0;
-	}
-	SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
-
-	if (smb_buffer_response->Status.CifsError ==
-	    cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
-		rc = 0;
-
-	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response); *//* done in SendReceive now */
-	} else if ((smb_buffer_response->WordCount == 3)
-		   || (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login"));
-	/* Do we want to set anything in SesInfo struct when guest login? */
-
-		bcc_ptr = pByteArea(smb_buffer_response);
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-
-		SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr;
-		if (SecurityBlob2->MessageType != NtLmChallenge) {
-			cFYI(1, ("Unexpected NTLMSSP message type received %d",
-				 SecurityBlob2->MessageType));
-		} else if (ses) {
-			ses->Suid = smb_buffer_response->Uid; /* UID left in le format */
-			cFYI(1, ("UID = %d", ses->Suid));
-			if ((pSMBr->resp.hdr.WordCount == 3)
-			    || ((pSMBr->resp.hdr.WordCount == 4)
-				&& (blob_len < pSMBr->resp.ByteCount))) {
-
-				if (pSMBr->resp.hdr.WordCount == 4) {
-					bcc_ptr += blob_len;
-					cFYI(1, ("Security Blob Length %d",
-						 blob_len));
-				}
-
-				cFYI(1, ("NTLMSSP Challenge rcvd"));
-
-				memcpy(ses->server->cryptKey,
-				       SecurityBlob2->Challenge,
-				       CIFS_CRYPTO_KEY_SIZE);
-				if (SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2))
-					*pNTLMv2_flag = true;
-
-				if ((SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_ALWAYS_SIGN))
-					|| (sign_CIFS_PDUs > 1))
-					ses->server->secMode |=
-						SECMODE_SIGN_REQUIRED;
-				if ((SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_SIGN)) && (sign_CIFS_PDUs))
-					ses->server->secMode |=
-						SECMODE_SIGN_ENABLED;
-
-				if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-					if ((long) (bcc_ptr) % 2) {
-						remaining_words =
-						    (BCC(smb_buffer_response) - 1) / 2;
-						/* Must word align unicode strings */
-						bcc_ptr++;
-					} else {
-						remaining_words =
-						    BCC(smb_buffer_response) / 2;
-					}
-					len = UniStrnlen((wchar_t *) bcc_ptr,
-							 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-					if (ses->serverOS)
-						kfree(ses->serverOS);
-					ses->serverOS =
-					    kzalloc(2 * (len + 1), GFP_KERNEL);
-					cifs_strfromUCS_le(ses->serverOS,
-							   (__le16 *) bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					remaining_words -= len + 1;
-					ses->serverOS[2 * len] = 0;
-					ses->serverOS[1 + (2 * len)] = 0;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr,
-								 remaining_words - 1);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2 * (len + 1),
-							    GFP_KERNEL);
-						cifs_strfromUCS_le(ses->serverNOS,
-								   (__le16 *) bcc_ptr,
-								   len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverNOS[2 * len] = 0;
-						ses->serverNOS[1 + (2 * len)] = 0;
-						remaining_words -= len + 1;
-						if (remaining_words > 0) {
-							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-		/* last string not always null terminated
-		   (for e.g. for Windows XP & 2000) */
-							kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2 * (len + 1),
-								    GFP_KERNEL);
-							cifs_strfromUCS_le
-							    (ses->serverDomain,
-							     (__le16 *)bcc_ptr,
-							     len, nls_codepage);
-							bcc_ptr += 2 * (len + 1);
-							ses->serverDomain[2*len] = 0;
-							ses->serverDomain[1 + (2 * len)] = 0;
-						} /* else no more room so create dummy domain string */
-						else {
-							kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2, GFP_KERNEL);
-						}
-					} else { /* no room so create dummy domain and NOS string */
-						kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(2, GFP_KERNEL);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2, GFP_KERNEL);
-					}
-				} else {	/* ASCII */
-					len = strnlen(bcc_ptr, 1024);
-					if (((long) bcc_ptr + len) - (long)
-					    pByteArea(smb_buffer_response)
-					    <= BCC(smb_buffer_response)) {
-						if (ses->serverOS)
-							kfree(ses->serverOS);
-						ses->serverOS =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverOS,
-							bcc_ptr, len);
-
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;	/* null terminate string */
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverNOS, bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverDomain,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-					} else
-						cFYI(1,
-						     ("field of length %d "
-						      "extends beyond end of smb",
-						      len));
-				}
-			} else {
-				cERROR(1, ("Security Blob Length extends beyond"
-					   " end of SMB"));
-			}
-		} else {
-			cERROR(1, ("No session structure passed in."));
+		cFYI(1, ("Getting referral for: %s", full_path));
+		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
+			cifs_sb->local_nls, &num_referrals, &referrals,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (!rc && num_referrals > 0) {
+			char *fake_devname = NULL;
+
+			if (mount_data != mount_data_global)
+				kfree(mount_data);
+			mount_data = cifs_compose_mount_options(
+					cifs_sb->mountdata, full_path + 1,
+					referrals, &fake_devname);
+			kfree(fake_devname);
+			free_dfs_info_array(referrals, num_referrals);
+
+			if (tcon)
+				cifs_put_tcon(tcon);
+			else if (pSesInfo)
+				cifs_put_smb_ses(pSesInfo);
+
+			cleanup_volume_info(&volume_info);
+			FreeXid(xid);
+			kfree(full_path);
+			referral_walks_count++;
+			goto try_mount_again;
 		}
-	} else {
-		cERROR(1, ("Invalid Word count %d:",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
-	}
-
-	cifs_buf_release(smb_buffer);
-
-	return rc;
-}
-static int
-CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			 char *ntlm_session_key, bool ntlmv2_flag,
-			 const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *user;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	int SecurityBlobLength = sizeof(AUTHENTICATE_MESSAGE);
-	PAUTHENTICATE_MESSAGE SecurityBlob;
-	__u32 negotiate_flags, capabilities;
-	__u16 count;
-
-	cFYI(1, ("In NTLMSSPSessSetup (Authenticate)"));
-	if (ses == NULL)
-		return -EINVAL;
-	user = ses->userName;
-	domain = ses->domainName;
-	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
-		return -ENOMEM;
-	}
-	smb_buffer_response = smb_buffer;
-	pSMB = (SESSION_SETUP_ANDX *)smb_buffer;
-	pSMBr = (SESSION_SETUP_ANDX *)smb_buffer_response;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 12 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
-	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	pSMB->req.hdr.Uid = ses->Suid;
-
-	if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-	    CAP_EXTENDED_SECURITY;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
+#else /* No DFS support, return error on mount */
+		rc = -EOPNOTSUPP;
+#endif
 	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req.Capabilities = cpu_to_le32(capabilities);
-
-	bcc_ptr = (char *)&pSMB->req.SecurityBlob;
-	SecurityBlob = (PAUTHENTICATE_MESSAGE)bcc_ptr;
-	strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
-	SecurityBlob->MessageType = NtLmAuthenticate;
-	bcc_ptr += SecurityBlobLength;
-	negotiate_flags = NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_REQUEST_TARGET |
-	    NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_TARGET_INFO |
-	    0x80000000 | NTLMSSP_NEGOTIATE_128;
-	if (sign_CIFS_PDUs)
-		negotiate_flags |= /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN |*/ NTLMSSP_NEGOTIATE_SIGN;
-	if (ntlmv2_flag)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;
-
-/* setup pointers to domain name and workstation name */
-
-	SecurityBlob->WorkstationName.Buffer = 0;
-	SecurityBlob->WorkstationName.Length = 0;
-	SecurityBlob->WorkstationName.MaximumLength = 0;
-	SecurityBlob->SessionKey.Length = 0;
-	SecurityBlob->SessionKey.MaximumLength = 0;
-	SecurityBlob->SessionKey.Buffer = 0;
-
-	SecurityBlob->LmChallengeResponse.Length = 0;
-	SecurityBlob->LmChallengeResponse.MaximumLength = 0;
-	SecurityBlob->LmChallengeResponse.Buffer = 0;
-
-	SecurityBlob->NtChallengeResponse.Length =
-	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	SecurityBlob->NtChallengeResponse.MaximumLength =
-	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE);
-	SecurityBlob->NtChallengeResponse.Buffer =
-	    cpu_to_le32(SecurityBlobLength);
-	SecurityBlobLength += CIFS_SESS_KEY_SIZE;
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
 
-	if (ses->capabilities & CAP_UNICODE) {
-		if (domain == NULL) {
-			SecurityBlob->DomainName.Buffer = 0;
-			SecurityBlob->DomainName.Length = 0;
-			SecurityBlob->DomainName.MaximumLength = 0;
-		} else {
-			__u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-						 nls_codepage);
-			ln *= 2;
-			SecurityBlob->DomainName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->DomainName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->DomainName.Length = cpu_to_le16(ln);
-		}
-		if (user == NULL) {
-			SecurityBlob->UserName.Buffer = 0;
-			SecurityBlob->UserName.Length = 0;
-			SecurityBlob->UserName.MaximumLength = 0;
-		} else {
-			__u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, user, 64,
-						 nls_codepage);
-			ln *= 2;
-			SecurityBlob->UserName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->UserName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->UserName.Length = cpu_to_le16(ln);
-		}
-
-	/*	SecurityBlob->WorkstationName.Length =
-		 cifs_strtoUCS((__le16 *) bcc_ptr, "AMACHINE",64, nls_codepage);
-		SecurityBlob->WorkstationName.Length *= 2;
-		SecurityBlob->WorkstationName.MaximumLength =
-		    cpu_to_le16(SecurityBlob->WorkstationName.Length);
-		SecurityBlob->WorkstationName.Buffer =
-		    cpu_to_le32(SecurityBlobLength);
-		bcc_ptr += SecurityBlob->WorkstationName.Length;
-		SecurityBlobLength += SecurityBlob->WorkstationName.Length;
-		SecurityBlob->WorkstationName.Length =
-		    cpu_to_le16(SecurityBlob->WorkstationName.Length);  */
-
-		if ((long) bcc_ptr % 2) {
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				  32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
-				  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* null term version string */
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null terminate network opsys string */
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null domain */
-	} else {		/* ASCII */
-		if (domain == NULL) {
-			SecurityBlob->DomainName.Buffer = 0;
-			SecurityBlob->DomainName.Length = 0;
-			SecurityBlob->DomainName.MaximumLength = 0;
-		} else {
-			__u16 ln;
-			negotiate_flags |= NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED;
-			strncpy(bcc_ptr, domain, 63);
-			ln = strnlen(domain, 64);
-			SecurityBlob->DomainName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->DomainName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->DomainName.Length = cpu_to_le16(ln);
-		}
-		if (user == NULL) {
-			SecurityBlob->UserName.Buffer = 0;
-			SecurityBlob->UserName.Length = 0;
-			SecurityBlob->UserName.MaximumLength = 0;
-		} else {
-			__u16 ln;
-			strncpy(bcc_ptr, user, 63);
-			ln = strnlen(user, 64);
-			SecurityBlob->UserName.MaximumLength = cpu_to_le16(ln);
-			SecurityBlob->UserName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->UserName.Length = cpu_to_le16(ln);
-		}
-		/* BB fill in our workstation name if known BB */
-
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-		bcc_ptr++;	/* null domain */
-		*bcc_ptr = 0;
-	}
-	SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
+mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */
-	} else if ((smb_buffer_response->WordCount == 3) ||
-		   (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login")); /* BB Should we set anything
-						     in SesInfo struct ? */
-/*		if (SecurityBlob2->MessageType != NtLm??) {
-			cFYI("Unexpected message type on auth response is %d"));
-		} */
-
-		if (ses) {
-			cFYI(1,
-			     ("Check challenge UID %d vs auth response UID %d",
-			      ses->Suid, smb_buffer_response->Uid));
-			/* UID left in wire format */
-			ses->Suid = smb_buffer_response->Uid;
-			bcc_ptr = pByteArea(smb_buffer_response);
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-			if ((pSMBr->resp.hdr.WordCount == 3)
-			    || ((pSMBr->resp.hdr.WordCount == 4)
-				&& (blob_len < pSMBr->resp.ByteCount))) {
-				if (pSMBr->resp.hdr.WordCount == 4) {
-					bcc_ptr += blob_len;
-					cFYI(1, ("Security Blob Length %d ",
-						 blob_len));
-				}
-
-				cFYI(1, ("NTLMSSP response to Authenticate "));
-
-				if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-					if ((long) (bcc_ptr) % 2) {
-						remaining_words =
-						    (BCC(smb_buffer_response) - 1) / 2;
-						bcc_ptr++;	/* Unicode strings must be word aligned */
-					} else {
-						remaining_words = BCC(smb_buffer_response) / 2;
-					}
-					len = UniStrnlen((wchar_t *) bcc_ptr,
-							 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-					if (ses->serverOS)
-						kfree(ses->serverOS);
-					ses->serverOS =
-					    kzalloc(2 * (len + 1), GFP_KERNEL);
-					cifs_strfromUCS_le(ses->serverOS,
-							   (__le16 *) bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					remaining_words -= len + 1;
-					ses->serverOS[2 * len] = 0;
-					ses->serverOS[1 + (2 * len)] = 0;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr,
-								 remaining_words - 1);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2 * (len + 1),
-							    GFP_KERNEL);
-						cifs_strfromUCS_le(ses->serverNOS,
-								   (__le16 *) bcc_ptr,
-								   len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverNOS[2 * len] = 0;
-						ses->serverNOS[1+(2*len)] = 0;
-						remaining_words -= len + 1;
-						if (remaining_words > 0) {
-							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-	/* last string not always null terminated (e.g. for Windows XP & 2000) */
-							if (ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2 * (len + 1),
-								    GFP_KERNEL);
-							cifs_strfromUCS_le
-							    (ses->serverDomain,
-							     (__le16 *) bcc_ptr,
-							     len, nls_codepage);
-							bcc_ptr += 2 * (len + 1);
-							ses->serverDomain[2 * len] = 0;
-							ses->serverDomain[1 + (2 * len)] = 0;
-						} /* else no more room so create dummy domain string */
-						else {
-							if (ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						}
-					} else { /* no room so create dummy domain and NOS string */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(2, GFP_KERNEL);
-					}
-				} else {	/* ASCII */
-					len = strnlen(bcc_ptr, 1024);
-					if (((long) bcc_ptr + len) -
-					    (long) pByteArea(smb_buffer_response)
-					    <= BCC(smb_buffer_response)) {
-						if (ses->serverOS)
-							kfree(ses->serverOS);
-						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
-						strncpy(ses->serverOS,bcc_ptr, len);
-
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;	/* null terminate the string */
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(len+1,
-									 GFP_KERNEL);
-						strncpy(ses->serverNOS,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-							kzalloc(len+1,
-								GFP_KERNEL);
-						strncpy(ses->serverDomain,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-					} else
-						cFYI(1, ("field of length %d "
-							 "extends beyond end of smb ",
-							 len));
-				}
-			} else {
-				cERROR(1, ("Security Blob extends beyond end "
-					   "of SMB"));
-			}
-		} else {
-			cERROR(1, ("No session structure passed in."));
-		}
-	} else {
-		cERROR(1, ("Invalid Word count %d: ",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
+		if (mount_data != mount_data_global)
+			kfree(mount_data);
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+		else
+			cifs_put_tcp_session(srvTcp);
+		goto out;
 	}
 
-	cifs_buf_release(smb_buffer);
-
+	/* volume_info->password is freed above when existing session found
+	(in which case it is not needed anymore) but when new sesion is created
+	the password ptr is put in the new session structure (in which case the
+	password will be freed at unmount time) */
+out:
+	/* zero out password before freeing */
+	cleanup_volume_info(&volume_info);
+	FreeXid(xid);
 	return rc;
 }
 
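Editor's note: taken together, the hunk above turns cifs_mount() into a bounded retry loop: on -EREMOTE it fetches a DFS referral, rebuilds the mount options, tears down the half-built tcon/session, and jumps back to try_mount_again, giving up after MAX_NESTED_LINKS walks. A compact, self-contained model of that control shape follows; everything with a model_ prefix is illustrative, not the kernel code:

	#include <errno.h>

	#ifndef MAX_NESTED_LINKS
	#define MAX_NESTED_LINKS 8	/* fallback for the sketch only */
	#endif

	/* one mount attempt per iteration; the caller is assumed to
	 * re-point ctx at the referral target before the next pass */
	static int model_mount_with_referrals(int (*try_once)(void *),
					      void *ctx)
	{
		int rc, walks = 0;

		for (;;) {
			rc = try_once(ctx);
			if (rc != -EREMOTE)
				return rc;	/* success or hard error */
			if (++walks > MAX_NESTED_LINKS)
				return -ELOOP;	/* crude DFS loop guard */
		}
	}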
@@ -3556,7 +2623,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	TCONX_RSP *pSMBr;
 	unsigned char *bcc_ptr;
 	int rc = 0;
-	int length;
+	int length, bytes_left;
 	__u16 count;
 
 	if (ses == NULL)
@@ -3644,14 +2711,22 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
 			 CIFS_STD_OP);
 
-	/* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
+		bool is_unicode;
+
 		tcon->tidStatus = CifsGood;
 		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
-		length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
+		bytes_left = BCC(smb_buffer_response);
+		length = strnlen(bcc_ptr, bytes_left - 2);
+		if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
+			is_unicode = true;
+		else
+			is_unicode = false;
+
+
 		/* skip service field (NB: this field is always ASCII) */
 		if (length == 3) {
 			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
@@ -3666,40 +2741,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			}
 		}
 		bcc_ptr += length + 1;
+		bytes_left -= (length + 1);
 		strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
-		if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-			length = UniStrnlen((wchar_t *) bcc_ptr, 512);
-			if ((bcc_ptr + (2 * length)) -
-			     pByteArea(smb_buffer_response) <=
-			    BCC(smb_buffer_response)) {
-				kfree(tcon->nativeFileSystem);
-				tcon->nativeFileSystem =
-				    kzalloc(2*(length + 1), GFP_KERNEL);
-				if (tcon->nativeFileSystem)
-					cifs_strfromUCS_le(
-						tcon->nativeFileSystem,
-						(__le16 *) bcc_ptr,
-						length, nls_codepage);
-				bcc_ptr += 2 * length;
-				bcc_ptr[0] = 0;	/* null terminate the string */
-				bcc_ptr[1] = 0;
-				bcc_ptr += 2;
-			}
-			/* else do not bother copying these information fields*/
-		} else {
-			length = strnlen(bcc_ptr, 1024);
-			if ((bcc_ptr + length) -
-			    pByteArea(smb_buffer_response) <=
-			    BCC(smb_buffer_response)) {
-				kfree(tcon->nativeFileSystem);
-				tcon->nativeFileSystem =
-				    kzalloc(length + 1, GFP_KERNEL);
-				if (tcon->nativeFileSystem)
-					strncpy(tcon->nativeFileSystem, bcc_ptr,
-						length);
-			}
-			/* else do not bother copying these information fields*/
-		}
+
+		/* mostly informational -- no need to fail on error here */
+		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+						      bytes_left, is_unicode,
+						      nls_codepage);
+
+		cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem));
+
 		if ((smb_buffer_response->WordCount == 3) ||
 		     (smb_buffer_response->WordCount == 7))
 			/* field is in same location */
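Editor's note: the rewritten CIFSTCon() tail replaces hand-rolled bounds arithmetic with a running bytes_left count; each consumed field shrinks the budget, and the remainder goes straight to the duplication helper. A hedged userspace sketch of the same bookkeeping on a plain buffer (names are illustrative):

	#include <string.h>

	/* consume a NUL-terminated ASCII field, then treat whatever
	 * remains as the next field; mirrors the "- 2" headroom guard */
	static const char *consume_ascii_field(const char *p, int *bytes_left)
	{
		size_t len = strnlen(p, (size_t)(*bytes_left - 2));

		p += len + 1;			/* skip field and its NUL */
		*bytes_left -= (int)(len + 1);	/* keep the budget honest */
		return p;
	}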
@@ -3738,8 +2789,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			  struct nls_table *nls_info)
 {
 	int rc = 0;
-	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
-	bool ntlmv2_flag = false;
 	int first_time = 0;
 	struct TCP_Server_Info *server = pSesInfo->server;
 
@@ -3771,83 +2820,19 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 	pSesInfo->capabilities = server->capabilities;
 	if (linuxExtEnabled == 0)
 		pSesInfo->capabilities &= (~CAP_UNIX);
-	/* pSesInfo->sequence_number = 0;*/
+
 	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 		 server->secMode, server->capabilities, server->timeAdj));
 
-	if (experimEnabled < 2)
-		rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
-	else if (extended_security
-			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
-			&& (server->secType == NTLMSSP)) {
-		rc = -EOPNOTSUPP;
-	} else if (extended_security
-			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
-			&& (server->secType == RawNTLMSSP)) {
-		cFYI(1, ("NTLMSSP sesssetup"));
-		rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
-						   nls_info);
-		if (!rc) {
-			if (ntlmv2_flag) {
-				char *v2_response;
-				cFYI(1, ("more secure NTLM ver2 hash"));
-				if (CalcNTLMv2_partial_mac_key(pSesInfo,
-							       nls_info)) {
-					rc = -ENOMEM;
-					goto ss_err_exit;
-				} else
-					v2_response = kmalloc(16 + 64 /* blob*/,
-							      GFP_KERNEL);
-				if (v2_response) {
-					CalcNTLMv2_response(pSesInfo,
-							    v2_response);
-				/*	if (first_time)
-						cifs_calculate_ntlmv2_mac_key */
-					kfree(v2_response);
-					/* BB Put dummy sig in SessSetup PDU? */
-				} else {
-					rc = -ENOMEM;
-					goto ss_err_exit;
-				}
-
-			} else {
-				SMBNTencrypt(pSesInfo->password,
-					     server->cryptKey,
-					     ntlm_session_key);
-
-				if (first_time)
-					cifs_calculate_mac_key(
-						&server->mac_signing_key,
-						ntlm_session_key,
-						pSesInfo->password);
-			}
-			/* for better security the weaker lanman hash not sent
-			   in AuthSessSetup so we no longer calculate it */
-
-			rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo,
-						      ntlm_session_key,
-						      ntlmv2_flag,
-						      nls_info);
-		}
-	} else {	/* old style NTLM 0.12 session setup */
-		SMBNTencrypt(pSesInfo->password, server->cryptKey,
-			     ntlm_session_key);
-
-		if (first_time)
-			cifs_calculate_mac_key(&server->mac_signing_key,
-					       ntlm_session_key,
-					       pSesInfo->password);
-
-		rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
-	}
+	rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
 	if (rc) {
 		cERROR(1, ("Send error in SessSetup = %d", rc));
 	} else {
 		cFYI(1, ("CIFS Session Established successfully"));
 		spin_lock(&GlobalMid_Lock);
 		pSesInfo->status = CifsGood;
 		pSesInfo->need_reconnect = false;
 		spin_unlock(&GlobalMid_Lock);
 	}
 
 ss_err_exit:
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8d..3758965d73d5 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,12 +129,62 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+static void
+cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
+			struct cifsTconInfo *tcon, bool write_only)
+{
+	int oplock = 0;
+	struct cifsFileInfo *pCifsFile;
+	struct cifsInodeInfo *pCifsInode;
+
+	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+	if (pCifsFile == NULL)
+		return;
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+
+	pCifsFile->netfid = fileHandle;
+	pCifsFile->pid = current->tgid;
+	pCifsFile->pInode = newinode;
+	pCifsFile->invalidHandle = false;
+	pCifsFile->closePend = false;
+	mutex_init(&pCifsFile->fh_mutex);
+	mutex_init(&pCifsFile->lock_mutex);
+	INIT_LIST_HEAD(&pCifsFile->llist);
+	atomic_set(&pCifsFile->wrtPending, 0);
+
+	/* set the following in open now
+			pCifsFile->pfile = file; */
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
+	pCifsInode = CIFS_I(newinode);
+	if (pCifsInode) {
+		/* if readable file instance put first in list*/
+		if (write_only)
+			list_add_tail(&pCifsFile->flist,
+				      &pCifsInode->openFileList);
+		else
+			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+
+		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+			pCifsInode->clientCanCacheAll = true;
+			pCifsInode->clientCanCacheRead = true;
+			cFYI(1, ("Exclusive Oplock inode %p", newinode));
+		} else if ((oplock & 0xF) == OPLOCK_READ)
+			pCifsInode->clientCanCacheRead = true;
+	}
+	write_unlock(&GlobalSMBSeslock);
+}
+
 int cifs_posix_open(char *full_path, struct inode **pinode,
 		struct super_block *sb, int mode, int oflags,
 		int *poplock, __u16 *pnetfid, int xid)
 {
 	int rc;
 	__u32 oplock;
+	bool write_only = false;
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
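Editor's note on the list ordering inside cifs_fill_fileinfo(): write-only instances go to the tail so that readers scanning openFileList hit a read-capable handle first. A kernel-style sketch of a hypothetical consumer that benefits from that ordering (the helper name is invented for illustration; it is not a function this patch adds):

	/* hypothetical: with readable handles kept at the head, the
	 * first entry is usable for reads unless all opens are
	 * write-only */
	static struct cifsFileInfo *
	find_readable_file_sketch(struct cifsInodeInfo *ci)
	{
		if (list_empty(&ci->openFileList))
			return NULL;
		return list_first_entry(&ci->openFileList,
					struct cifsFileInfo, flist);
	}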
@@ -172,7 +222,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (oflags & O_DIRECT)
 		posix_flags |= SMB_O_DIRECT;
 
+	if (!(oflags & FMODE_READ))
+		write_only = true;
 
+	mode &= ~current_umask();
 	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
 			pnetfid, presp_data, &oplock, full_path,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -187,8 +240,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (!pinode)
 		goto posix_open_ret; /* caller does not need info */
 
-	if (*pinode == NULL)
-		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	if (*pinode == NULL) {
+		__u64 unique_id = le64_to_cpu(presp_data->UniqueId);
+		*pinode = cifs_new_inode(sb, &unique_id);
+	}
 	/* else an inode was passed in. Update its info, don't create one */
 
 	/* We do not need to close the file if new_inode fails since
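Editor's note: the UniqueId change above is an endianness fix. The field arrives little-endian on the wire, so handing its raw __le64 address to the inode code happens to work on little-endian hosts and silently breaks on big-endian ones. A sketch of the hazard (kernel-style fragment, not a new API):

	/* only the converted form is correct on all architectures */
	__u64 unique_id = le64_to_cpu(presp_data->UniqueId); /* host order */
	*pinode = cifs_new_inode(sb, &unique_id);            /* correct */
	/* cifs_new_inode(sb, &presp_data->UniqueId) would pass raw
	 * little-endian bytes and misnumber inodes on big-endian hosts */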
@@ -198,6 +253,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	posix_fill_in_inode(*pinode, presp_data, 1);
 
+	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+
 posix_open_ret:
 	kfree(presp_data);
 	return rc;
@@ -225,6 +282,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
 	int oflags;
+	bool posix_create = false;
 	/*
 	 * BB below access is probably too much for mknod to request
 	 * but we have to do query and setpathinfo so requesting
@@ -239,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
 	struct inode *newinode = NULL;
-	struct cifsInodeInfo *pCifsInode;
 	int disposition = FILE_OVERWRITE_IF;
 	bool write_only = false;
 
@@ -254,7 +311,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
@@ -273,12 +329,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	   negotation.  EREMOTE indicates DFS junction, which is not
 	   handled in posix open */
 
-	if ((rc == 0) && (newinode == NULL))
-		goto cifs_create_get_file_info; /* query inode info */
-	else if (rc == 0) /* success, no need to query */
-		goto cifs_create_set_dentry;
-	else if ((rc != -EIO) && (rc != -EREMOTE) &&
-		 (rc != -EOPNOTSUPP)) /* path not found or net err */
+	if (rc == 0) {
+		posix_create = true;
+		if (newinode == NULL) /* query inode info */
+			goto cifs_create_get_file_info;
+		else /* success, no need to query */
+			goto cifs_create_set_dentry;
+	} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+		 (rc != -EOPNOTSUPP) && (rc != -EINVAL))
 		goto cifs_create_out;
 	/* else fallthrough to retry, using older open call, this is
 	   case where server does not support this SMB level, and
@@ -409,45 +467,9 @@ cifs_create_set_dentry:
409 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 467 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
410 /* mknod case - do not leave file open */ 468 /* mknod case - do not leave file open */
411 CIFSSMBClose(xid, tcon, fileHandle); 469 CIFSSMBClose(xid, tcon, fileHandle);
412 } else if (newinode) { 470 } else if (!(posix_create) && (newinode)) {
413 struct cifsFileInfo *pCifsFile = 471 cifs_fill_fileinfo(newinode, fileHandle,
414 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 472 cifs_sb->tcon, write_only);
415
416 if (pCifsFile == NULL)
417 goto cifs_create_out;
418 pCifsFile->netfid = fileHandle;
419 pCifsFile->pid = current->tgid;
420 pCifsFile->pInode = newinode;
421 pCifsFile->invalidHandle = false;
422 pCifsFile->closePend = false;
423 init_MUTEX(&pCifsFile->fh_sem);
424 mutex_init(&pCifsFile->lock_mutex);
425 INIT_LIST_HEAD(&pCifsFile->llist);
426 atomic_set(&pCifsFile->wrtPending, 0);
427
428 /* set the following in open now
429 pCifsFile->pfile = file; */
430 write_lock(&GlobalSMBSeslock);
431 list_add(&pCifsFile->tlist, &tcon->openFileList);
432 pCifsInode = CIFS_I(newinode);
433 if (pCifsInode) {
434 /* if readable file instance put first in list*/
435 if (write_only) {
436 list_add_tail(&pCifsFile->flist,
437 &pCifsInode->openFileList);
438 } else {
439 list_add(&pCifsFile->flist,
440 &pCifsInode->openFileList);
441 }
442 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
443 pCifsInode->clientCanCacheAll = true;
444 pCifsInode->clientCanCacheRead = true;
445 cFYI(1, ("Exclusive Oplock inode %p",
446 newinode));
447 } else if ((oplock & 0xF) == OPLOCK_READ)
448 pCifsInode->clientCanCacheRead = true;
449 }
450 write_unlock(&GlobalSMBSeslock);
451 } 473 }
452cifs_create_out: 474cifs_create_out:
453 kfree(buf); 475 kfree(buf);
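
The block removed above is what the new cifs_fill_fileinfo() helper centralizes: allocate the handle, link it into the tcon and inode lists, and keep read-capable handles at the front of the inode list so later lookups for a readable handle need not walk past write-only entries. A standalone user-space sketch of that ordering rule (illustrative types, not the kernel structures):

    #include <stddef.h>
    #include <stdbool.h>

    struct open_handle {
        struct open_handle *next;
        bool write_only;
    };

    /* Readable handles go to the front, write-only ones to the tail. */
    static void insert_handle(struct open_handle **head, struct open_handle *h)
    {
        if (h->write_only) {
            struct open_handle **p = head;

            while (*p)
                p = &(*p)->next;    /* append at tail */
            h->next = NULL;
            *p = h;
        } else {
            h->next = *head;        /* push at front */
            *head = h;
        }
    }
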
@@ -479,7 +501,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
479 rc = -ENOMEM; 501 rc = -ENOMEM;
480 else if (pTcon->unix_ext) { 502 else if (pTcon->unix_ext) {
481 struct cifs_unix_set_info_args args = { 503 struct cifs_unix_set_info_args args = {
482 .mode = mode & ~current->fs->umask, 504 .mode = mode & ~current_umask(),
483 .ctime = NO_CHANGE_64, 505 .ctime = NO_CHANGE_64,
484 .atime = NO_CHANGE_64, 506 .atime = NO_CHANGE_64,
485 .mtime = NO_CHANGE_64, 507 .mtime = NO_CHANGE_64,
@@ -580,17 +602,20 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
580 return rc; 602 return rc;
581} 603}
582 604
583
584struct dentry * 605struct dentry *
585cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, 606cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
586 struct nameidata *nd) 607 struct nameidata *nd)
587{ 608{
588 int xid; 609 int xid;
589 int rc = 0; /* to get around spurious gcc warning, set to zero here */ 610 int rc = 0; /* to get around spurious gcc warning, set to zero here */
611 int oplock = 0;
612 __u16 fileHandle = 0;
613 bool posix_open = false;
590 struct cifs_sb_info *cifs_sb; 614 struct cifs_sb_info *cifs_sb;
591 struct cifsTconInfo *pTcon; 615 struct cifsTconInfo *pTcon;
592 struct inode *newInode = NULL; 616 struct inode *newInode = NULL;
593 char *full_path = NULL; 617 char *full_path = NULL;
618 struct file *filp;
594 619
595 xid = GetXid(); 620 xid = GetXid();
596 621
@@ -632,12 +657,43 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 } 657 }
633 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 658 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
634 659
635 if (pTcon->unix_ext) 660 /* Posix open is only called (at lookup time) for file create now.
636 rc = cifs_get_inode_info_unix(&newInode, full_path, 661 * For opens (rather than creates), because we do not know if it
637 parent_dir_inode->i_sb, xid); 662 * is a file or directory yet, and current Samba no longer allows
638 else 663 * us to do posix open on dirs, we could end up wasting an open call
664 * on what turns out to be a dir. For file opens, we wait to call posix
665 * open till cifs_open. It could be added here (lookup) in the future
666 * but the performance tradeoff of the extra network request when EISDIR
667 * or EACCES is returned would have to be weighed against the 50%
668 * reduction in network traffic in the other paths.
669 */
670 if (pTcon->unix_ext) {
671 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
672 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
673 (nd->intent.open.flags & O_CREAT)) {
674 rc = cifs_posix_open(full_path, &newInode,
675 parent_dir_inode->i_sb,
676 nd->intent.open.create_mode,
677 nd->intent.open.flags, &oplock,
678 &fileHandle, xid);
679 /*
680 * The check below works around a bug in POSIX
681 * open in samba versions 3.3.1 and earlier where
682 * open could incorrectly fail with invalid parameter.
683 * If either that or op not supported returned, follow
684 * the normal lookup.
685 */
686 if ((rc == 0) || (rc == -ENOENT))
687 posix_open = true;
688 else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
689 pTcon->broken_posix_open = true;
690 }
691 if (!posix_open)
692 rc = cifs_get_inode_info_unix(&newInode, full_path,
693 parent_dir_inode->i_sb, xid);
694 } else
639 rc = cifs_get_inode_info(&newInode, full_path, NULL, 695 rc = cifs_get_inode_info(&newInode, full_path, NULL,
640 parent_dir_inode->i_sb, xid, NULL); 696 parent_dir_inode->i_sb, xid, NULL);
641 697
642 if ((rc == 0) && (newInode != NULL)) { 698 if ((rc == 0) && (newInode != NULL)) {
643 if (pTcon->nocase) 699 if (pTcon->nocase)
@@ -645,7 +701,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
645 else 701 else
646 direntry->d_op = &cifs_dentry_ops; 702 direntry->d_op = &cifs_dentry_ops;
647 d_add(direntry, newInode); 703 d_add(direntry, newInode);
648 704 if (posix_open)
705 filp = lookup_instantiate_filp(nd, direntry, NULL);
649 /* since paths are not looked up by component - the parent 706 /* since paths are not looked up by component - the parent
650 directories are presumed to be good here */ 707 directories are presumed to be good here */
651 renew_parental_timestamps(direntry); 708 renew_parental_timestamps(direntry);
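
Condensing the lookup-time policy from the comments above: posix open is attempted only for O_CREAT opens of non-directories on a unix-extensions mount, and a server that fails it with EINVAL is remembered as broken so later lookups skip the attempt. A simplified user-space sketch (illustrative names; the broken-server test is reduced to the Samba-bug case the comment describes):

    #include <stdbool.h>
    #include <errno.h>

    struct tcon {
        bool unix_ext;
        bool broken_posix_open;
    };

    /* Decide whether this lookup should try a posix open first. */
    static bool try_posix_open(const struct tcon *t, bool creating,
                               bool want_dir)
    {
        return t->unix_ext && creating && !want_dir &&
               !t->broken_posix_open;
    }

    /* After a failed attempt, remember servers with the old Samba bug. */
    static void note_posix_open_result(struct tcon *t, int rc)
    {
        if (rc == -EINVAL)
            t->broken_posix_open = true;
    }
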
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1e0c1bd8f2e4..df4a306f697e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
78 } 78 }
79 79
80 key->type_data.x[0] = datalen; 80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip); 81 key->payload.data = ip;
82 82
83 return rc; 83 return rc;
84} 84}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 81747acca4c4..302ea15f02e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private(
46 memset(private_data, 0, sizeof(struct cifsFileInfo)); 46 memset(private_data, 0, sizeof(struct cifsFileInfo));
47 private_data->netfid = netfid; 47 private_data->netfid = netfid;
48 private_data->pid = current->tgid; 48 private_data->pid = current->tgid;
49 init_MUTEX(&private_data->fh_sem); 49 mutex_init(&private_data->fh_mutex);
50 mutex_init(&private_data->lock_mutex); 50 mutex_init(&private_data->lock_mutex);
51 INIT_LIST_HEAD(&private_data->llist); 51 INIT_LIST_HEAD(&private_data->llist);
52 private_data->pfile = file; /* needed for writepage */ 52 private_data->pfile = file; /* needed for writepage */
@@ -129,15 +129,8 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
129 struct file *file, struct cifsInodeInfo *pCifsInode, 129 struct file *file, struct cifsInodeInfo *pCifsInode,
130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) 130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
131{ 131{
132 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
133/* struct timespec temp; */ /* BB REMOVEME BB */
134 132
135 file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
136 if (file->private_data == NULL)
137 return -ENOMEM;
138 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
139 write_lock(&GlobalSMBSeslock); 133 write_lock(&GlobalSMBSeslock);
140 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
141 134
142 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 135 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
143 if (pCifsInode == NULL) { 136 if (pCifsInode == NULL) {
@@ -145,17 +138,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
145 return -EINVAL; 138 return -EINVAL;
146 } 139 }
147 140
148 /* want handles we can use to read with first
149 in the list so we do not have to walk the
150 list to search for one in write_begin */
151 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
152 list_add_tail(&pCifsFile->flist,
153 &pCifsInode->openFileList);
154 } else {
155 list_add(&pCifsFile->flist,
156 &pCifsInode->openFileList);
157 }
158
159 if (pCifsInode->clientCanCacheRead) { 141 if (pCifsInode->clientCanCacheRead) {
160 /* we have the inode open somewhere else 142 /* we have the inode open somewhere else
161 no need to discard cache data */ 143 no need to discard cache data */
@@ -198,6 +180,38 @@ psx_client_can_cache:
198 return 0; 180 return 0;
199} 181}
200 182
183static struct cifsFileInfo *
184cifs_fill_filedata(struct file *file)
185{
186 struct list_head *tmp;
187 struct cifsFileInfo *pCifsFile = NULL;
188 struct cifsInodeInfo *pCifsInode = NULL;
189
190 /* search inode for this file and fill in file->private_data */
191 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
192 read_lock(&GlobalSMBSeslock);
193 list_for_each(tmp, &pCifsInode->openFileList) {
194 pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
195 if ((pCifsFile->pfile == NULL) &&
196 (pCifsFile->pid == current->tgid)) {
197 /* mode set in cifs_create */
198
199 /* needed for writepage */
200 pCifsFile->pfile = file;
201 file->private_data = pCifsFile;
202 break;
203 }
204 }
205 read_unlock(&GlobalSMBSeslock);
206
207 if (file->private_data != NULL) {
208 return pCifsFile;
209 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
210 cERROR(1, ("could not find file instance for "
211 "new file %p", file));
212 return NULL;
213}
214
201/* all arguments to this function must be checked for validity in caller */ 215/* all arguments to this function must be checked for validity in caller */
202static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, 216static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
203 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, 217 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
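
The new cifs_fill_filedata() above walks the inode's open-handle list and claims the first entry that was opened by this process but has no struct file attached yet. A user-space sketch of the same search over a simple singly linked list (illustrative layout):

    #include <stddef.h>

    /* One cached open handle on an inode's list. */
    struct handle {
        struct handle *next;
        int pid;
        void *pfile;        /* NULL until a struct file claims it */
    };

    /* Claim the first unclaimed handle opened by this process. */
    static struct handle *claim_handle(struct handle *head, int my_pid,
                                       void *file)
    {
        struct handle *h;

        for (h = head; h != NULL; h = h->next) {
            if (h->pfile == NULL && h->pid == my_pid) {
                h->pfile = file;    /* needed later for writepage */
                return h;
            }
        }
        return NULL;
    }
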
@@ -272,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file)
272 struct cifsTconInfo *tcon; 286 struct cifsTconInfo *tcon;
273 struct cifsFileInfo *pCifsFile; 287 struct cifsFileInfo *pCifsFile;
274 struct cifsInodeInfo *pCifsInode; 288 struct cifsInodeInfo *pCifsInode;
275 struct list_head *tmp;
276 char *full_path = NULL; 289 char *full_path = NULL;
277 int desiredAccess; 290 int desiredAccess;
278 int disposition; 291 int disposition;
@@ -284,34 +297,11 @@ int cifs_open(struct inode *inode, struct file *file)
284 cifs_sb = CIFS_SB(inode->i_sb); 297 cifs_sb = CIFS_SB(inode->i_sb);
285 tcon = cifs_sb->tcon; 298 tcon = cifs_sb->tcon;
286 299
287 if (file->f_flags & O_CREAT) { 300 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
288 /* search inode for this file and fill in file->private_data */ 301 pCifsFile = cifs_fill_filedata(file);
289 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 302 if (pCifsFile) {
290 read_lock(&GlobalSMBSeslock); 303 FreeXid(xid);
291 list_for_each(tmp, &pCifsInode->openFileList) { 304 return 0;
292 pCifsFile = list_entry(tmp, struct cifsFileInfo,
293 flist);
294 if ((pCifsFile->pfile == NULL) &&
295 (pCifsFile->pid == current->tgid)) {
296 /* mode set in cifs_create */
297
298 /* needed for writepage */
299 pCifsFile->pfile = file;
300
301 file->private_data = pCifsFile;
302 break;
303 }
304 }
305 read_unlock(&GlobalSMBSeslock);
306 if (file->private_data != NULL) {
307 rc = 0;
308 FreeXid(xid);
309 return rc;
310 } else {
311 if (file->f_flags & O_EXCL)
312 cERROR(1, ("could not find file instance for "
313 "new file %p", file));
314 }
315 } 305 }
316 306
317 full_path = build_path_from_dentry(file->f_path.dentry); 307 full_path = build_path_from_dentry(file->f_path.dentry);
@@ -342,6 +332,7 @@ int cifs_open(struct inode *inode, struct file *file)
342 /* no need for special case handling of setting mode 332 /* no need for special case handling of setting mode
343 on read only files needed here */ 333 on read only files needed here */
344 334
335 pCifsFile = cifs_fill_filedata(file);
345 cifs_posix_open_inode_helper(inode, file, pCifsInode, 336 cifs_posix_open_inode_helper(inode, file, pCifsInode,
346 pCifsFile, oplock, netfid); 337 pCifsFile, oplock, netfid);
347 goto out; 338 goto out;
@@ -500,9 +491,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
500 return -EBADF; 491 return -EBADF;
501 492
502 xid = GetXid(); 493 xid = GetXid();
 503 down(&pCifsFile->fh_sem); 494 mutex_lock(&pCifsFile->fh_mutex);
504 if (!pCifsFile->invalidHandle) { 495 if (!pCifsFile->invalidHandle) {
 505 up(&pCifsFile->fh_sem); 496 mutex_unlock(&pCifsFile->fh_mutex);
506 FreeXid(xid); 497 FreeXid(xid);
507 return 0; 498 return 0;
508 } 499 }
@@ -533,7 +524,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
533 if (full_path == NULL) { 524 if (full_path == NULL) {
534 rc = -ENOMEM; 525 rc = -ENOMEM;
535reopen_error_exit: 526reopen_error_exit:
 536 up(&pCifsFile->fh_sem); 527 mutex_unlock(&pCifsFile->fh_mutex);
537 FreeXid(xid); 528 FreeXid(xid);
538 return rc; 529 return rc;
539 } 530 }
@@ -575,14 +566,14 @@ reopen_error_exit:
575 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 566 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
576 CIFS_MOUNT_MAP_SPECIAL_CHR); 567 CIFS_MOUNT_MAP_SPECIAL_CHR);
577 if (rc) { 568 if (rc) {
 578 up(&pCifsFile->fh_sem); 569 mutex_unlock(&pCifsFile->fh_mutex);
579 cFYI(1, ("cifs_open returned 0x%x", rc)); 570 cFYI(1, ("cifs_open returned 0x%x", rc));
580 cFYI(1, ("oplock: %d", oplock)); 571 cFYI(1, ("oplock: %d", oplock));
581 } else { 572 } else {
582reopen_success: 573reopen_success:
583 pCifsFile->netfid = netfid; 574 pCifsFile->netfid = netfid;
584 pCifsFile->invalidHandle = false; 575 pCifsFile->invalidHandle = false;
 585 up(&pCifsFile->fh_sem); 576 mutex_unlock(&pCifsFile->fh_mutex);
586 pCifsInode = CIFS_I(inode); 577 pCifsInode = CIFS_I(inode);
587 if (pCifsInode) { 578 if (pCifsInode) {
588 if (can_flush) { 579 if (can_flush) {
@@ -971,6 +962,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
971 return rc; 962 return rc;
972} 963}
973 964
965/*
966 * Set the timeout on write requests past EOF. For some servers (Windows)
967 * these calls can be very long.
968 *
969 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
970 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
971 * The 10M cutoff is totally arbitrary. A better scheme for this would be
972 * welcome if someone wants to suggest one.
973 *
974 * We may be able to do a better job with this if there were some way to
975 * declare that a file should be sparse.
976 */
977static int
978cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
979{
980 if (offset <= cifsi->server_eof)
981 return CIFS_STD_OP;
982 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
983 return CIFS_VLONG_OP;
984 else
985 return CIFS_LONG_OP;
986}
987
988/* update the file size (if needed) after a write */
989static void
990cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
991 unsigned int bytes_written)
992{
993 loff_t end_of_write = offset + bytes_written;
994
995 if (end_of_write > cifsi->server_eof)
996 cifsi->server_eof = end_of_write;
997}
998
974ssize_t cifs_user_write(struct file *file, const char __user *write_data, 999ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 size_t write_size, loff_t *poffset) 1000 size_t write_size, loff_t *poffset)
976{ 1001{
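
A worked example of the bucketing implemented by cifs_write_timeout() above, as a user-space sketch with illustrative second values standing in for the CIFS_STD_OP/CIFS_LONG_OP/CIFS_VLONG_OP constants:

    #include <stdint.h>

    enum {
        TIMEOUT_STD   = 15,     /* writes not past EOF */
        TIMEOUT_LONG  = 45,     /* up to 10M past EOF */
        TIMEOUT_VLONG = 180     /* more than 10M past EOF */
    };

    static int write_timeout_secs(uint64_t server_eof, uint64_t offset)
    {
        if (offset <= server_eof)
            return TIMEOUT_STD;
        if (offset > server_eof + 10ULL * 1024 * 1024)
            return TIMEOUT_VLONG;
        return TIMEOUT_LONG;
    }
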
@@ -981,6 +1006,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
981 struct cifsTconInfo *pTcon; 1006 struct cifsTconInfo *pTcon;
982 int xid, long_op; 1007 int xid, long_op;
983 struct cifsFileInfo *open_file; 1008 struct cifsFileInfo *open_file;
1009 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
984 1010
985 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1011 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
986 1012
@@ -1000,11 +1026,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1000 1026
1001 xid = GetXid(); 1027 xid = GetXid();
1002 1028
1003 if (*poffset > file->f_path.dentry->d_inode->i_size) 1029 long_op = cifs_write_timeout(cifsi, *poffset);
1004 long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
1005 else
1006 long_op = CIFS_LONG_OP;
1007
1008 for (total_written = 0; write_size > total_written; 1030 for (total_written = 0; write_size > total_written;
1009 total_written += bytes_written) { 1031 total_written += bytes_written) {
1010 rc = -EAGAIN; 1032 rc = -EAGAIN;
@@ -1048,8 +1070,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1048 FreeXid(xid); 1070 FreeXid(xid);
1049 return rc; 1071 return rc;
1050 } 1072 }
1051 } else 1073 } else {
1074 cifs_update_eof(cifsi, *poffset, bytes_written);
1052 *poffset += bytes_written; 1075 *poffset += bytes_written;
1076 }
1053 long_op = CIFS_STD_OP; /* subsequent writes fast - 1077 long_op = CIFS_STD_OP; /* subsequent writes fast -
1054 15 seconds is plenty */ 1078 15 seconds is plenty */
1055 } 1079 }
@@ -1085,6 +1109,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1085 struct cifsTconInfo *pTcon; 1109 struct cifsTconInfo *pTcon;
1086 int xid, long_op; 1110 int xid, long_op;
1087 struct cifsFileInfo *open_file; 1111 struct cifsFileInfo *open_file;
1112 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
1088 1113
1089 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1114 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1090 1115
@@ -1099,11 +1124,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1099 1124
1100 xid = GetXid(); 1125 xid = GetXid();
1101 1126
1102 if (*poffset > file->f_path.dentry->d_inode->i_size) 1127 long_op = cifs_write_timeout(cifsi, *poffset);
1103 long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
1104 else
1105 long_op = CIFS_LONG_OP;
1106
1107 for (total_written = 0; write_size > total_written; 1128 for (total_written = 0; write_size > total_written;
1108 total_written += bytes_written) { 1129 total_written += bytes_written) {
1109 rc = -EAGAIN; 1130 rc = -EAGAIN;
@@ -1166,8 +1187,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1166 FreeXid(xid); 1187 FreeXid(xid);
1167 return rc; 1188 return rc;
1168 } 1189 }
1169 } else 1190 } else {
1191 cifs_update_eof(cifsi, *poffset, bytes_written);
1170 *poffset += bytes_written; 1192 *poffset += bytes_written;
1193 }
1171 long_op = CIFS_STD_OP; /* subsequent writes fast - 1194 long_op = CIFS_STD_OP; /* subsequent writes fast -
1172 15 seconds is plenty */ 1195 15 seconds is plenty */
1173 } 1196 }
@@ -1380,11 +1403,12 @@ static int cifs_writepages(struct address_space *mapping,
1380 int nr_pages; 1403 int nr_pages;
1381 __u64 offset = 0; 1404 __u64 offset = 0;
1382 struct cifsFileInfo *open_file; 1405 struct cifsFileInfo *open_file;
1406 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1383 struct page *page; 1407 struct page *page;
1384 struct pagevec pvec; 1408 struct pagevec pvec;
1385 int rc = 0; 1409 int rc = 0;
1386 int scanned = 0; 1410 int scanned = 0;
1387 int xid; 1411 int xid, long_op;
1388 1412
1389 cifs_sb = CIFS_SB(mapping->host->i_sb); 1413 cifs_sb = CIFS_SB(mapping->host->i_sb);
1390 1414
@@ -1528,12 +1552,15 @@ retry:
1528 cERROR(1, ("No writable handles for inode")); 1552 cERROR(1, ("No writable handles for inode"));
1529 rc = -EBADF; 1553 rc = -EBADF;
1530 } else { 1554 } else {
1555 long_op = cifs_write_timeout(cifsi, offset);
1531 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1556 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1532 open_file->netfid, 1557 open_file->netfid,
1533 bytes_to_write, offset, 1558 bytes_to_write, offset,
1534 &bytes_written, iov, n_iov, 1559 &bytes_written, iov, n_iov,
1535 CIFS_LONG_OP); 1560 long_op);
1536 atomic_dec(&open_file->wrtPending); 1561 atomic_dec(&open_file->wrtPending);
1562 cifs_update_eof(cifsi, offset, bytes_written);
1563
1537 if (rc || bytes_written < bytes_to_write) { 1564 if (rc || bytes_written < bytes_to_write) {
1538 cERROR(1, ("Write2 ret %d, wrote %d", 1565 cERROR(1, ("Write2 ret %d, wrote %d",
1539 rc, bytes_written)); 1566 rc, bytes_written));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc60805..9c869a6dcba1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode,
143 143
144 inode->i_nlink = le64_to_cpu(info->Nlinks); 144 inode->i_nlink = le64_to_cpu(info->Nlinks);
145 145
146 cifsInfo->server_eof = end_of_file;
146 spin_lock(&inode->i_lock); 147 spin_lock(&inode->i_lock);
147 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 148 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
148 /* 149 /*
@@ -276,7 +277,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
276 277
277 /* get new inode */ 278 /* get new inode */
278 if (*pinode == NULL) { 279 if (*pinode == NULL) {
279 *pinode = cifs_new_inode(sb, &find_data.UniqueId); 280 __u64 unique_id = le64_to_cpu(find_data.UniqueId);
281 *pinode = cifs_new_inode(sb, &unique_id);
280 if (*pinode == NULL) { 282 if (*pinode == NULL) {
281 rc = -ENOMEM; 283 rc = -ENOMEM;
282 goto cgiiu_exit; 284 goto cgiiu_exit;
@@ -605,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode,
605 inode->i_mode |= S_IFREG; 607 inode->i_mode |= S_IFREG;
606 } 608 }
607 609
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile);
608 spin_lock(&inode->i_lock); 611 spin_lock(&inode->i_lock);
609 if (is_size_safe_to_change(cifsInfo, 612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) {
610 le64_to_cpu(pfindData->EndOfFile))) {
611 /* can not safely shrink the file size here if the 613 /* can not safely shrink the file size here if the
612 client is writing to it due to potential races */ 614 client is writing to it due to potential races */
613 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); 615 i_size_write(inode, cifsInfo->server_eof);
614 616
615 /* 512 bytes (2**9) is the fake blocksize that must be 617 /* 512 bytes (2**9) is the fake blocksize that must be
616 used for this calculation */ 618 used for this calculation */
@@ -960,13 +962,21 @@ undo_setattr:
960 goto out_close; 962 goto out_close;
961} 963}
962 964
965
966/*
967 * If dentry->d_inode is null (usually meaning the cached dentry
968 * is a negative dentry) then we would attempt a standard SMB delete, but
 969 * if that fails we cannot attempt the fallback mechanisms on EACCES
 970 * but will return the EACCES to the caller. Note that the VFS does not call
971 * unlink on negative dentries currently.
972 */
963int cifs_unlink(struct inode *dir, struct dentry *dentry) 973int cifs_unlink(struct inode *dir, struct dentry *dentry)
964{ 974{
965 int rc = 0; 975 int rc = 0;
966 int xid; 976 int xid;
967 char *full_path = NULL; 977 char *full_path = NULL;
968 struct inode *inode = dentry->d_inode; 978 struct inode *inode = dentry->d_inode;
969 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 979 struct cifsInodeInfo *cifs_inode;
970 struct super_block *sb = dir->i_sb; 980 struct super_block *sb = dir->i_sb;
971 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 981 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
972 struct cifsTconInfo *tcon = cifs_sb->tcon; 982 struct cifsTconInfo *tcon = cifs_sb->tcon;
@@ -1010,7 +1020,7 @@ psx_del_no_retry:
1010 rc = cifs_rename_pending_delete(full_path, dentry, xid); 1020 rc = cifs_rename_pending_delete(full_path, dentry, xid);
1011 if (rc == 0) 1021 if (rc == 0)
1012 drop_nlink(inode); 1022 drop_nlink(inode);
1013 } else if (rc == -EACCES && dosattr == 0) { 1023 } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
1014 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 1024 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
1015 if (attrs == NULL) { 1025 if (attrs == NULL) {
1016 rc = -ENOMEM; 1026 rc = -ENOMEM;
@@ -1018,7 +1028,8 @@ psx_del_no_retry:
1018 } 1028 }
1019 1029
1020 /* try to reset dos attributes */ 1030 /* try to reset dos attributes */
1021 origattr = cifsInode->cifsAttrs; 1031 cifs_inode = CIFS_I(inode);
1032 origattr = cifs_inode->cifsAttrs;
1022 if (origattr == 0) 1033 if (origattr == 0)
1023 origattr |= ATTR_NORMAL; 1034 origattr |= ATTR_NORMAL;
1024 dosattr = origattr & ~ATTR_READONLY; 1035 dosattr = origattr & ~ATTR_READONLY;
@@ -1039,13 +1050,13 @@ psx_del_no_retry:
1039 1050
1040out_reval: 1051out_reval:
1041 if (inode) { 1052 if (inode) {
1042 cifsInode = CIFS_I(inode); 1053 cifs_inode = CIFS_I(inode);
1043 cifsInode->time = 0; /* will force revalidate to get info 1054 cifs_inode->time = 0; /* will force revalidate to get info
1044 when needed */ 1055 when needed */
1045 inode->i_ctime = current_fs_time(sb); 1056 inode->i_ctime = current_fs_time(sb);
1046 } 1057 }
1047 dir->i_ctime = dir->i_mtime = current_fs_time(sb); 1058 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
1048 cifsInode = CIFS_I(dir); 1059 cifs_inode = CIFS_I(dir);
1049 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ 1060 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
1050 1061
1051 kfree(full_path); 1062 kfree(full_path);
@@ -1125,7 +1136,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1125 goto mkdir_out; 1136 goto mkdir_out;
1126 } 1137 }
1127 1138
1128 mode &= ~current->fs->umask; 1139 mode &= ~current_umask();
1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, 1140 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1130 mode, NULL /* netfid */, pInfo, &oplock, 1141 mode, NULL /* netfid */, pInfo, &oplock,
1131 full_path, cifs_sb->local_nls, 1142 full_path, cifs_sb->local_nls,
@@ -1138,6 +1149,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1138 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1149 cFYI(1, ("posix mkdir returned 0x%x", rc));
1139 d_drop(direntry); 1150 d_drop(direntry);
1140 } else { 1151 } else {
1152 __u64 unique_id;
1141 if (pInfo->Type == cpu_to_le32(-1)) { 1153 if (pInfo->Type == cpu_to_le32(-1)) {
1142 /* no return info, go query for it */ 1154 /* no return info, go query for it */
1143 kfree(pInfo); 1155 kfree(pInfo);
@@ -1151,8 +1163,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1151 else 1163 else
1152 direntry->d_op = &cifs_dentry_ops; 1164 direntry->d_op = &cifs_dentry_ops;
1153 1165
1154 newinode = cifs_new_inode(inode->i_sb, 1166 unique_id = le64_to_cpu(pInfo->UniqueId);
1155 &pInfo->UniqueId); 1167 newinode = cifs_new_inode(inode->i_sb, &unique_id);
1156 if (newinode == NULL) { 1168 if (newinode == NULL) {
1157 kfree(pInfo); 1169 kfree(pInfo);
1158 goto mkdir_get_info; 1170 goto mkdir_get_info;
@@ -1204,7 +1216,7 @@ mkdir_get_info:
1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1216 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1205 direntry->d_inode->i_nlink = 2; 1217 direntry->d_inode->i_nlink = 2;
1206 1218
1207 mode &= ~current->fs->umask; 1219 mode &= ~current_umask();
1208 /* must turn on setgid bit if parent dir has it */ 1220 /* must turn on setgid bit if parent dir has it */
1209 if (inode->i_mode & S_ISGID) 1221 if (inode->i_mode & S_ISGID)
1210 mode |= S_ISGID; 1222 mode |= S_ISGID;
@@ -1450,7 +1462,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1450 checking the UniqueId via FILE_INTERNAL_INFO */ 1462 checking the UniqueId via FILE_INTERNAL_INFO */
1451 1463
1452unlink_target: 1464unlink_target:
1453 if ((rc == -EACCES) || (rc == -EEXIST)) { 1465 /* Try unlinking the target dentry if it's not negative */
1466 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1454 tmprc = cifs_unlink(target_dir, target_dentry); 1467 tmprc = cifs_unlink(target_dir, target_dentry);
1455 if (tmprc) 1468 if (tmprc)
1456 goto cifs_rename_exit; 1469 goto cifs_rename_exit;
@@ -1753,6 +1766,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1753 } 1766 }
1754 1767
1755 if (rc == 0) { 1768 if (rc == 0) {
1769 cifsInode->server_eof = attrs->ia_size;
1756 rc = cifs_vmtruncate(inode, attrs->ia_size); 1770 rc = cifs_vmtruncate(inode, attrs->ia_size);
1757 cifs_truncate_page(inode->i_mapping, inode->i_size); 1771 cifs_truncate_page(inode->i_mapping, inode->i_size);
1758 } 1772 }
@@ -1792,20 +1806,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1792 goto out; 1806 goto out;
1793 } 1807 }
1794 1808
1795 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1809 /*
1796 /* 1810 * Attempt to flush data before changing attributes. We need to do
1797 Flush data before changing file size or changing the last 1811 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1798 write time of the file on the server. If the 1812 * ownership or mode then we may also need to do this. Here, we take
1799 flush returns error, store it to report later and continue. 1813 * the safe way out and just do the flush on all setattr requests. If
1800 BB: This should be smarter. Why bother flushing pages that 1814 * the flush returns error, store it to report later and continue.
1801 will be truncated anyway? Also, should we error out here if 1815 *
1802 the flush returns error? 1816 * BB: This should be smarter. Why bother flushing pages that
1803 */ 1817 * will be truncated anyway? Also, should we error out here if
1804 rc = filemap_write_and_wait(inode->i_mapping); 1818 * the flush returns error?
1805 if (rc != 0) { 1819 */
1806 cifsInode->write_behind_rc = rc; 1820 rc = filemap_write_and_wait(inode->i_mapping);
1807 rc = 0; 1821 if (rc != 0) {
1808 } 1822 cifsInode->write_behind_rc = rc;
1823 rc = 0;
1809 } 1824 }
1810 1825
1811 if (attrs->ia_valid & ATTR_SIZE) { 1826 if (attrs->ia_valid & ATTR_SIZE) {
@@ -1903,20 +1918,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1903 return -ENOMEM; 1918 return -ENOMEM;
1904 } 1919 }
1905 1920
1906 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1921 /*
1907 /* 1922 * Attempt to flush data before changing attributes. We need to do
1908 Flush data before changing file size or changing the last 1923 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1909 write time of the file on the server. If the 1924 * ownership or mode then we may also need to do this. Here, we take
1910 flush returns error, store it to report later and continue. 1925 * the safe way out and just do the flush on all setattr requests. If
1911 BB: This should be smarter. Why bother flushing pages that 1926 * the flush returns error, store it to report later and continue.
1912 will be truncated anyway? Also, should we error out here if 1927 *
1913 the flush returns error? 1928 * BB: This should be smarter. Why bother flushing pages that
1914 */ 1929 * will be truncated anyway? Also, should we error out here if
1915 rc = filemap_write_and_wait(inode->i_mapping); 1930 * the flush returns error?
1916 if (rc != 0) { 1931 */
1917 cifsInode->write_behind_rc = rc; 1932 rc = filemap_write_and_wait(inode->i_mapping);
1918 rc = 0; 1933 if (rc != 0) {
1919 } 1934 cifsInode->write_behind_rc = rc;
1935 rc = 0;
1920 } 1936 }
1921 1937
1922 if (attrs->ia_valid & ATTR_SIZE) { 1938 if (attrs->ia_valid & ATTR_SIZE) {
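
The rewritten comment blocks above describe a deferred-error pattern: a failure from the pre-setattr flush is stored on the inode rather than failing the attribute change itself, to be reported by a later writeback-aware caller. A user-space sketch of the pattern (illustrative names, not the kernel API):

    struct inode_state {
        int write_behind_rc;    /* deferred writeback error, 0 if none */
    };

    /* Flush before setattr: stash any error for a later fsync/close to
     * report, and let the attribute change itself proceed. */
    static int flush_before_setattr(struct inode_state *st, int flush_rc)
    {
        if (flush_rc != 0) {
            st->write_behind_rc = flush_rc;
            flush_rc = 0;
        }
        return flush_rc;
    }
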
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 63f644000ce5..cd83c53fcbb5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -107,63 +107,51 @@ void *
107cifs_follow_link(struct dentry *direntry, struct nameidata *nd) 107cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
108{ 108{
109 struct inode *inode = direntry->d_inode; 109 struct inode *inode = direntry->d_inode;
110 int rc = -EACCES; 110 int rc = -ENOMEM;
111 int xid; 111 int xid;
112 char *full_path = NULL; 112 char *full_path = NULL;
113 char *target_path = ERR_PTR(-ENOMEM); 113 char *target_path = NULL;
114 struct cifs_sb_info *cifs_sb; 114 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
115 struct cifsTconInfo *pTcon; 115 struct cifsTconInfo *tcon = cifs_sb->tcon;
116 116
117 xid = GetXid(); 117 xid = GetXid();
118 118
119 full_path = build_path_from_dentry(direntry); 119 /*
120 120 * For now, we just handle symlinks with unix extensions enabled.
121 if (!full_path) 121 * Eventually we should handle NTFS reparse points, and MacOS
122 goto out_no_free; 122 * symlink support. For instance...
123 123 *
124 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 124 * rc = CIFSSMBQueryReparseLinkInfo(...)
125 cifs_sb = CIFS_SB(inode->i_sb); 125 *
126 pTcon = cifs_sb->tcon; 126 * For now, just return -EACCES when the server doesn't support posix
127 target_path = kmalloc(PATH_MAX, GFP_KERNEL); 127 * extensions. Note that we still allow querying symlinks when posix
128 if (!target_path) { 128 * extensions are manually disabled. We could disable these as well
129 target_path = ERR_PTR(-ENOMEM); 129 * but there doesn't seem to be any harm in allowing the client to
130 * read them.
131 */
132 if (!(tcon->ses->capabilities & CAP_UNIX)) {
133 rc = -EACCES;
130 goto out; 134 goto out;
131 } 135 }
132 136
133 /* We could change this to: 137 full_path = build_path_from_dentry(direntry);
134 if (pTcon->unix_ext) 138 if (!full_path)
135 but there does not seem any point in refusing to 139 goto out;
136 get symlink info if we can, even if unix extensions
137 turned off for this mount */
138
139 if (pTcon->ses->capabilities & CAP_UNIX)
140 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
141 target_path,
142 PATH_MAX-1,
143 cifs_sb->local_nls);
144 else {
145 /* BB add read reparse point symlink code here */
146 /* rc = CIFSSMBQueryReparseLinkInfo */
147 /* BB Add code to Query ReparsePoint info */
148 /* BB Add MAC style xsymlink check here if enabled */
149 }
150
151 if (rc == 0) {
152 140
153/* BB Add special case check for Samba DFS symlinks */ 141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
154 142
155 target_path[PATH_MAX-1] = 0; 143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
156 } else { 144 cifs_sb->local_nls);
145 kfree(full_path);
146out:
147 if (rc != 0) {
157 kfree(target_path); 148 kfree(target_path);
158 target_path = ERR_PTR(rc); 149 target_path = ERR_PTR(rc);
159 } 150 }
160 151
161out:
162 kfree(full_path);
163out_no_free:
164 FreeXid(xid); 152 FreeXid(xid);
165 nd_set_link(nd, target_path); 153 nd_set_link(nd, target_path);
166 return NULL; /* No cookie */ 154 return NULL;
167} 155}
168 156
169int 157int
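
The rewritten cifs_follow_link() above leans on the kernel's ERR_PTR convention: a single pointer either carries the target path or encodes a negative errno for nd_set_link() to surface. A user-space sketch of the convention, with stand-ins for the real ERR_PTR()/IS_ERR() macros:

    #include <errno.h>
    #include <stdint.h>

    static inline void *err_ptr(long err)
    {
        return (void *)err;
    }

    static inline int is_err(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-4095;    /* top 4095 values */
    }

    /* Either a resolved target or an encoded error, never both. */
    static void *follow_link(int have_unix_ext)
    {
        if (!have_unix_ext)
            return err_ptr(-EACCES);
        return "target/path";
    }
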
@@ -224,98 +212,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
224 return rc; 212 return rc;
225} 213}
226 214
227int
228cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
229{
230 struct inode *inode = direntry->d_inode;
231 int rc = -EACCES;
232 int xid;
233 int oplock = 0;
234 struct cifs_sb_info *cifs_sb;
235 struct cifsTconInfo *pTcon;
236 char *full_path = NULL;
237 char *tmpbuffer;
238 int len;
239 __u16 fid;
240
241 xid = GetXid();
242 cifs_sb = CIFS_SB(inode->i_sb);
243 pTcon = cifs_sb->tcon;
244
245/* BB would it be safe against deadlock to grab this sem
246 even though rename itself grabs the sem and calls lookup? */
247/* mutex_lock(&inode->i_sb->s_vfs_rename_mutex);*/
248 full_path = build_path_from_dentry(direntry);
249/* mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);*/
250
251 if (full_path == NULL) {
252 FreeXid(xid);
253 return -ENOMEM;
254 }
255
256 cFYI(1,
257 ("Full path: %s inode = 0x%p pBuffer = 0x%p buflen = %d",
258 full_path, inode, pBuffer, buflen));
259 if (buflen > PATH_MAX)
260 len = PATH_MAX;
261 else
262 len = buflen;
263 tmpbuffer = kmalloc(len, GFP_KERNEL);
264 if (tmpbuffer == NULL) {
265 kfree(full_path);
266 FreeXid(xid);
267 return -ENOMEM;
268 }
269
270/* BB add read reparse point symlink code and
271 Unix extensions symlink code here BB */
272/* We could disable this based on pTcon->unix_ext flag instead ... but why? */
273 if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
274 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
275 tmpbuffer,
276 len - 1,
277 cifs_sb->local_nls);
278 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
279 cERROR(1, ("SFU style symlinks not implemented yet"));
280 /* add open and read as in fs/cifs/inode.c */
281 } else {
282 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
283 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
284 cifs_sb->local_nls,
285 cifs_sb->mnt_cifs_flags &
286 CIFS_MOUNT_MAP_SPECIAL_CHR);
287 if (!rc) {
288 rc = CIFSSMBQueryReparseLinkInfo(xid, pTcon, full_path,
289 tmpbuffer,
290 len - 1,
291 fid,
292 cifs_sb->local_nls);
293 if (CIFSSMBClose(xid, pTcon, fid)) {
294 cFYI(1, ("Error closing junction point "
295 "(open for ioctl)"));
296 }
297 /* If it is a DFS junction earlier we would have gotten
298 PATH_NOT_COVERED returned from server so we do
299 not need to request the DFS info here */
300 }
301 }
302 /* BB Anything else to do to handle recursive links? */
303 /* BB Should we be using page ops here? */
304
305 /* BB null terminate returned string in pBuffer? BB */
306 if (rc == 0) {
307 rc = vfs_readlink(direntry, pBuffer, len, tmpbuffer);
308 cFYI(1,
309 ("vfs_readlink called from cifs_readlink returned %d",
310 rc));
311 }
312
313 kfree(tmpbuffer);
314 kfree(full_path);
315 FreeXid(xid);
316 return rc;
317}
318
319void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) 215void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
320{ 216{
321 char *p = nd_get_link(nd); 217 char *p = nd_get_link(nd);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4c89c572891a..e079a9190ec4 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -635,77 +635,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
635 return; 635 return;
636} 636}
637 637
638/* Windows maps these to the user defined 16 bit Unicode range since they are
639 reserved symbols (along with \ and /), otherwise illegal to store
640 in filenames in NTFS */
641#define UNI_ASTERIK (__u16) ('*' + 0xF000)
642#define UNI_QUESTION (__u16) ('?' + 0xF000)
643#define UNI_COLON (__u16) (':' + 0xF000)
644#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
645#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
646#define UNI_PIPE (__u16) ('|' + 0xF000)
647#define UNI_SLASH (__u16) ('\\' + 0xF000)
648
649/* Convert 16 bit Unicode pathname from wire format to string in current code
650 page. Conversion may involve remapping up the seven characters that are
651 only legal in POSIX-like OS (if they are present in the string). Path
652 names are little endian 16 bit Unicode on the wire */
653int
654cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
655 const struct nls_table *cp)
656{
657 int i, j, len;
658 __u16 src_char;
659
660 for (i = 0, j = 0; i < maxlen; i++) {
661 src_char = le16_to_cpu(source[i]);
662 switch (src_char) {
663 case 0:
664 goto cUCS_out; /* BB check this BB */
665 case UNI_COLON:
666 target[j] = ':';
667 break;
668 case UNI_ASTERIK:
669 target[j] = '*';
670 break;
671 case UNI_QUESTION:
672 target[j] = '?';
673 break;
674 /* BB We can not handle remapping slash until
675 all the calls to build_path_from_dentry
676 are modified, as they use slash as separator BB */
677 /* case UNI_SLASH:
678 target[j] = '\\';
679 break;*/
680 case UNI_PIPE:
681 target[j] = '|';
682 break;
683 case UNI_GRTRTHAN:
684 target[j] = '>';
685 break;
686 case UNI_LESSTHAN:
687 target[j] = '<';
688 break;
689 default:
690 len = cp->uni2char(src_char, &target[j],
691 NLS_MAX_CHARSET_SIZE);
692 if (len > 0) {
693 j += len;
694 continue;
695 } else {
696 target[j] = '?';
697 }
698 }
699 j++;
700 /* make sure we do not overrun callers allocated temp buffer */
701 if (j >= (2 * NAME_MAX))
702 break;
703 }
704cUCS_out:
705 target[j] = 0;
706 return j;
707}
708
709/* Convert 16 bit Unicode pathname to wire format from string in current code 638/* Convert 16 bit Unicode pathname to wire format from string in current code
710 page. Conversion may involve remapping up the seven characters that are 639 page. Conversion may involve remapping up the seven characters that are
711 only legal in POSIX-like OS (if they are present in the string). Path 640 only legal in POSIX-like OS (if they are present in the string). Path
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8703d68f5b20..e2fe998989a3 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -79,6 +79,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
79 {ErrQuota, -EDQUOT}, 79 {ErrQuota, -EDQUOT},
80 {ErrNotALink, -ENOLINK}, 80 {ErrNotALink, -ENOLINK},
81 {ERRnetlogonNotStarted, -ENOPROTOOPT}, 81 {ERRnetlogonNotStarted, -ENOPROTOOPT},
82 {ERRsymlink, -EOPNOTSUPP},
82 {ErrTooManyLinks, -EMLINK}, 83 {ErrTooManyLinks, -EMLINK},
83 {0, 0} 84 {0, 0}
84}; 85};
@@ -714,6 +715,7 @@ static const struct {
714 ERRDOS, ERRnoaccess, 0xc000028f}, { 715 ERRDOS, ERRnoaccess, 0xc000028f}, {
715 ERRDOS, ERRnoaccess, 0xc0000290}, { 716 ERRDOS, ERRnoaccess, 0xc0000290}, {
716 ERRDOS, ERRbadfunc, 0xc000029c}, { 717 ERRDOS, ERRbadfunc, 0xc000029c}, {
718 ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
717 ERRDOS, ERRinvlevel, 0x007c0001}, }; 719 ERRDOS, ERRinvlevel, 0x007c0001}, };
718 720
719/***************************************************************************** 721/*****************************************************************************
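
The additions above route NT_STATUS_STOPPED_ON_SYMLINK through the DOS-error table to -EOPNOTSUPP. Collapsing that two-step mapping (NT status to DOS error to errno) into one sentinel-terminated table, as an illustrative sketch:

    #include <errno.h>

    struct err_map {
        unsigned int status;
        int posix_err;
    };

    static const struct err_map smb_err_map[] = {
        { 0x8000002d, -EOPNOTSUPP },    /* NT_STATUS_STOPPED_ON_SYMLINK */
        { 0, 0 }                        /* sentinel */
    };

    static int map_smb_error(unsigned int status)
    {
        const struct err_map *m;

        for (m = smb_err_map; m->status != 0; m++)
            if (m->status == status)
                return m->posix_err;
        return -EIO;                    /* unmapped: generic I/O error */
    }
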
diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h
index 588abbb9d08c..257267367d41 100644
--- a/fs/cifs/nterr.h
+++ b/fs/cifs/nterr.h
@@ -35,8 +35,6 @@ struct nt_err_code_struct {
35extern const struct nt_err_code_struct nt_errs[]; 35extern const struct nt_err_code_struct nt_errs[];
36 36
37/* Win32 Status codes. */ 37/* Win32 Status codes. */
38
39#define STATUS_BUFFER_OVERFLOW 0x80000005
40#define STATUS_MORE_ENTRIES 0x0105 38#define STATUS_MORE_ENTRIES 0x0105
41#define ERROR_INVALID_PARAMETER 0x0057 39#define ERROR_INVALID_PARAMETER 0x0057
42#define ERROR_INSUFFICIENT_BUFFER 0x007a 40#define ERROR_INSUFFICIENT_BUFFER 0x007a
@@ -50,6 +48,13 @@ extern const struct nt_err_code_struct nt_errs[];
50#define STATUS_SOME_UNMAPPED 0x0107 48#define STATUS_SOME_UNMAPPED 0x0107
51#define STATUS_BUFFER_OVERFLOW 0x80000005 49#define STATUS_BUFFER_OVERFLOW 0x80000005
52#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a 50#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a
51#define NT_STATUS_MEDIA_CHANGED 0x8000001c
52#define NT_STATUS_END_OF_MEDIA 0x8000001e
53#define NT_STATUS_MEDIA_CHECK 0x80000020
 54#define NT_STATUS_NO_DATA_DETECTED 0x80000022
55#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d
56#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288
 57#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000289
53#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 58#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001
54#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 59#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002
55#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 60#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index c377d8065d99..49c9a4e75319 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -27,29 +27,39 @@
27#define UnknownMessage cpu_to_le32(8) 27#define UnknownMessage cpu_to_le32(8)
28 28
29/* Negotiate Flags */ 29/* Negotiate Flags */
30#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are in unicode */ 30#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */
31#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ 31#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */
32#define NTLMSSP_REQUEST_TARGET 0x04 /* Server return its auth realm */ 32#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */
33#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signature capability */ 33/* define reserved9 0x08 */
34#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ 34#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */
35#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 35#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */
36#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Sign/seal use LM session key */ 36#define NTLMSSP_NEGOTIATE_DGRAM 0x0040
37#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ 37#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */
38#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 38/* defined reserved 8 0x0100 */
39#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */
40#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */
41#define NTLMSSP_ANONYMOUS 0x0800
42#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */
39#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 43#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
40#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server on same machine */ 44#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */
41#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign for all security levels */ 45#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */
42#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 46#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000
43#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 47#define NTLMSSP_TARGET_TYPE_SERVER 0x20000
44#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 48#define NTLMSSP_TARGET_TYPE_SHARE 0x40000
45#define NTLMSSP_NEGOTIATE_NTLMV2 0x80000 49#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/
46#define NTLMSSP_REQUEST_INIT_RESP 0x100000 50/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */
47#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 51#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000
48#define NTLMSSP_REQUEST_NOT_NT_KEY 0x400000 52#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */
53#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000
49#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 54#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
50#define NTLMSSP_NEGOTIATE_128 0x20000000 55/* #define reserved4 0x1000000 */
51#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 56#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */
52#define NTLMSSP_NEGOTIATE_56 0x80000000 57/* #define reserved3 0x4000000 */
58/* #define reserved2 0x8000000 */
59/* #define reserved1 0x10000000 */
60#define NTLMSSP_NEGOTIATE_128 0x20000000
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000
53 63
54/* Although typedefs are not commonly used for structure definitions */ 64/* Although typedefs are not commonly used for structure definitions */
55/* in the Linux kernel, in this particular case they are useful */ 65/* in the Linux kernel, in this particular case they are useful */
@@ -60,32 +70,36 @@
60typedef struct _SECURITY_BUFFER { 70typedef struct _SECURITY_BUFFER {
61 __le16 Length; 71 __le16 Length;
62 __le16 MaximumLength; 72 __le16 MaximumLength;
63 __le32 Buffer; /* offset to buffer */ 73 __le32 BufferOffset; /* offset to buffer */
64} __attribute__((packed)) SECURITY_BUFFER; 74} __attribute__((packed)) SECURITY_BUFFER;
65 75
66typedef struct _NEGOTIATE_MESSAGE { 76typedef struct _NEGOTIATE_MESSAGE {
67 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; 77 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
68 __le32 MessageType; /* 1 */ 78 __le32 MessageType; /* NtLmNegotiate = 1 */
69 __le32 NegotiateFlags; 79 __le32 NegotiateFlags;
70 SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ 80 SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */
71 SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ 81 SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
82 /* SECURITY_BUFFER for version info not present since we
83 do not set the version is present flag */
72 char DomainString[0]; 84 char DomainString[0];
73 /* followed by WorkstationString */ 85 /* followed by WorkstationString */
74} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; 86} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
75 87
76typedef struct _CHALLENGE_MESSAGE { 88typedef struct _CHALLENGE_MESSAGE {
77 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; 89 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
78 __le32 MessageType; /* 2 */ 90 __le32 MessageType; /* NtLmChallenge = 2 */
79 SECURITY_BUFFER TargetName; 91 SECURITY_BUFFER TargetName;
80 __le32 NegotiateFlags; 92 __le32 NegotiateFlags;
81 __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; 93 __u8 Challenge[CIFS_CRYPTO_KEY_SIZE];
82 __u8 Reserved[8]; 94 __u8 Reserved[8];
83 SECURITY_BUFFER TargetInfoArray; 95 SECURITY_BUFFER TargetInfoArray;
96 /* SECURITY_BUFFER for version info not present since we
97 do not set the version is present flag */
84} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; 98} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE;
85 99
86typedef struct _AUTHENTICATE_MESSAGE { 100typedef struct _AUTHENTICATE_MESSAGE {
87 __u8 Signature[sizeof (NTLMSSP_SIGNATURE)]; 101 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
 88 __le32 MessageType; /* 3 */ 102 __le32 MessageType; /* NtLmAuthenticate = 3 */
89 SECURITY_BUFFER LmChallengeResponse; 103 SECURITY_BUFFER LmChallengeResponse;
90 SECURITY_BUFFER NtChallengeResponse; 104 SECURITY_BUFFER NtChallengeResponse;
91 SECURITY_BUFFER DomainName; 105 SECURITY_BUFFER DomainName;
@@ -93,5 +107,7 @@ typedef struct _AUTHENTICATE_MESSAGE {
93 SECURITY_BUFFER WorkstationName; 107 SECURITY_BUFFER WorkstationName;
94 SECURITY_BUFFER SessionKey; 108 SECURITY_BUFFER SessionKey;
95 __le32 NegotiateFlags; 109 __le32 NegotiateFlags;
110 /* SECURITY_BUFFER for version info not present since we
111 do not set the version is present flag */
96 char UserString[0]; 112 char UserString[0];
97} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; 113} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
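
With the flag definitions cleaned up above, a client builds its negotiate word by OR-ing the capabilities it wants. A user-space sketch with a plausible selection of flags (not the exact set the CIFS client sends):

    #include <stdint.h>
    #include <stdio.h>

    #define NEG_UNICODE     0x00000001u  /* text strings are unicode */
    #define NEG_NTLM        0x00000200u  /* NTLM authentication */
    #define NEG_ALWAYS_SIGN 0x00008000u  /* sign at all security levels */
    #define NEG_128         0x20000000u  /* 128-bit session security */

    int main(void)
    {
        uint32_t flags = NEG_UNICODE | NEG_NTLM | NEG_ALWAYS_SIGN | NEG_128;

        printf("negotiate flags = 0x%08x\n", flags);  /* 0x20008201 */
        return 0;
    }
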
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c2c01ff4c32c..964e097c8203 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -31,6 +31,13 @@
31#include "cifs_fs_sb.h" 31#include "cifs_fs_sb.h"
32#include "cifsfs.h" 32#include "cifsfs.h"
33 33
34/*
 35 * To be safe, allocate extra room for UCS to UTF-8 conversion: the rare
 36 * characters that expand to long multibyte UTF-8 sequences need more
 37 * space in the target buffer.
38 */
39#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
40
34#ifdef CONFIG_CIFS_DEBUG2 41#ifdef CONFIG_CIFS_DEBUG2
35static void dump_cifs_file_struct(struct file *file, char *label) 42static void dump_cifs_file_struct(struct file *file, char *label)
36{ 43{
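
The 4x factor in UNICODE_NAME_MAX above bounds the expansion when each 16-bit code unit of a wire name becomes a multibyte UTF-8 sequence. A sketch of the per-unit worst case (BMP characters need up to three bytes; the extra margin also absorbs surrogate pairs and NLS quirks):

    /* UTF-8 bytes needed for one UCS-2 code unit in the BMP. */
    static int utf8_len_of_ucs2(unsigned int cu)
    {
        if (cu < 0x80)
            return 1;       /* ASCII */
        if (cu < 0x800)
            return 2;
        return 3;           /* rest of the basic multilingual plane */
    }
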
@@ -239,6 +246,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
239 if (atomic_read(&cifsInfo->inUse) == 0) 246 if (atomic_read(&cifsInfo->inUse) == 0)
240 atomic_set(&cifsInfo->inUse, 1); 247 atomic_set(&cifsInfo->inUse, 1);
241 248
249 cifsInfo->server_eof = end_of_file;
242 spin_lock(&tmp_inode->i_lock); 250 spin_lock(&tmp_inode->i_lock);
243 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 251 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
244 /* can not safely change the file size here if the 252 /* can not safely change the file size here if the
@@ -375,6 +383,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
375 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); 383 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
376 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); 384 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
377 385
386 cifsInfo->server_eof = end_of_file;
378 spin_lock(&tmp_inode->i_lock); 387 spin_lock(&tmp_inode->i_lock);
379 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 388 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
380 /* can not safely change the file size here if the 389 /* can not safely change the file size here if the
@@ -436,6 +445,38 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
436 } 445 }
437} 446}
438 447
448/* BB eventually need to add the following helper function to
449 resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
450 we try to do FindFirst on (NTFS) directory symlinks */
451/*
452int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
453 int xid)
454{
455 __u16 fid;
456 int len;
457 int oplock = 0;
458 int rc;
459 struct cifsTconInfo *ptcon = cifs_sb->tcon;
460 char *tmpbuffer;
461
462 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
463 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
464 cifs_sb->local_nls,
465 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
466 if (!rc) {
 467 tmpbuffer = kmalloc(maxpath, GFP_KERNEL);
468 rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
469 tmpbuffer,
470 maxpath -1,
471 fid,
472 cifs_sb->local_nls);
473 if (CIFSSMBClose(xid, ptcon, fid)) {
 474 cFYI(1, ("Error closing temporary reparsepoint open"));
475 }
476 }
477}
478 */
479
439static int initiate_cifs_search(const int xid, struct file *file) 480static int initiate_cifs_search(const int xid, struct file *file)
440{ 481{
441 int rc = 0; 482 int rc = 0;
@@ -491,7 +532,10 @@ ffirst_retry:
491 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); 532 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
492 if (rc == 0) 533 if (rc == 0)
493 cifsFile->invalidHandle = false; 534 cifsFile->invalidHandle = false;
494 if ((rc == -EOPNOTSUPP) && 535 /* BB add following call to handle readdir on new NTFS symlink errors
536 else if STATUS_STOPPED_ON_SYMLINK
537 call get_symlink_reparse_path and retry with new path */
538 else if ((rc == -EOPNOTSUPP) &&
495 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { 539 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
496 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 540 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
497 goto ffirst_retry; 541 goto ffirst_retry;
@@ -820,7 +864,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
820/* inode num, inode type and filename returned */ 864/* inode num, inode type and filename returned */
821static int cifs_get_name_from_search_buf(struct qstr *pqst, 865static int cifs_get_name_from_search_buf(struct qstr *pqst,
822 char *current_entry, __u16 level, unsigned int unicode, 866 char *current_entry, __u16 level, unsigned int unicode,
823 struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) 867 struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
824{ 868{
825 int rc = 0; 869 int rc = 0;
826 unsigned int len = 0; 870 unsigned int len = 0;
@@ -840,7 +884,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
840 len = strnlen(filename, PATH_MAX); 884 len = strnlen(filename, PATH_MAX);
841 } 885 }
842 886
843 *pinum = pFindData->UniqueId; 887 *pinum = le64_to_cpu(pFindData->UniqueId);
844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 888 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
845 FILE_DIRECTORY_INFO *pFindData = 889 FILE_DIRECTORY_INFO *pFindData =
846 (FILE_DIRECTORY_INFO *)current_entry; 890 (FILE_DIRECTORY_INFO *)current_entry;
@@ -856,7 +900,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
856 (SEARCH_ID_FULL_DIR_INFO *)current_entry; 900 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
857 filename = &pFindData->FileName[0]; 901 filename = &pFindData->FileName[0];
858 len = le32_to_cpu(pFindData->FileNameLength); 902 len = le32_to_cpu(pFindData->FileNameLength);
859 *pinum = pFindData->UniqueId; 903 *pinum = le64_to_cpu(pFindData->UniqueId);
860 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { 904 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
861 FILE_BOTH_DIRECTORY_INFO *pFindData = 905 FILE_BOTH_DIRECTORY_INFO *pFindData =
862 (FILE_BOTH_DIRECTORY_INFO *)current_entry; 906 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
@@ -879,14 +923,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
879 } 923 }
880 924
881 if (unicode) { 925 if (unicode) {
882 /* BB fixme - test with long names */ 926 pqst->len = cifs_from_ucs2((char *) pqst->name,
883 /* Note converted filename can be longer than in unicode */ 927 (__le16 *) filename,
884 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 928 UNICODE_NAME_MAX,
885 pqst->len = cifs_convertUCSpath((char *)pqst->name, 929 min(len, max_len), nlt,
886 (__le16 *)filename, len/2, nlt); 930 cifs_sb->mnt_cifs_flags &
887 else 931 CIFS_MOUNT_MAP_SPECIAL_CHR);
888 pqst->len = cifs_strfromUCS_le((char *)pqst->name,
889 (__le16 *)filename, len/2, nlt);
890 } else { 932 } else {
891 pqst->name = filename; 933 pqst->name = filename;
892 pqst->len = len; 934 pqst->len = len;
@@ -896,8 +938,8 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
896 return rc; 938 return rc;
897} 939}
898 940
899static int cifs_filldir(char *pfindEntry, struct file *file, 941static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
900 filldir_t filldir, void *direntry, char *scratch_buf, int max_len) 942 void *direntry, char *scratch_buf, unsigned int max_len)
901{ 943{
902 int rc = 0; 944 int rc = 0;
903 struct qstr qstring; 945 struct qstr qstring;
@@ -994,7 +1036,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
994 int num_to_fill = 0; 1036 int num_to_fill = 0;
995 char *tmp_buf = NULL; 1037 char *tmp_buf = NULL;
996 char *end_of_smb; 1038 char *end_of_smb;
997 int max_len; 1039 unsigned int max_len;
998 1040
999 xid = GetXid(); 1041 xid = GetXid();
1000 1042
@@ -1068,11 +1110,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
1068 cifsFile->srch_inf.ntwrk_buf_start); 1110 cifsFile->srch_inf.ntwrk_buf_start);
1069 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 1111 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
1070 1112
1071 /* To be safe - for UCS to UTF-8 with strings loaded 1113 tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
1072 with the rare long characters alloc more to account for
1073 such multibyte target UTF-8 characters. cifs_unicode.c,
1074 which actually does the conversion, has the same limit */
1075 tmp_buf = kmalloc((2 * NAME_MAX) + 4, GFP_KERNEL);
1076 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 1114 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
1077 if (current_entry == NULL) { 1115 if (current_entry == NULL) {
1078 /* evaluate whether this case is an error */ 1116 /* evaluate whether this case is an error */
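The readdir.c hunks above replace the old ad-hoc scratch sizing, (2 * NAME_MAX) + 4, with UNICODE_NAME_MAX, so cifs_readdir() and the conversion helpers in cifs_unicode.c share one worst-case bound, and cifs_from_ucs2() folds the MAP_SPECIAL_CHR decision into a single call. A sketch of the sizing argument behind the shared constant (its real definition lives in fs/cifs/cifs_unicode.h; treat the exact figure below as recalled, not quoted):

    /*
     * Worst-case UTF-16 -> UTF-8 growth: a BMP character is one UTF-16
     * code unit and at most 3 UTF-8 bytes; a supplementary character is
     * two code units (a surrogate pair) and exactly 4 UTF-8 bytes.  So
     * 4 bytes per source code unit, plus a 2-byte terminator, is a safe
     * over-estimate for a NAME_MAX-unit filename.
     */
    #define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
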
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5c68b4282be9..897a052270f9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * SMB/CIFS session setup handling routines 4 * SMB/CIFS session setup handling routines
5 * 5 *
6 * Copyright (c) International Business Machines Corp., 2006, 2007 6 * Copyright (c) International Business Machines Corp., 2006, 2009
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
@@ -111,7 +111,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
111get_vc_num_exit: 111get_vc_num_exit:
112 write_unlock(&cifs_tcp_ses_lock); 112 write_unlock(&cifs_tcp_ses_lock);
113 113
114 return le16_to_cpu(vcnum); 114 return cpu_to_le16(vcnum);
115} 115}
116 116
117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) 117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
@@ -277,85 +277,51 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
277 *pbcc_area = bcc_ptr; 277 *pbcc_area = bcc_ptr;
278} 278}
279 279
280static int decode_unicode_ssetup(char **pbcc_area, int bleft, 280static void
281 struct cifsSesInfo *ses, 281decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
282 const struct nls_table *nls_cp) 282 const struct nls_table *nls_cp)
283{ 283{
284 int rc = 0; 284 int len;
285 int words_left, len;
286 char *data = *pbcc_area; 285 char *data = *pbcc_area;
287 286
288
289
290 cFYI(1, ("bleft %d", bleft)); 287 cFYI(1, ("bleft %d", bleft));
291 288
292 289 /*
293 /* SMB header is unaligned, so cifs servers word align start of 290 * Windows servers do not always double null terminate their final
294 Unicode strings */ 291 * Unicode string. Check to see if there are an uneven number of bytes
295 data++; 292 * left. If so, then add an extra NULL pad byte to the end of the
296 bleft--; /* Windows servers do not always double null terminate 293 * response.
297 their final Unicode string - in which case we 294 *
298 now will not attempt to decode the byte of junk 295 * See section 2.7.2 in "Implementing CIFS" for details
299 which follows it */ 296 */
300 297 if (bleft % 2) {
301 words_left = bleft / 2; 298 data[bleft] = 0;
302 299 ++bleft;
303 /* save off server operating system */ 300 }
304 len = UniStrnlen((wchar_t *) data, words_left);
305
306/* We look for obvious messed up bcc or strings in response so we do not go off
307 the end since (at least) WIN2K and Windows XP have a major bug in not null
308 terminating last Unicode string in response */
309 if (len >= words_left)
310 return rc;
311 301
312 kfree(ses->serverOS); 302 kfree(ses->serverOS);
313 /* UTF-8 string will not grow more than four times as big as UCS-16 */ 303 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
314 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); 304 cFYI(1, ("serverOS=%s", ses->serverOS));
315 if (ses->serverOS != NULL) 305 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
316 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); 306 data += len;
317 data += 2 * (len + 1); 307 bleft -= len;
318 words_left -= len + 1; 308 if (bleft <= 0)
319 309 return;
320 /* save off server network operating system */
321 len = UniStrnlen((wchar_t *) data, words_left);
322
323 if (len >= words_left)
324 return rc;
325 310
326 kfree(ses->serverNOS); 311 kfree(ses->serverNOS);
327 ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); 312 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
328 if (ses->serverNOS != NULL) { 313 cFYI(1, ("serverNOS=%s", ses->serverNOS));
329 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, 314 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
330 nls_cp); 315 data += len;
331 if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) { 316 bleft -= len;
332 cFYI(1, ("NT4 server")); 317 if (bleft <= 0)
333 ses->flags |= CIFS_SES_NT4; 318 return;
334 }
335 }
336 data += 2 * (len + 1);
337 words_left -= len + 1;
338
339 /* save off server domain */
340 len = UniStrnlen((wchar_t *) data, words_left);
341
342 if (len > words_left)
343 return rc;
344 319
345 kfree(ses->serverDomain); 320 kfree(ses->serverDomain);
346 ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ 321 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
347 if (ses->serverDomain != NULL) { 322 cFYI(1, ("serverDomain=%s", ses->serverDomain));
348 cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
349 nls_cp);
350 ses->serverDomain[2*len] = 0;
351 ses->serverDomain[(2*len) + 1] = 0;
352 }
353 data += 2 * (len + 1);
354 words_left -= len + 1;
355 323
356 cFYI(1, ("words left: %d", words_left)); 324 return;
357
358 return rc;
359} 325}
360 326
361static int decode_ascii_ssetup(char **pbcc_area, int bleft, 327static int decode_ascii_ssetup(char **pbcc_area, int bleft,
@@ -412,6 +378,186 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
412 return rc; 378 return rc;
413} 379}
414 380
381static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
382 struct cifsSesInfo *ses)
383{
384 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
385
386 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
387 cERROR(1, ("challenge blob len %d too small", blob_len));
388 return -EINVAL;
389 }
390
391 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
392 cERROR(1, ("blob signature incorrect %s", pblob->Signature));
393 return -EINVAL;
394 }
395 if (pblob->MessageType != NtLmChallenge) {
396 cERROR(1, ("Incorrect message type %d", pblob->MessageType));
397 return -EINVAL;
398 }
399
400 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
401 /* BB we could decode pblob->NegotiateFlags; some may be useful */
402 /* In particular we can examine sign flags */
403 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
404 we must set the MIC field of the AUTHENTICATE_MESSAGE */
405
406 return 0;
407}
408
409#ifdef CONFIG_CIFS_EXPERIMENTAL
410/* BB Move to ntlmssp.c eventually */
411
412/* We do not malloc the blob, it is passed in pbuffer, because
413 it is fixed size, and small, making this approach cleaner */
414static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
415 struct cifsSesInfo *ses)
416{
417 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
418 __u32 flags;
419
420 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
421 sec_blob->MessageType = NtLmNegotiate;
422
423 /* BB is NTLMV2 session security format easier to use here? */
424 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
425 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
426 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
427 if (ses->server->secMode &
428 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
429 flags |= NTLMSSP_NEGOTIATE_SIGN;
430 if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
431 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
432
433 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
434
435 sec_blob->WorkstationName.BufferOffset = 0;
436 sec_blob->WorkstationName.Length = 0;
437 sec_blob->WorkstationName.MaximumLength = 0;
438
439 /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */
440 sec_blob->DomainName.BufferOffset = 0;
441 sec_blob->DomainName.Length = 0;
442 sec_blob->DomainName.MaximumLength = 0;
443}
444
445/* We do not malloc the blob, it is passed in pbuffer, because its
446 maximum possible size is fixed and small, making this approach cleaner.
447 This function returns the length of the data in the blob */
448static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
449 struct cifsSesInfo *ses,
450 const struct nls_table *nls_cp, int first)
451{
452 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
453 __u32 flags;
454 unsigned char *tmp;
455 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
456
457 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
458 sec_blob->MessageType = NtLmAuthenticate;
459
460 flags = NTLMSSP_NEGOTIATE_56 |
461 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
462 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
463 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
464 if (ses->server->secMode &
465 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
466 flags |= NTLMSSP_NEGOTIATE_SIGN;
467 if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
468 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
469
470 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
471 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
472
473 sec_blob->LmChallengeResponse.BufferOffset =
474 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
475 sec_blob->LmChallengeResponse.Length = 0;
476 sec_blob->LmChallengeResponse.MaximumLength = 0;
477
478 /* calculate session key, BB what about adding similar ntlmv2 path? */
479 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
480 if (first)
481 cifs_calculate_mac_key(&ses->server->mac_signing_key,
482 ntlm_session_key, ses->password);
483
484 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
485 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
486 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.MaximumLength =
488 cpu_to_le16(CIFS_SESS_KEY_SIZE);
489
490 tmp += CIFS_SESS_KEY_SIZE;
491
492 if (ses->domainName == NULL) {
493 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
494 sec_blob->DomainName.Length = 0;
495 sec_blob->DomainName.MaximumLength = 0;
496 tmp += 2;
497 } else {
498 int len;
499 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
500 MAX_USERNAME_SIZE, nls_cp);
501 len *= 2; /* unicode is 2 bytes each */
502 len += 2; /* trailing null */
503 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
504 sec_blob->DomainName.Length = cpu_to_le16(len);
505 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
506 tmp += len;
507 }
508
509 if (ses->userName == NULL) {
510 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
511 sec_blob->UserName.Length = 0;
512 sec_blob->UserName.MaximumLength = 0;
513 tmp += 2;
514 } else {
515 int len;
516 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
517 MAX_USERNAME_SIZE, nls_cp);
518 len *= 2; /* unicode is 2 bytes each */
519 len += 2; /* trailing null */
520 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
521 sec_blob->UserName.Length = cpu_to_le16(len);
522 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
523 tmp += len;
524 }
525
526 sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer);
527 sec_blob->WorkstationName.Length = 0;
528 sec_blob->WorkstationName.MaximumLength = 0;
529 tmp += 2;
530
531 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
532 sec_blob->SessionKey.Length = 0;
533 sec_blob->SessionKey.MaximumLength = 0;
534 return tmp - pbuffer;
535}
536
537
538static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
539 struct cifsSesInfo *ses)
540{
541 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
542 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
543
544 return;
545}
546
547static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
548 struct cifsSesInfo *ses,
549 const struct nls_table *nls, int first_time)
550{
551 int bloblen;
552
553 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
554 first_time);
555 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
556
557 return bloblen;
558}
559#endif
560
415int 561int
416CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 562CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
417 const struct nls_table *nls_cp) 563 const struct nls_table *nls_cp)
@@ -430,6 +576,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
430 __u16 action; 576 __u16 action;
431 int bytes_remaining; 577 int bytes_remaining;
432 struct key *spnego_key = NULL; 578 struct key *spnego_key = NULL;
579 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
433 580
434 if (ses == NULL) 581 if (ses == NULL)
435 return -EINVAL; 582 return -EINVAL;
@@ -437,6 +584,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
437 type = ses->server->secType; 584 type = ses->server->secType;
438 585
439 cFYI(1, ("sess setup type %d", type)); 586 cFYI(1, ("sess setup type %d", type));
587ssetup_ntlmssp_authenticate:
588 if (phase == NtLmChallenge)
589 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
590
440 if (type == LANMAN) { 591 if (type == LANMAN) {
441#ifndef CONFIG_CIFS_WEAK_PW_HASH 592#ifndef CONFIG_CIFS_WEAK_PW_HASH
442 /* LANMAN and plaintext are less secure and off by default. 593 /* LANMAN and plaintext are less secure and off by default.
@@ -650,9 +801,53 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
650 goto ssetup_exit; 801 goto ssetup_exit;
651#endif /* CONFIG_CIFS_UPCALL */ 802#endif /* CONFIG_CIFS_UPCALL */
652 } else { 803 } else {
804#ifdef CONFIG_CIFS_EXPERIMENTAL
805 if ((experimEnabled > 1) && (type == RawNTLMSSP)) {
806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
807 cERROR(1, ("NTLMSSP requires Unicode support"));
808 rc = -ENOSYS;
809 goto ssetup_exit;
810 }
811
812 cFYI(1, ("ntlmssp session setup phase %d", phase));
813 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
814 capabilities |= CAP_EXTENDED_SECURITY;
815 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
816 if (phase == NtLmNegotiate) {
817 setup_ntlmssp_neg_req(pSMB, ses);
818 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
819 } else if (phase == NtLmAuthenticate) {
820 int blob_len;
821 blob_len = setup_ntlmssp_auth_req(pSMB, ses,
822 nls_cp,
823 first_time);
824 iov[1].iov_len = blob_len;
825 /* Make sure that we tell the server that we
826 are using the uid that it just gave us back
827 on the response (challenge) */
828 smb_buf->Uid = ses->Suid;
829 } else {
830 cERROR(1, ("invalid phase %d", phase));
831 rc = -ENOSYS;
832 goto ssetup_exit;
833 }
834 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
835 /* unicode strings must be word aligned */
836 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
837 *bcc_ptr = 0;
838 bcc_ptr++;
839 }
840 unicode_oslm_strings(&bcc_ptr, nls_cp);
841 } else {
842 cERROR(1, ("secType %d not supported!", type));
843 rc = -ENOSYS;
844 goto ssetup_exit;
845 }
846#else
653 cERROR(1, ("secType %d not supported!", type)); 847 cERROR(1, ("secType %d not supported!", type));
654 rc = -ENOSYS; 848 rc = -ENOSYS;
655 goto ssetup_exit; 849 goto ssetup_exit;
850#endif
656 } 851 }
657 852
658 iov[2].iov_base = str_area; 853 iov[2].iov_base = str_area;
@@ -668,12 +863,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
668 /* SMB request buf freed in SendReceive2 */ 863 /* SMB request buf freed in SendReceive2 */
669 864
670 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc)); 865 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
671 if (rc)
672 goto ssetup_exit;
673 866
674 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 867 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
675 smb_buf = (struct smb_hdr *)iov[0].iov_base; 868 smb_buf = (struct smb_hdr *)iov[0].iov_base;
676 869
870 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
871 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
872 if (phase != NtLmNegotiate) {
873 cERROR(1, ("Unexpected more processing error"));
874 goto ssetup_exit;
875 }
876 /* NTLMSSP Negotiate sent now processing challenge (response) */
877 phase = NtLmChallenge; /* process ntlmssp challenge */
878 rc = 0; /* MORE_PROC rc is not an error here, but expected */
879 }
880 if (rc)
881 goto ssetup_exit;
882
677 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 883 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
678 rc = -EIO; 884 rc = -EIO;
679 cERROR(1, ("bad word count %d", smb_buf->WordCount)); 885 cERROR(1, ("bad word count %d", smb_buf->WordCount));
@@ -692,22 +898,33 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
692 if (smb_buf->WordCount == 4) { 898 if (smb_buf->WordCount == 4) {
693 __u16 blob_len; 899 __u16 blob_len;
694 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 900 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
695 bcc_ptr += blob_len;
696 if (blob_len > bytes_remaining) { 901 if (blob_len > bytes_remaining) {
697 cERROR(1, ("bad security blob length %d", blob_len)); 902 cERROR(1, ("bad security blob length %d", blob_len));
698 rc = -EINVAL; 903 rc = -EINVAL;
699 goto ssetup_exit; 904 goto ssetup_exit;
700 } 905 }
906 if (phase == NtLmChallenge) {
907 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
908 /* now goto beginning for ntlmssp authenticate phase */
909 if (rc)
910 goto ssetup_exit;
911 }
912 bcc_ptr += blob_len;
701 bytes_remaining -= blob_len; 913 bytes_remaining -= blob_len;
702 } 914 }
703 915
704 /* BB check if Unicode and decode strings */ 916 /* BB check if Unicode and decode strings */
705 if (smb_buf->Flags2 & SMBFLG2_UNICODE) 917 if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
706 rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, 918 /* unicode string area must be word-aligned */
707 ses, nls_cp); 919 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
708 else 920 ++bcc_ptr;
921 --bytes_remaining;
922 }
923 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
924 } else {
709 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, 925 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
710 ses, nls_cp); 926 ses, nls_cp);
927 }
711 928
712ssetup_exit: 929ssetup_exit:
713 if (spnego_key) { 930 if (spnego_key) {
@@ -721,5 +938,9 @@ ssetup_exit:
721 } else if (resp_buf_type == CIFS_LARGE_BUFFER) 938 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
722 cifs_buf_release(iov[0].iov_base); 939 cifs_buf_release(iov[0].iov_base);
723 940
941 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
942 if ((phase == NtLmChallenge) && (rc == 0))
943 goto ssetup_ntlmssp_authenticate;
944
724 return rc; 945 return rc;
725} 946}
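The sess.c changes turn CIFS_SessSetup() into a small state machine for RawNTLMSSP under CONFIG_CIFS_EXPERIMENTAL: send a Negotiate blob, treat the server's NT_STATUS_MORE_PROCESSING_REQUIRED reply as carrying the Challenge rather than as an error, then jump back (via the ssetup_ntlmssp_authenticate label) to send the Authenticate blob. A condensed sketch of that control flow, rewritten as an explicit loop with a placeholder send_setup_request() standing in for the build/send/parse steps:

    __le32 phase = NtLmNegotiate;

    for (;;) {
            status = send_setup_request(ses, phase); /* Negotiate or Auth blob */

            if (phase == NtLmNegotiate &&
                status == NT_STATUS_MORE_PROCESSING_REQUIRED) {
                    /* reply carries the server challenge, not an error */
                    decode_ntlmssp_challenge(blob, blob_len, ses);
                    phase = NtLmAuthenticate;
                    continue;            /* second SESSION_SETUP round trip */
            }
            break;                       /* success, or a real error */
    }

The real code also pads the packet so the Unicode string area stays word-aligned on both the send and receive sides, since the security blob length can be odd.
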
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index 7f50e8577c1c..c5084d27db7c 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -110,6 +110,7 @@
110 110
111/* Below errors are used internally (do not come over the wire) for passthrough 111/* Below errors are used internally (do not come over the wire) for passthrough
112 from STATUS codes to POSIX only */ 112 from STATUS codes to POSIX only */
113#define ERRsymlink 0xFFFD
113#define ErrTooManyLinks 0xFFFE 114#define ErrTooManyLinks 0xFFFE
114 115
115/* Following error codes may be generated with the ERRSRV error class.*/ 116/* Following error codes may be generated with the ERRSRV error class.*/
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..681ed81e6be0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
51#include <linux/poll.h> 51#include <linux/poll.h>
52#include <linux/mm.h> 52#include <linux/mm.h>
53#include <linux/eventpoll.h> 53#include <linux/eventpoll.h>
54#include <linux/fs_struct.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -180,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
180 struct compat_stat __user *statbuf) 181 struct compat_stat __user *statbuf)
181{ 182{
182 struct kstat stat; 183 struct kstat stat;
183 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 184 int error;
184 185
185 if (!error) 186 error = vfs_stat(filename, &stat);
186 error = cp_compat_stat(&stat, statbuf); 187 if (error)
187 return error; 188 return error;
189 return cp_compat_stat(&stat, statbuf);
188} 190}
189 191
190asmlinkage long compat_sys_newlstat(char __user * filename, 192asmlinkage long compat_sys_newlstat(char __user * filename,
191 struct compat_stat __user *statbuf) 193 struct compat_stat __user *statbuf)
192{ 194{
193 struct kstat stat; 195 struct kstat stat;
194 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 196 int error;
195 197
196 if (!error) 198 error = vfs_lstat(filename, &stat);
197 error = cp_compat_stat(&stat, statbuf); 199 if (error)
198 return error; 200 return error;
201 return cp_compat_stat(&stat, statbuf);
199} 202}
200 203
201#ifndef __ARCH_WANT_STAT64 204#ifndef __ARCH_WANT_STAT64
@@ -203,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
203 struct compat_stat __user *statbuf, int flag) 206 struct compat_stat __user *statbuf, int flag)
204{ 207{
205 struct kstat stat; 208 struct kstat stat;
206 int error = -EINVAL; 209 int error;
207
208 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
209 goto out;
210
211 if (flag & AT_SYMLINK_NOFOLLOW)
212 error = vfs_lstat_fd(dfd, filename, &stat);
213 else
214 error = vfs_stat_fd(dfd, filename, &stat);
215
216 if (!error)
217 error = cp_compat_stat(&stat, statbuf);
218 210
219out: 211 error = vfs_fstatat(dfd, filename, &stat, flag);
220 return error; 212 if (error)
213 return error;
214 return cp_compat_stat(&stat, statbuf);
221} 215}
222#endif 216#endif
223 217
@@ -1195,16 +1189,12 @@ out:
1195 return ret; 1189 return ret;
1196} 1190}
1197 1191
1198asmlinkage ssize_t 1192static size_t compat_readv(struct file *file,
1199compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1193 const struct compat_iovec __user *vec,
1194 unsigned long vlen, loff_t *pos)
1200{ 1195{
1201 struct file *file;
1202 ssize_t ret = -EBADF; 1196 ssize_t ret = -EBADF;
1203 1197
1204 file = fget(fd);
1205 if (!file)
1206 return -EBADF;
1207
1208 if (!(file->f_mode & FMODE_READ)) 1198 if (!(file->f_mode & FMODE_READ))
1209 goto out; 1199 goto out;
1210 1200
@@ -1212,25 +1202,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1212 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1202 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1213 goto out; 1203 goto out;
1214 1204
1215 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1205 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1216 1206
1217out: 1207out:
1218 if (ret > 0) 1208 if (ret > 0)
1219 add_rchar(current, ret); 1209 add_rchar(current, ret);
1220 inc_syscr(current); 1210 inc_syscr(current);
1221 fput(file);
1222 return ret; 1211 return ret;
1223} 1212}
1224 1213
1225asmlinkage ssize_t 1214asmlinkage ssize_t
1226compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1215compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1216 unsigned long vlen)
1227{ 1217{
1228 struct file *file; 1218 struct file *file;
1229 ssize_t ret = -EBADF; 1219 int fput_needed;
1220 ssize_t ret;
1230 1221
1231 file = fget(fd); 1222 file = fget_light(fd, &fput_needed);
1223 if (!file)
1224 return -EBADF;
1225 ret = compat_readv(file, vec, vlen, &file->f_pos);
1226 fput_light(file, fput_needed);
1227 return ret;
1228}
1229
1230asmlinkage ssize_t
1231compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1232 unsigned long vlen, u32 pos_low, u32 pos_high)
1233{
1234 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1235 struct file *file;
1236 int fput_needed;
1237 ssize_t ret;
1238
1239 if (pos < 0)
1240 return -EINVAL;
1241 file = fget_light(fd, &fput_needed);
1232 if (!file) 1242 if (!file)
1233 return -EBADF; 1243 return -EBADF;
1244 ret = compat_readv(file, vec, vlen, &pos);
1245 fput_light(file, fput_needed);
1246 return ret;
1247}
1248
1249static size_t compat_writev(struct file *file,
1250 const struct compat_iovec __user *vec,
1251 unsigned long vlen, loff_t *pos)
1252{
1253 ssize_t ret = -EBADF;
1254
1234 if (!(file->f_mode & FMODE_WRITE)) 1255 if (!(file->f_mode & FMODE_WRITE))
1235 goto out; 1256 goto out;
1236 1257
@@ -1238,13 +1259,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1238 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1259 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1239 goto out; 1260 goto out;
1240 1261
1241 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1262 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1242 1263
1243out: 1264out:
1244 if (ret > 0) 1265 if (ret > 0)
1245 add_wchar(current, ret); 1266 add_wchar(current, ret);
1246 inc_syscw(current); 1267 inc_syscw(current);
1247 fput(file); 1268 return ret;
1269}
1270
1271asmlinkage ssize_t
1272compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1273 unsigned long vlen)
1274{
1275 struct file *file;
1276 int fput_needed;
1277 ssize_t ret;
1278
1279 file = fget_light(fd, &fput_needed);
1280 if (!file)
1281 return -EBADF;
1282 ret = compat_writev(file, vec, vlen, &file->f_pos);
1283 fput_light(file, fput_needed);
1284 return ret;
1285}
1286
1287asmlinkage ssize_t
1288compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1289 unsigned long vlen, u32 pos_low, u32 pos_high)
1290{
1291 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1292 struct file *file;
1293 int fput_needed;
1294 ssize_t ret;
1295
1296 if (pos < 0)
1297 return -EINVAL;
1298 file = fget_light(fd, &fput_needed);
1299 if (!file)
1300 return -EBADF;
1301 ret = compat_writev(file, vec, vlen, &pos);
1302 fput_light(file, fput_needed);
1248 return ret; 1303 return ret;
1249} 1304}
1250 1305
@@ -1421,6 +1476,7 @@ int compat_do_execve(char * filename,
1421 struct linux_binprm *bprm; 1476 struct linux_binprm *bprm;
1422 struct file *file; 1477 struct file *file;
1423 struct files_struct *displaced; 1478 struct files_struct *displaced;
1479 bool clear_in_exec;
1424 int retval; 1480 int retval;
1425 1481
1426 retval = unshare_files(&displaced); 1482 retval = unshare_files(&displaced);
@@ -1441,12 +1497,16 @@ int compat_do_execve(char * filename,
1441 bprm->cred = prepare_exec_creds(); 1497 bprm->cred = prepare_exec_creds();
1442 if (!bprm->cred) 1498 if (!bprm->cred)
1443 goto out_unlock; 1499 goto out_unlock;
1444 check_unsafe_exec(bprm); 1500
1501 retval = check_unsafe_exec(bprm);
1502 if (retval < 0)
1503 goto out_unlock;
1504 clear_in_exec = retval;
1445 1505
1446 file = open_exec(filename); 1506 file = open_exec(filename);
1447 retval = PTR_ERR(file); 1507 retval = PTR_ERR(file);
1448 if (IS_ERR(file)) 1508 if (IS_ERR(file))
1449 goto out_unlock; 1509 goto out_unmark;
1450 1510
1451 sched_exec(); 1511 sched_exec();
1452 1512
@@ -1488,6 +1548,7 @@ int compat_do_execve(char * filename,
1488 goto out; 1548 goto out;
1489 1549
1490 /* execve succeeded */ 1550 /* execve succeeded */
1551 current->fs->in_exec = 0;
1491 current->in_execve = 0; 1552 current->in_execve = 0;
1492 mutex_unlock(&current->cred_exec_mutex); 1553 mutex_unlock(&current->cred_exec_mutex);
1493 acct_update_integrals(current); 1554 acct_update_integrals(current);
@@ -1506,6 +1567,10 @@ out_file:
1506 fput(bprm->file); 1567 fput(bprm->file);
1507 } 1568 }
1508 1569
1570out_unmark:
1571 if (clear_in_exec)
1572 current->fs->in_exec = 0;
1573
1509out_unlock: 1574out_unlock:
1510 current->in_execve = 0; 1575 current->in_execve = 0;
1511 mutex_unlock(&current->cred_exec_mutex); 1576 mutex_unlock(&current->cred_exec_mutex);
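compat_sys_preadv()/compat_sys_pwritev() rebuild the 64-bit offset from two 32-bit arguments, pos = ((loff_t)pos_high << 32) | pos_low, because a 32-bit ABI cannot pass a loff_t in a single register; the fget_light()/fput_light() pair likewise avoids the full fget() reference bump on the fast path. From userspace the register split is invisible. A minimal, runnable example of the vectored positional read these entry points service (the file name is arbitrary):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
            char hdr[4], body[12];
            struct iovec iov[2] = {
                    { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
                    { .iov_base = body, .iov_len = sizeof(body) },
            };
            int fd = open("/etc/hostname", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* scatter-read 16 bytes from offset 0; f_pos is not moved */
            ssize_t n = preadv(fd, iov, 2, 0);
            printf("preadv returned %zd\n", n);
            close(fd);
            return 0;
    }
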
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93b..b83f6bcfa51a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
@@ -58,7 +58,6 @@
58#include <linux/i2c.h> 58#include <linux/i2c.h>
59#include <linux/i2c-dev.h> 59#include <linux/i2c-dev.h>
60#include <linux/atalk.h> 60#include <linux/atalk.h>
61#include <linux/loop.h>
62 61
63#include <net/bluetooth/bluetooth.h> 62#include <net/bluetooth/bluetooth.h>
64#include <net/bluetooth/hci.h> 63#include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
68#include <linux/gigaset_dev.h> 67#include <linux/gigaset_dev.h>
69 68
70#ifdef CONFIG_BLOCK 69#ifdef CONFIG_BLOCK
70#include <linux/loop.h>
71#include <scsi/scsi.h> 71#include <scsi/scsi.h>
72#include <scsi/scsi_ioctl.h> 72#include <scsi/scsi_ioctl.h>
73#include <scsi/sg.h> 73#include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
2660HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) 2660HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
2661/* block stuff */ 2661/* block stuff */
2662#ifdef CONFIG_BLOCK 2662#ifdef CONFIG_BLOCK
2663/* loop */
2664IGNORE_IOCTL(LOOP_CLR_FD)
2663/* Raw devices */ 2665/* Raw devices */
2664HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) 2666HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
2665HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) 2667HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
2728IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 2730IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
2729IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 2731IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
2730 2732
2731/* loop */
2732IGNORE_IOCTL(LOOP_CLR_FD)
2733
2734#ifdef CONFIG_SPARC 2733#ifdef CONFIG_SPARC
2735/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ 2734/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
2736IGNORE_IOCTL(FBIOGTYPE) 2735IGNORE_IOCTL(FBIOGTYPE)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 932a92b31483..c8afa6b1d91d 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
135 struct path path; 135 struct path path;
136 struct configfs_dirent *sd; 136 struct configfs_dirent *sd;
137 struct config_item *parent_item; 137 struct config_item *parent_item;
138 struct config_item *target_item; 138 struct config_item *target_item = NULL;
139 struct config_item_type *type; 139 struct config_item_type *type;
140 140
141 ret = -EPERM; /* What lack-of-symlink returns */ 141 ret = -EPERM; /* What lack-of-symlink returns */
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
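Taken together, the two cramfs hunks fix silent corruption on decompression failure: cramfs_uncompress_block() now returns -EIO instead of pretending it produced zero bytes, and cramfs_readpage() propagates that by marking the page errored instead of handing userspace a page of zeros. The resulting caller pattern, condensed from the hunks above:

    bytes_filled = cramfs_uncompress_block(pgdata, PAGE_CACHE_SIZE,
                                           cramfs_read(sb, start_offset,
                                                       compr_len),
                                           compr_len);
    if (unlikely(bytes_filled < 0)) {       /* zlib failure, not a hole */
            ClearPageUptodate(page);
            SetPageError(page);             /* readers now see -EIO */
    } else {
            /* holes and short blocks stay legal: zero the tail */
            memset(pgdata + bytes_filled, 0,
                   PAGE_CACHE_SIZE - bytes_filled);
            SetPageUptodate(page);
    }

Note the kmap()/kunmap() pair also moved so the mapping is held across both paths, and the bad-blocksize case now takes the error path instead of falling through with bytes_filled == 0.
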
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..75659a6fd1f8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -481,7 +481,7 @@ restart:
481 if ((flags & DCACHE_REFERENCED) 481 if ((flags & DCACHE_REFERENCED)
482 && (dentry->d_flags & DCACHE_REFERENCED)) { 482 && (dentry->d_flags & DCACHE_REFERENCED)) {
483 dentry->d_flags &= ~DCACHE_REFERENCED; 483 dentry->d_flags &= ~DCACHE_REFERENCED;
484 list_move_tail(&dentry->d_lru, &referenced); 484 list_move(&dentry->d_lru, &referenced);
485 spin_unlock(&dentry->d_lock); 485 spin_unlock(&dentry->d_lock);
486 } else { 486 } else {
487 list_move_tail(&dentry->d_lru, &tmp); 487 list_move_tail(&dentry->d_lru, &tmp);
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2149 int result; 2149 int result;
2150 unsigned long seq; 2150 unsigned long seq;
2151 2151
2152 /* FIXME: This is old behavior, needed? Please check callers. */
2153 if (new_dentry == old_dentry) 2152 if (new_dentry == old_dentry)
2154 return 1; 2153 return 1;
2155 2154
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
30 30
31static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 32static int debugfs_mount_count;
33static bool debugfs_registered;
33 34
34static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
35{ 36{
@@ -496,6 +497,16 @@ exit:
496} 497}
497EXPORT_SYMBOL_GPL(debugfs_rename); 498EXPORT_SYMBOL_GPL(debugfs_rename);
498 499
500/**
501 * debugfs_initialized - Tells whether debugfs has been registered
502 */
503bool debugfs_initialized(void)
504{
505 return debugfs_registered;
506}
507EXPORT_SYMBOL_GPL(debugfs_initialized);
508
509
499static struct kobject *debug_kobj; 510static struct kobject *debug_kobj;
500 511
501static int __init debugfs_init(void) 512static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
509 retval = register_filesystem(&debug_fs_type); 520 retval = register_filesystem(&debug_fs_type);
510 if (retval) 521 if (retval)
511 kobject_put(debug_kobj); 522 kobject_put(debug_kobj);
523 else
524 debugfs_registered = true;
525
512 return retval; 526 return retval;
513} 527}
514 528
515static void __exit debugfs_exit(void) 529static void __exit debugfs_exit(void)
516{ 530{
531 debugfs_registered = false;
532
517 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 533 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
518 unregister_filesystem(&debug_fs_type); 534 unregister_filesystem(&debug_fs_type);
519 kobject_put(debug_kobj); 535 kobject_put(debug_kobj);
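debugfs_initialized() exists for callers that can run before debugfs_init() or after debugfs_exit() and want to skip entry creation rather than fail inside it. A hedged usage sketch (driver name and callsite are hypothetical):

    static struct dentry *mydrv_dir;     /* hypothetical driver state */

    static void mydrv_create_debug_entries(void)
    {
            if (!debugfs_initialized())  /* e.g. called from early init */
                    return;
            mydrv_dir = debugfs_create_dir("mydrv", NULL);
    }
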
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 63a4a59e4148..c68edb969441 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -90,6 +90,15 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
90#define PARSE_MOUNT 0 90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1 91#define PARSE_REMOUNT 1
92 92
93/*
94 * parse_mount_options():
95 * Set @opts to mount options specified in @data. If an option is not
96 * specified in @data, set it to its default value. The exception is
97 * 'newinstance' option which can only be set/cleared on a mount (i.e.
98 * cannot be changed during remount).
99 *
100 * Note: @data may be NULL (in which case all options are set to default).
101 */
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) 102static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
94{ 103{
95 char *p; 104 char *p;
@@ -355,12 +364,9 @@ static int devpts_get_sb(struct file_system_type *fs_type,
355 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
356 struct super_block *s; 365 struct super_block *s;
357 366
358 memset(&opts, 0, sizeof(opts)); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
359 if (data) { 368 if (error)
360 error = parse_mount_options(data, PARSE_MOUNT, &opts); 369 return error;
361 if (error)
362 return error;
363 }
364 370
365 if (opts.newinstance) 371 if (opts.newinstance)
366 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -389,11 +395,10 @@ static int devpts_get_sb(struct file_system_type *fs_type,
389 return 0; 395 return 0;
390 396
391out_dput: 397out_dput:
392 dput(s->s_root); 398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
393 399
394out_undo_sget: 400out_undo_sget:
395 up_write(&s->s_umount); 401 deactivate_locked_super(s);
396 deactivate_super(s);
397 return error; 402 return error;
398} 403}
399 404
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..05763bbc2050 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
307 struct bio *bio; 307 struct bio *bio;
308 308
309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 309 bio = bio_alloc(GFP_KERNEL, nr_vecs);
310 if (bio == NULL)
311 return -ENOMEM;
312 310
313 bio->bi_bdev = bdev; 311 bio->bi_bdev = bdev;
314 bio->bi_sector = first_sector; 312 bio->bi_sector = first_sector;
@@ -1126,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1124 int acquire_i_mutex = 0;
1127 1125
1128 if (rw & WRITE) 1126 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1127 rw = WRITE_ODIRECT;
1130 1128
1131 if (bdev) 1129 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
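Two independent direct-io changes here: the NULL check after bio_alloc() goes away, and O_DIRECT writes are tagged WRITE_ODIRECT instead of WRITE_SYNC. The first rests on an allocator guarantee worth spelling out (this is the rationale as I read the cleanup; the patch itself states none):

    /*
     * bio_alloc() with __GFP_WAIT set (GFP_KERNEL includes it) draws
     * from a mempool and sleeps until memory is available, so it does
     * not return NULL here -- the deleted check could never fire.
     */
    bio = bio_alloc(GFP_KERNEL, nr_vecs);
    bio->bi_bdev = bdev;
    bio->bi_sector = first_sector;
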
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612cf..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8b65f289ee00..b91851f1cda3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page)
483 ecryptfs_inode = page->mapping->host; 483 ecryptfs_inode = page->mapping->host;
484 crypt_stat = 484 crypt_stat =
485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
486 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 486 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
487 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
488 0, PAGE_CACHE_SIZE);
489 if (rc)
490 printk(KERN_ERR "%s: Error attempting to copy "
491 "page at index [%ld]\n", __func__,
492 page->index);
493 goto out;
494 }
495 enc_extent_page = alloc_page(GFP_USER); 487 enc_extent_page = alloc_page(GFP_USER);
496 if (!enc_extent_page) { 488 if (!enc_extent_page) {
497 rc = -ENOMEM; 489 rc = -ENOMEM;
@@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page)
620 ecryptfs_inode = page->mapping->host; 612 ecryptfs_inode = page->mapping->host;
621 crypt_stat = 613 crypt_stat =
622 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 614 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
623 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 615 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
624 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
625 PAGE_CACHE_SIZE,
626 ecryptfs_inode);
627 if (rc)
628 printk(KERN_ERR "%s: Error attempting to copy "
629 "page at index [%ld]\n", __func__,
630 page->index);
631 goto out;
632 }
633 enc_extent_page = alloc_page(GFP_USER); 616 enc_extent_page = alloc_page(GFP_USER);
634 if (!enc_extent_page) { 617 if (!enc_extent_page) {
635 rc = -ENOMEM; 618 rc = -ENOMEM;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 064c5820e4e5..00b30a2d5466 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
272#define ECRYPTFS_UNLINK_SIGS 0x00004000
272 u32 flags; 273 u32 flags;
273 unsigned int file_version; 274 unsigned int file_version;
274 size_t iv_bytes; 275 size_t iv_bytes;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55b3145b8072..2f0945d63297 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
379 goto out_d_drop; 379 goto out_d_drop;
380 } 380 }
381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
382 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
382 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 383 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
383 lower_dir_dentry, 384 lower_dir_dentry,
384 ecryptfs_dentry->d_name.len); 385 ecryptfs_dentry->d_name.len);
386 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
385 if (IS_ERR(lower_dentry)) { 387 if (IS_ERR(lower_dentry)) {
386 rc = PTR_ERR(lower_dentry); 388 rc = PTR_ERR(lower_dentry);
387 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 389 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
406 "filename; rc = [%d]\n", __func__, rc); 408 "filename; rc = [%d]\n", __func__, rc);
407 goto out_d_drop; 409 goto out_d_drop;
408 } 410 }
411 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
409 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 412 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
410 lower_dir_dentry, 413 lower_dir_dentry,
411 encrypted_and_encoded_name_size - 1); 414 encrypted_and_encoded_name_size - 1);
415 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
412 if (IS_ERR(lower_dentry)) { 416 if (IS_ERR(lower_dentry)) {
413 rc = PTR_ERR(lower_dentry); 417 rc = PTR_ERR(lower_dentry);
414 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 418 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -636,8 +640,9 @@ static int
636ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 640ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
637{ 641{
638 char *lower_buf; 642 char *lower_buf;
643 size_t lower_bufsiz;
639 struct dentry *lower_dentry; 644 struct dentry *lower_dentry;
640 struct ecryptfs_crypt_stat *crypt_stat; 645 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
641 char *plaintext_name; 646 char *plaintext_name;
642 size_t plaintext_name_size; 647 size_t plaintext_name_size;
643 mm_segment_t old_fs; 648 mm_segment_t old_fs;
@@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
648 rc = -EINVAL; 653 rc = -EINVAL;
649 goto out; 654 goto out;
650 } 655 }
651 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 656 mount_crypt_stat = &ecryptfs_superblock_to_private(
657 dentry->d_sb)->mount_crypt_stat;
658 /*
659 * If the lower filename is encrypted, it will result in a significantly
660 * longer name. If needed, truncate the name after decode and decrypt.
661 */
662 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
663 lower_bufsiz = PATH_MAX;
664 else
665 lower_bufsiz = bufsiz;
652 /* Released in this function */ 666 /* Released in this function */
653 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
654 if (lower_buf == NULL) { 668 if (lower_buf == NULL) {
655 printk(KERN_ERR "%s: Out of memory whilst attempting to " 669 printk(KERN_ERR "%s: Out of memory whilst attempting to "
656 "kmalloc [%d] bytes\n", __func__, bufsiz); 670 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
657 rc = -ENOMEM; 671 rc = -ENOMEM;
658 goto out; 672 goto out;
659 } 673 }
@@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
661 set_fs(get_ds()); 675 set_fs(get_ds());
662 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 676 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
663 (char __user *)lower_buf, 677 (char __user *)lower_buf,
664 bufsiz); 678 lower_bufsiz);
665 set_fs(old_fs); 679 set_fs(old_fs);
666 if (rc >= 0) { 680 if (rc >= 0) {
667 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 681 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
@@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
674 rc); 688 rc);
675 goto out_free_lower_buf; 689 goto out_free_lower_buf;
676 } 690 }
677 rc = copy_to_user(buf, plaintext_name, plaintext_name_size); 691 /* Check for bufsiz <= 0 done in sys_readlinkat() */
692 rc = copy_to_user(buf, plaintext_name,
693 min((size_t) bufsiz, plaintext_name_size));
678 if (rc) 694 if (rc)
679 rc = -EFAULT; 695 rc = -EFAULT;
680 else 696 else
@@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
814 size_t num_zeros = (PAGE_CACHE_SIZE 830 size_t num_zeros = (PAGE_CACHE_SIZE
815 - (new_length & ~PAGE_CACHE_MASK)); 831 - (new_length & ~PAGE_CACHE_MASK));
816 832
833 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
834 rc = vmtruncate(inode, new_length);
835 if (rc)
836 goto out_free;
837 rc = vmtruncate(lower_dentry->d_inode, new_length);
838 goto out_free;
839 }
817 if (num_zeros) { 840 if (num_zeros) {
818 char *zeros_virt; 841 char *zeros_virt;
819 842
@@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
915 } 938 }
916 rc = 0; 939 rc = 0;
917 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 940 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
918 mutex_unlock(&crypt_stat->cs_mutex);
919 goto out;
920 } 941 }
921 } 942 }
922 mutex_unlock(&crypt_stat->cs_mutex); 943 mutex_unlock(&crypt_stat->cs_mutex);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c3145..af737bb56cb7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
740out_release_free_unlock: 740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm); 741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock: 742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); 743 kzfree(s->block_aligned_filename);
744 kfree(s->block_aligned_filename);
745out_unlock: 744out_unlock:
746 mutex_unlock(s->tfm_mutex); 745 mutex_unlock(s->tfm_mutex);
747out: 746out:
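kzfree() replaces the open-coded scrub-then-free for buffers that held key material (the same substitution appears in messaging.c below). One property worth knowing: kzfree() zeroes ksize(p) bytes, i.e. the whole slab object, which is at least as large as the original request:

    /* before: zero the sensitive buffer by hand, then free it */
    memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
    kfree(s->block_aligned_filename);

    /* after: one call, and the scrub cannot be forgotten or miscounted */
    kzfree(s->block_aligned_filename);
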
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index aed56c25539b..9f0aa9883c28 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
190 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 191 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops; 192 dentry->d_op = &ecryptfs_dops;
193 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
194 d_add(dentry, inode);
195 else
196 d_instantiate(dentry, inode);
197 fsstack_copy_attr_all(inode, lower_inode, NULL); 193 fsstack_copy_attr_all(inode, lower_inode, NULL);
198 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
199 * other metadata */ 195 * other metadata */
200 fsstack_copy_inode_size(inode, lower_inode); 196 fsstack_copy_inode_size(inode, lower_inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode);
199 else
200 d_instantiate(dentry, inode);
201out: 201out:
202 return rc; 202 return rc;
203} 203}
@@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err }; 211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
212 212
213static const match_table_t tokens = { 213static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -222,6 +222,7 @@ static const match_table_t tokens = {
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, 222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
225 {ecryptfs_opt_err, NULL} 226 {ecryptfs_opt_err, NULL}
226}; 227};
227 228
@@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
402 fn_cipher_key_bytes; 403 fn_cipher_key_bytes;
403 fn_cipher_key_bytes_set = 1; 404 fn_cipher_key_bytes_set = 1;
404 break; 405 break;
406 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break;
405 case ecryptfs_opt_err: 409 case ecryptfs_opt_err:
406 default: 410 default:
407 printk(KERN_WARNING 411 printk(KERN_WARNING
@@ -610,9 +614,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
610 } 614 }
611 goto out; 615 goto out;
612out_abort: 616out_abort:
613 dput(sb->s_root); 617 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
614 up_write(&sb->s_umount); 618 deactivate_locked_super(sb);
615 deactivate_super(sb);
616out: 619out:
617 return rc; 620 return rc;
618} 621}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e01..f1c17e87c5fb 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -133,45 +133,6 @@ out:
133 return rc; 133 return rc;
134} 134}
135 135
136static int
137ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
138 struct ecryptfs_msg_ctx **msg_ctx);
139
140/**
141 * ecryptfs_send_raw_message
142 * @msg_type: Message type
143 * @daemon: Daemon struct for recipient of message
144 *
145 * A raw message is one that does not include an ecryptfs_message
146 * struct. It simply has a type.
147 *
148 * Must be called with ecryptfs_daemon_hash_mux held.
149 *
150 * Returns zero on success; non-zero otherwise
151 */
152static int ecryptfs_send_raw_message(u8 msg_type,
153 struct ecryptfs_daemon *daemon)
154{
155 struct ecryptfs_msg_ctx *msg_ctx;
156 int rc;
157
158 rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
159 if (rc) {
160 printk(KERN_ERR "%s: Error whilst attempting to send "
161 "message to ecryptfsd; rc = [%d]\n", __func__, rc);
162 goto out;
163 }
164 /* Raw messages are logically context-free (e.g., no
165 * reply is expected), so we set the state of the
166 * ecryptfs_msg_ctx object to indicate that it should
167 * be freed as soon as the message is sent. */
168 mutex_lock(&msg_ctx->mux);
169 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
170 mutex_unlock(&msg_ctx->mux);
171out:
172 return rc;
173}
174
175/** 136/**
176 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct 137 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
177 * @daemon: Pointer to set to newly allocated daemon struct 138 * @daemon: Pointer to set to newly allocated daemon struct
@@ -212,49 +173,6 @@ out:
212} 173}
213 174
214/** 175/**
215 * ecryptfs_process_helo
216 * @euid: The user ID owner of the message
217 * @user_ns: The namespace in which @euid applies
218 * @pid: The process ID for the userspace program that sent the
219 * message
220 *
221 * Adds the euid and pid values to the daemon euid hash. If an euid
222 * already has a daemon pid registered, the daemon will be
223 * unregistered before the new daemon is put into the hash list.
224 * Returns zero after adding a new daemon to the hash list;
225 * non-zero otherwise.
226 */
227int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
228 struct pid *pid)
229{
230 struct ecryptfs_daemon *new_daemon;
231 struct ecryptfs_daemon *old_daemon;
232 int rc;
233
234 mutex_lock(&ecryptfs_daemon_hash_mux);
235 rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
236 if (rc != 0) {
237 printk(KERN_WARNING "Received request from user [%d] "
238 "to register daemon [0x%p]; unregistering daemon "
239 "[0x%p]\n", euid, pid, old_daemon->pid);
240 rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
241 if (rc)
242 printk(KERN_WARNING "Failed to send QUIT "
243 "message to daemon [0x%p]; rc = [%d]\n",
244 old_daemon->pid, rc);
245 hlist_del(&old_daemon->euid_chain);
246 kfree(old_daemon);
247 }
248 rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
249 if (rc)
250 printk(KERN_ERR "%s: The gods are displeased with this attempt "
251 "to create a new daemon object for euid [%d]; pid "
252 "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
253 mutex_unlock(&ecryptfs_daemon_hash_mux);
254 return rc;
255}
256
257/**
258 * ecryptfs_exorcise_daemon - Destroy the daemon struct 176 * ecryptfs_exorcise_daemon - Destroy the daemon struct
259 * 177 *
260 * Must be called ceremoniously while in possession of 178 * Must be called ceremoniously while in possession of
@@ -291,8 +209,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
291 if (daemon->user_ns) 209 if (daemon->user_ns)
292 put_user_ns(daemon->user_ns); 210 put_user_ns(daemon->user_ns);
293 mutex_unlock(&daemon->mux); 211 mutex_unlock(&daemon->mux);
294 memset(daemon, 0, sizeof(*daemon)); 212 kzfree(daemon);
295 kfree(daemon);
296out: 213out:
297 return rc; 214 return rc;
298} 215}
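
kzfree(), also introduced in 2.6.30, looks up the allocation size via ksize() and zeroes the buffer before freeing, which is exactly what the removed memset() + kfree() pair did by hand; it is only valid on kmalloc()'d pointers. An illustrative sketch:

    #include <linux/slab.h>

    struct demo_secret {
            u8 key[32];             /* key material that must not linger */
    };

    static void demo_drop_secret(struct demo_secret *s)
    {
            /* Equivalent to: memset(s, 0, ksize(s)); kfree(s); */
            kzfree(s);
    }
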
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea655f49..4ec8f61ccf5a 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
193 int rc = 0; 193 int rc = 0;
194 194
195 mutex_lock(&msg_ctx->mux); 195 mutex_lock(&msg_ctx->mux);
196 if (data) { 196 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
197 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), 197 GFP_KERNEL);
198 GFP_KERNEL); 198 if (!msg_ctx->msg) {
199 if (!msg_ctx->msg) { 199 rc = -ENOMEM;
200 rc = -ENOMEM; 200 printk(KERN_ERR "%s: Out of memory whilst attempting "
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__, 202 (sizeof(*msg_ctx->msg) + data_size));
203 (sizeof(*msg_ctx->msg) + data_size)); 203 goto out_unlock;
204 goto out_unlock; 204 }
205 }
206 } else
207 msg_ctx->msg = NULL;
208 msg_ctx->msg->index = msg_ctx->index; 205 msg_ctx->msg->index = msg_ctx->index;
209 msg_ctx->msg->data_len = data_size; 206 msg_ctx->msg->data_len = data_size;
210 msg_ctx->type = msg_type; 207 msg_ctx->type = msg_type;
211 if (data) { 208 memcpy(msg_ctx->msg->data, data, data_size);
212 memcpy(msg_ctx->msg->data, data, data_size); 209 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
213 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
214 } else
215 msg_ctx->msg_size = 0;
216 mutex_lock(&daemon->mux); 210 mutex_lock(&daemon->mux);
217 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); 211 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
218 daemon->num_queued_msg_ctx++; 212 daemon->num_queued_msg_ctx++;
@@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
418 412
419 if (count == 0) 413 if (count == 0)
420 goto out; 414 goto out;
421 data = kmalloc(count, GFP_KERNEL); 415
422 if (!data) { 416 data = memdup_user(buf, count);
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 417 if (IS_ERR(data)) {
424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); 418 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
419 __func__, PTR_ERR(data));
425 goto out; 420 goto out;
426 } 421 }
427 rc = copy_from_user(data, buf, count);
428 if (rc) {
429 printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
430 __func__, rc);
431 goto out_free;
432 }
433 sz = count; 422 sz = count;
434 i = 0; 423 i = 0;
435 switch (data[i++]) { 424 switch (data[i++]) {
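
memdup_user() folds the kmalloc() + copy_from_user() + unwind dance into one call that returns either the kernel copy or an ERR_PTR() (-ENOMEM or -EFAULT), which is why the write handler above loses its out_free label. A minimal usage sketch:

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/uaccess.h>

    static ssize_t demo_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
    {
            char *data = memdup_user(buf, count);

            if (IS_ERR(data))
                    return PTR_ERR(data);   /* -ENOMEM or -EFAULT */

            /* ... parse the kernel-side copy in data[0..count) ... */

            kfree(data);
            return count;
    }
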
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 46cec2b69796..5c6bab9786e3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
449 struct ecryptfs_crypt_stat *crypt_stat; 449 struct ecryptfs_crypt_stat *crypt_stat;
450 450
451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
452 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
452 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 453 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
453 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); 454 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
454 else 455 else
@@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file,
490 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 491 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
491 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 492 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
492 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 493 "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
494 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
495 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
496 to);
497 if (!rc) {
498 rc = copied;
499 fsstack_copy_inode_size(ecryptfs_inode,
500 ecryptfs_inode_to_lower(ecryptfs_inode));
501 }
502 goto out;
503 }
493 /* Fills in zeros if 'to' goes beyond inode size */ 504 /* Fills in zeros if 'to' goes beyond inode size */
494 rc = fill_zeros_to_end_of_page(page, to); 505 rc = fill_zeros_to_end_of_page(page, to);
495 if (rc) { 506 if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 75c2ea9fee35..a137c6ea2fee 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
117 size_t size) 117 size_t size)
118{ 118{
119 struct page *ecryptfs_page; 119 struct page *ecryptfs_page;
120 struct ecryptfs_crypt_stat *crypt_stat;
121 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
120 char *ecryptfs_page_virt; 122 char *ecryptfs_page_virt;
121 loff_t ecryptfs_file_size = 123 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
122 i_size_read(ecryptfs_file->f_dentry->d_inode);
123 loff_t data_offset = 0; 124 loff_t data_offset = 0;
124 loff_t pos; 125 loff_t pos;
125 int rc = 0; 126 int rc = 0;
126 127
128 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
127 /* 129 /*
128 * if we are writing beyond current size, then start pos 130 * if we are writing beyond current size, then start pos
129 * at the current size - we'll fill in zeros from there. 131 * at the current size - we'll fill in zeros from there.
@@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
184 flush_dcache_page(ecryptfs_page); 186 flush_dcache_page(ecryptfs_page);
185 SetPageUptodate(ecryptfs_page); 187 SetPageUptodate(ecryptfs_page);
186 unlock_page(ecryptfs_page); 188 unlock_page(ecryptfs_page);
187 rc = ecryptfs_encrypt_page(ecryptfs_page); 189 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
190 rc = ecryptfs_encrypt_page(ecryptfs_page);
191 else
192 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
193 ecryptfs_page,
194 start_offset_in_page,
195 data_offset);
188 page_cache_release(ecryptfs_page); 196 page_cache_release(ecryptfs_page);
189 if (rc) { 197 if (rc) {
190 printk(KERN_ERR "%s: Error encrypting " 198 printk(KERN_ERR "%s: Error encrypting "
@@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
194 pos += num_bytes; 202 pos += num_bytes;
195 } 203 }
196 if ((offset + size) > ecryptfs_file_size) { 204 if ((offset + size) > ecryptfs_file_size) {
197 i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); 205 i_size_write(ecryptfs_inode, (offset + size));
198 rc = ecryptfs_write_inode_size_to_metadata( 206 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
199 ecryptfs_file->f_dentry->d_inode); 207 rc = ecryptfs_write_inode_size_to_metadata(
200 if (rc) { 208 ecryptfs_inode);
201 printk(KERN_ERR "Problem with " 209 if (rc) {
202 "ecryptfs_write_inode_size_to_metadata; " 210 printk(KERN_ERR "Problem with "
203 "rc = [%d]\n", rc); 211 "ecryptfs_write_inode_size_to_metadata; "
204 goto out; 212 "rc = [%d]\n", rc);
213 goto out;
214 }
205 } 215 }
206 } 216 }
207out: 217out:
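
Both hunks in this file hinge on the same split: encrypted inodes go through ecryptfs_encrypt_page() and persist their logical size in the header or xattr metadata, while unencrypted inodes write page segments straight to the lower file and have no metadata to update. The per-page decision, restated with the rationale as comments:

    if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
            /* Cipher the page and keep the size in the eCryptfs
             * header/xattr, which only encrypted files carry. */
            rc = ecryptfs_encrypt_page(ecryptfs_page);
    } else {
            /* Plaintext passthrough: the lower inode's i_size is
             * authoritative, so there is nothing extra to persist. */
            rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
                                                   ecryptfs_page,
                                                   start_offset_in_page,
                                                   data_offset);
    }
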
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index c27ac2b358a1..fa4c7e7d15d9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
170 list_for_each_entry(walker, 170 list_for_each_entry(walker,
171 &mount_crypt_stat->global_auth_tok_list, 171 &mount_crypt_stat->global_auth_tok_list,
172 mount_crypt_stat_list) { 172 mount_crypt_stat_list) {
173 seq_printf(m, ",ecryptfs_sig=%s", walker->sig); 173 if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
174 seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
175 else
176 seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
174 } 177 }
175 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 178 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
176 179
@@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
186 seq_printf(m, ",ecryptfs_xattr_metadata"); 189 seq_printf(m, ",ecryptfs_xattr_metadata");
187 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 190 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
188 seq_printf(m, ",ecryptfs_encrypted_view"); 191 seq_printf(m, ",ecryptfs_encrypted_view");
192 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
193 seq_printf(m, ",ecryptfs_unlink_sigs");
189 194
190 return 0; 195 return 0;
191} 196}
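
The show_options change follows the usual super_operations pattern: emit one ",option" fragment per active feature so that /proc/mounts round-trips to a string the option parser accepts. A stripped-down sketch (demo_state() and the flag names are placeholders, not eCryptfs API):

    static int demo_show_options(struct seq_file *m, struct vfsmount *mnt)
    {
            struct demo_mount_state *s = demo_state(mnt->mnt_sb);

            /* One fragment per flag, leading comma included */
            if (s->flags & DEMO_UNLINK_SIGS)
                    seq_printf(m, ",ecryptfs_unlink_sigs");
            if (s->flags & DEMO_XATTR_METADATA)
                    seq_printf(m, ",ecryptfs_xattr_metadata");
            return 0;
    }
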
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
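
The f_fsid lines follow the convention most block-backed filesystems adopted around this cycle: pack the backing device's dev_t with huge_encode_dev() and split the 64-bit result across the two 32-bit fsid words. The idiom in isolation:

    #include <linux/kdev_t.h>
    #include <linux/statfs.h>

    static void demo_fill_fsid(struct super_block *sb, struct kstatfs *buf)
    {
            /* dev_t widened to 64 bits, then split low/high */
            u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

            buf->f_fsid.val[0] = (u32)id;
            buf->f_fsid.val[1] = (u32)(id >> 32);
    }
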
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa2..2a701d593d35 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
28 * issue a wakeup. 28 * issue a wakeup.
29 */ 29 */
30 __u64 count; 30 __u64 count;
31 unsigned int flags;
31}; 32};
32 33
33/* 34/*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
50 n = (int) (ULLONG_MAX - ctx->count); 51 n = (int) (ULLONG_MAX - ctx->count);
51 ctx->count += n; 52 ctx->count += n;
52 if (waitqueue_active(&ctx->wqh)) 53 if (waitqueue_active(&ctx->wqh))
53 wake_up_locked(&ctx->wqh); 54 wake_up_locked_poll(&ctx->wqh, POLLIN);
54 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 55 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
55 56
56 return n; 57 return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
87{ 88{
88 struct eventfd_ctx *ctx = file->private_data; 89 struct eventfd_ctx *ctx = file->private_data;
89 ssize_t res; 90 ssize_t res;
90 __u64 ucnt; 91 __u64 ucnt = 0;
91 DECLARE_WAITQUEUE(wait, current); 92 DECLARE_WAITQUEUE(wait, current);
92 93
93 if (count < sizeof(ucnt)) 94 if (count < sizeof(ucnt))
94 return -EINVAL; 95 return -EINVAL;
95 spin_lock_irq(&ctx->wqh.lock); 96 spin_lock_irq(&ctx->wqh.lock);
96 res = -EAGAIN; 97 res = -EAGAIN;
97 ucnt = ctx->count; 98 if (ctx->count > 0)
98 if (ucnt > 0)
99 res = sizeof(ucnt); 99 res = sizeof(ucnt);
100 else if (!(file->f_flags & O_NONBLOCK)) { 100 else if (!(file->f_flags & O_NONBLOCK)) {
101 __add_wait_queue(&ctx->wqh, &wait); 101 __add_wait_queue(&ctx->wqh, &wait);
102 for (res = 0;;) { 102 for (res = 0;;) {
103 set_current_state(TASK_INTERRUPTIBLE); 103 set_current_state(TASK_INTERRUPTIBLE);
104 if (ctx->count > 0) { 104 if (ctx->count > 0) {
105 ucnt = ctx->count;
106 res = sizeof(ucnt); 105 res = sizeof(ucnt);
107 break; 106 break;
108 } 107 }
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
117 __remove_wait_queue(&ctx->wqh, &wait); 116 __remove_wait_queue(&ctx->wqh, &wait);
118 __set_current_state(TASK_RUNNING); 117 __set_current_state(TASK_RUNNING);
119 } 118 }
120 if (res > 0) { 119 if (likely(res > 0)) {
121 ctx->count = 0; 120 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
121 ctx->count -= ucnt;
122 if (waitqueue_active(&ctx->wqh)) 122 if (waitqueue_active(&ctx->wqh))
123 wake_up_locked(&ctx->wqh); 123 wake_up_locked_poll(&ctx->wqh, POLLOUT);
124 } 124 }
125 spin_unlock_irq(&ctx->wqh.lock); 125 spin_unlock_irq(&ctx->wqh.lock);
126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) 126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
166 __remove_wait_queue(&ctx->wqh, &wait); 166 __remove_wait_queue(&ctx->wqh, &wait);
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 } 168 }
169 if (res > 0) { 169 if (likely(res > 0)) {
170 ctx->count += ucnt; 170 ctx->count += ucnt;
171 if (waitqueue_active(&ctx->wqh)) 171 if (waitqueue_active(&ctx->wqh))
172 wake_up_locked(&ctx->wqh); 172 wake_up_locked_poll(&ctx->wqh, POLLIN);
173 } 173 }
174 spin_unlock_irq(&ctx->wqh.lock); 174 spin_unlock_irq(&ctx->wqh.lock);
175 175
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); 207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
209 209
210 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) 210 if (flags & ~EFD_FLAGS_SET)
211 return -EINVAL; 211 return -EINVAL;
212 212
213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
216 216
217 init_waitqueue_head(&ctx->wqh); 217 init_waitqueue_head(&ctx->wqh);
218 ctx->count = count; 218 ctx->count = count;
219 ctx->flags = flags;
219 220
220 /* 221 /*
221 * When we call this, the initialization must be complete, since 222 * When we call this, the initialization must be complete, since
222 * anon_inode_getfd() will install the fd. 223 * anon_inode_getfd() will install the fd.
223 */ 224 */
224 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 225 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
225 flags & (O_CLOEXEC | O_NONBLOCK)); 226 flags & EFD_SHARED_FCNTL_FLAGS);
226 if (fd < 0) 227 if (fd < 0)
227 kfree(ctx); 228 kfree(ctx);
228 return fd; 229 return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
232{ 233{
233 return sys_eventfd2(count, 0); 234 return sys_eventfd2(count, 0);
234} 235}
236
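
The new ctx->flags field is what makes EFD_SEMAPHORE possible: on read, a semaphore-mode eventfd hands out one "token" (the value read is always 1 and the counter drops by one) instead of draining the whole counter. A userspace sketch, assuming a kernel and libc that expose EFD_SEMAPHORE (2.6.30+):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = eventfd(3, EFD_SEMAPHORE);     /* counter starts at 3 */
            uint64_t v;

            /* Three reads each return 1; a fourth would block, since the
             * counter has reached zero. */
            for (int i = 0; i < 3; i++) {
                    read(fd, &v, sizeof(v));
                    printf("read %d -> %llu\n", i, (unsigned long long)v);
            }
            close(fd);
            return 0;
    }
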
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..5458e80fc558 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
71 * a better scalability. 71 * a better scalability.
72 */ 72 */
73 73
74#define DEBUG_EPOLL 0
75
76#if DEBUG_EPOLL > 0
77#define DPRINTK(x) printk x
78#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
79#else /* #if DEBUG_EPOLL > 0 */
80#define DPRINTK(x) (void) 0
81#define DNPRINTK(n, x) (void) 0
82#endif /* #if DEBUG_EPOLL > 0 */
83
84#define DEBUG_EPI 0
85
86#if DEBUG_EPI != 0
87#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
88#else /* #if DEBUG_EPI != 0 */
89#define EPI_SLAB_DEBUG 0
90#endif /* #if DEBUG_EPI != 0 */
91
92/* Epoll private bits inside the event mask */ 74/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 75#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 76
95/* Maximum number of poll wake up nests we are allowing */ 77/* Maximum nesting depth allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 78#define EP_MAX_NESTS 4
97 79
98/* Maximum msec timeout value storeable in a long int */ 80/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
110}; 92};
111 93
112/* 94/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 95 * Structure used to track possible nested calls, to detect too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 96 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 97 */
119struct wake_task_node { 98struct nested_call_node {
120 struct list_head llink; 99 struct list_head llink;
121 struct task_struct *task; 100 void *cookie;
122 wait_queue_head_t *wq; 101 int cpu;
123}; 102};
124 103
125/* 104/*
126 * This is used to implement the safe poll wake up avoiding to reenter 105 * This structure is used as a collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 106 * maximum recursion depth and loop cycles.
128 */ 107 */
129struct poll_safewake { 108struct nested_calls {
130 struct list_head wake_task_list; 109 struct list_head tasks_call_list;
131 spinlock_t lock; 110 spinlock_t lock;
132}; 111};
133 112
@@ -213,7 +192,7 @@ struct eppoll_entry {
213 struct list_head llink; 192 struct list_head llink;
214 193
215 /* The "base" pointer is set to the container "struct epitem" */ 194 /* The "base" pointer is set to the container "struct epitem" */
216 void *base; 195 struct epitem *base;
217 196
218 /* 197 /*
219 * Wait queue item that will be linked to the target file wait 198 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
231 struct epitem *epi; 210 struct epitem *epi;
232}; 211};
233 212
213/* Used by the ep_send_events() function as callback private data */
214struct ep_send_events_data {
215 int maxevents;
216 struct epoll_event __user *events;
217};
218
234/* 219/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 220 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 221 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
242 */ 227 */
243static DEFINE_MUTEX(epmutex); 228static DEFINE_MUTEX(epmutex);
244 229
245/* Safe wake up implementation */ 230/* Used for safe wake up implementation */
246static struct poll_safewake psw; 231static struct nested_calls poll_safewake_ncalls;
232
233/* Used to call file's f_op->poll() under the nested calls boundaries */
234static struct nested_calls poll_readywalk_ncalls;
247 235
248/* Slab cache used to allocate "struct epitem" */ 236/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 237static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
312} 300}
313 301
314/* Initialize the poll safe wake up structure */ 302/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 303static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 304{
317 305 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 306 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 307}
321 308
322/* 309/**
323 * Perform a safe wake up of the poll wait list. The problem is that 310 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 311 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 312 * the same nested call (identified by its cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 313 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 314 *
328 * and we cannot reenter the same wait queue head at all. This will 315 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 316 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 317 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 318 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 319 * @cookie: Cookie to be used to identify this nested call.
320 *
321 * Returns: Returns the code returned by the @nproc callback, or -1 if
322 * the maximum recursion limit has been exceeded.
333 */ 323 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 324static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
325 int (*nproc)(void *, void *, int), void *priv,
326 void *cookie)
335{ 327{
336 int wake_nests = 0; 328 int error, call_nests = 0;
337 unsigned long flags; 329 unsigned long flags;
338 struct task_struct *this_task = current; 330 int this_cpu = get_cpu();
339 struct list_head *lsthead = &psw->wake_task_list; 331 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 332 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 333 struct nested_call_node tnode;
342 334
343 spin_lock_irqsave(&psw->lock, flags); 335 spin_lock_irqsave(&ncalls->lock, flags);
344 336
345 /* Try to see if the current task is already inside this wakeup call */ 337 /*
338 * Try to see if the current task is already inside this wakeup call.
339 * We use a list here, since the population inside this set is always
340 * very much limited.
341 */
346 list_for_each_entry(tncur, lsthead, llink) { 342 list_for_each_entry(tncur, lsthead, llink) {
347 343 if (tncur->cpu == this_cpu &&
348 if (tncur->wq == wq || 344 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 345 /*
351 * Oops ... loop detected or maximum nest level reached. 346 * Oops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 347 * We abort this wake by breaking the cycle itself.
353 */ 348 */
354 spin_unlock_irqrestore(&psw->lock, flags); 349 error = -1;
355 return; 350 goto out_unlock;
356 } 351 }
357 } 352 }
358 353
359 /* Add the current task to the list */ 354 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 355 tnode.cpu = this_cpu;
361 tnode.wq = wq; 356 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 357 list_add(&tnode.llink, lsthead);
363 358
364 spin_unlock_irqrestore(&psw->lock, flags); 359 spin_unlock_irqrestore(&ncalls->lock, flags);
365 360
366 /* Do really wake up now */ 361 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 362 error = (*nproc)(priv, cookie, call_nests);
368 363
369 /* Remove the current task from the list */ 364 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 365 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 366 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 367 out_unlock:
368 spin_unlock_irqrestore(&ncalls->lock, flags);
369
370 put_cpu();
371 return error;
372}
373
374#ifdef CONFIG_DEBUG_LOCK_ALLOC
375static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
376 unsigned long events, int subclass)
377{
378 unsigned long flags;
379
380 spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
381 wake_up_locked_poll(wqueue, events);
382 spin_unlock_irqrestore(&wqueue->lock, flags);
383}
384#else
385static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
386 unsigned long events, int subclass)
387{
388 wake_up_poll(wqueue, events);
389}
390#endif
391
392static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
393{
394 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
395 1 + call_nests);
396 return 0;
397}
398
399/*
400 * Perform a safe wake up of the poll wait list. The problem is that
401 * with the new callback'd wake up system, it is possible that the
402 * poll callback is reentered from inside the call to wake_up() done
403 * on the poll wait queue head. The rule is that we cannot reenter the
404 * wake up code from the same task more than EP_MAX_NESTS times,
405 * and we cannot reenter the same wait queue head at all. This allows
406 * a hierarchy of epoll file descriptors of no more than
407 * EP_MAX_NESTS deep.
408 */
409static void ep_poll_safewake(wait_queue_head_t *wq)
410{
411 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
412 ep_poll_wakeup_proc, NULL, wq);
373} 413}
374 414
375/* 415/*
376 * This function unregister poll callbacks from the associated file descriptor. 416 * This function unregisters poll callbacks from the associated file
377 * Since this must be called without holding "ep->lock" the atomic exchange trick 417 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
378 * will protect us from multiple unregister. 418 * ep_free).
379 */ 419 */
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 420static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{ 421{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist; 422 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq; 423 struct eppoll_entry *pwq;
385 424
386 /* This is called without locks, so we need the atomic exchange */ 425 while (!list_empty(lsthead)) {
387 nwait = xchg(&epi->nwait, 0); 426 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
388 427
389 if (nwait) { 428 list_del(&pwq->llink);
390 while (!list_empty(lsthead)) { 429 remove_wait_queue(pwq->whead, &pwq->wait);
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 430 kmem_cache_free(pwq_cache, pwq);
431 }
432}
392 433
393 list_del_init(&pwq->llink); 434/**
394 remove_wait_queue(pwq->whead, &pwq->wait); 435 * ep_scan_ready_list - Scans the ready list in a way that makes it possible for
395 kmem_cache_free(pwq_cache, pwq); 436 * the scan code to call f_op->poll(). Also allows for
396 } 437 * O(NumReady) performance.
438 *
439 * @ep: Pointer to the epoll private data structure.
440 * @sproc: Pointer to the scan callback.
441 * @priv: Private opaque data passed to the @sproc callback.
442 *
443 * Returns: The same integer error code returned by the @sproc callback.
444 */
445static int ep_scan_ready_list(struct eventpoll *ep,
446 int (*sproc)(struct eventpoll *,
447 struct list_head *, void *),
448 void *priv)
449{
450 int error, pwake = 0;
451 unsigned long flags;
452 struct epitem *epi, *nepi;
453 LIST_HEAD(txlist);
454
455 /*
456 * We need to lock this because we could be hit by
457 * eventpoll_release_file() and epoll_ctl().
458 */
459 mutex_lock(&ep->mtx);
460
461 /*
462 * Steal the ready list, and re-init the original one to the
463 * empty list. Also, set ep->ovflist to NULL so that events
464 * happening while looping w/out locks, are not lost. We cannot
465 * have the poll callback to queue directly on ep->rdllist,
466 * because we want the "sproc" callback to be able to do it
467 * in a lockless way.
468 */
469 spin_lock_irqsave(&ep->lock, flags);
470 list_splice_init(&ep->rdllist, &txlist);
471 ep->ovflist = NULL;
472 spin_unlock_irqrestore(&ep->lock, flags);
473
474 /*
475 * Now call the callback function.
476 */
477 error = (*sproc)(ep, &txlist, priv);
478
479 spin_lock_irqsave(&ep->lock, flags);
480 /*
481 * During the time we spent inside the "sproc" callback, some
482 * other events might have been queued by the poll callback.
483 * We re-insert them inside the main ready-list here.
484 */
485 for (nepi = ep->ovflist; (epi = nepi) != NULL;
486 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
487 /*
488 * We need to check if the item is already in the list.
489 * During the "sproc" callback execution time, items are
490 * queued into ->ovflist but the "txlist" might already
491 * contain them, and the list_splice() below takes care of them.
492 */
493 if (!ep_is_linked(&epi->rdllink))
494 list_add_tail(&epi->rdllink, &ep->rdllist);
495 }
496 /*
497 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
498 * releasing the lock, events will be queued in the normal way inside
499 * ep->rdllist.
500 */
501 ep->ovflist = EP_UNACTIVE_PTR;
502
503 /*
504 * Quickly re-inject items left on "txlist".
505 */
506 list_splice(&txlist, &ep->rdllist);
507
508 if (!list_empty(&ep->rdllist)) {
509 /*
510 * Wake up (if active) both the eventpoll wait list and
511 * the ->poll() wait list (delayed after we release the lock).
512 */
513 if (waitqueue_active(&ep->wq))
514 wake_up_locked(&ep->wq);
515 if (waitqueue_active(&ep->poll_wait))
516 pwake++;
397 } 517 }
518 spin_unlock_irqrestore(&ep->lock, flags);
519
520 mutex_unlock(&ep->mtx);
521
522 /* We have to call this outside the lock */
523 if (pwake)
524 ep_poll_safewake(&ep->poll_wait);
525
526 return error;
398} 527}
399 528
400/* 529/*
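
ep_call_nested() is the generalized bound introduced above: each invocation pushes a (cpu, cookie) node onto the structure's list, refuses with -1 if the same cookie is already present (a cycle) or if more than max_nests frames from this CPU are (too deep), and otherwise runs nproc. A hedged sketch of a call site (demo_nproc is illustrative only):

    static int demo_nproc(void *priv, void *cookie, int call_nests)
    {
            /* call_nests is the current depth; the safewake path uses it
             * as a lockdep subclass when waking nested wait queues. */
            return 0;
    }

    static void demo_nested_call(struct nested_calls *ncalls,
                                 wait_queue_head_t *wq)
    {
            /* The wait queue head doubles as the cookie: re-entering on
             * the same head is a loop, and the call is simply dropped. */
            if (ep_call_nested(ncalls, EP_MAX_NESTS, demo_nproc, NULL, wq) < 0)
                    pr_debug("nested call suppressed (loop or depth)\n");
    }
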
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
434 563
435 atomic_dec(&ep->user->epoll_watches); 564 atomic_dec(&ep->user->epoll_watches);
436 565
437 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
438 current, ep, file));
439
440 return 0; 566 return 0;
441} 567}
442 568
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
447 573
448 /* We need to release all tasks waiting for these file */ 574 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 575 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 576 ep_poll_safewake(&ep->poll_wait);
451 577
452 /* 578 /*
453 * We need to lock this because we could be hit by 579 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
492 if (ep) 618 if (ep)
493 ep_free(ep); 619 ep_free(ep);
494 620
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0; 621 return 0;
497} 622}
498 623
624static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
625 void *priv)
626{
627 struct epitem *epi, *tmp;
628
629 list_for_each_entry_safe(epi, tmp, head, rdllink) {
630 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
631 epi->event.events)
632 return POLLIN | POLLRDNORM;
633 else {
634 /*
635 * Item has been dropped into the ready list by the poll
636 * callback, but it's not actually ready, as far as
637 * the caller-requested events go. We can remove it here.
638 */
639 list_del_init(&epi->rdllink);
640 }
641 }
642
643 return 0;
644}
645
646static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
647{
648 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
649}
650
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 651static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 652{
501 unsigned int pollflags = 0; 653 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 654 struct eventpoll *ep = file->private_data;
504 655
505 /* Insert inside our poll wait queue */ 656 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 657 poll_wait(file, &ep->poll_wait, wait);
507 658
508 /* Check our condition */ 659 /*
509 spin_lock_irqsave(&ep->lock, flags); 660 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 661 * the ready list. This need to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 662 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 663 * could re-enter here.
664 */
665 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
666 ep_poll_readyevents_proc, ep, ep);
513 667
514 return pollflags; 668 return pollflags != -1 ? pollflags : 0;
515} 669}
516 670
517/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 695 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 696 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 697 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 698 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 699 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 700 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 701 * will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
588 742
589 *pep = ep; 743 *pep = ep;
590 744
591 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
592 current, ep));
593 return 0; 745 return 0;
594 746
595free_uid: 747free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
623 } 775 }
624 } 776 }
625 777
626 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
627 current, file, epir));
628
629 return epir; 778 return epir;
630} 779}
631 780
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
641 struct epitem *epi = ep_item_from_wait(wait); 790 struct epitem *epi = ep_item_from_wait(wait);
642 struct eventpoll *ep = epi->ep; 791 struct eventpoll *ep = epi->ep;
643 792
644 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
645 current, epi->ffd.file, epi, ep));
646
647 spin_lock_irqsave(&ep->lock, flags); 793 spin_lock_irqsave(&ep->lock, flags);
648 794
649 /* 795 /*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
656 goto out_unlock; 802 goto out_unlock;
657 803
658 /* 804 /*
805 * Check the events coming with the callback. At this stage, not
806 * every device reports the events in the "key" parameter of the
807 * callback. We need to be able to handle both cases here, hence the
808 * test for "key" != NULL before the event match test.
809 */
810 if (key && !((unsigned long) key & epi->event.events))
811 goto out_unlock;
812
813 /*
659 * If we are transferring events to userspace, we can hold no locks 814
660 * (because we're accessing user memory, and because of linux f_op->poll() 815
661 * semantics). All the events that happen during that period of time are 816
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 825 }
671 826
672 /* If this file is already in the ready list we exit soon */ 827 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 828 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 829 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 830
678is_linked:
679 /* 831 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 832 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 833 * wait list.
@@ -690,7 +842,7 @@ out_unlock:
690 842
691 /* We have to call this outside the lock */ 843 /* We have to call this outside the lock */
692 if (pwake) 844 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 845 ep_poll_safewake(&ep->poll_wait);
694 846
695 return 1; 847 return 1;
696} 848}
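
The wake_up_locked_poll() conversions and this new check are two halves of one mechanism: wakers now pass the just-raised events as the wakeup key, and ep_poll_callback() discards wakeups whose key does not intersect the item's interest set. Condensed from the hunks above:

    /* Waker side (e.g. eventfd): say which events became true. */
    if (waitqueue_active(&ctx->wqh))
            wake_up_locked_poll(&ctx->wqh, POLLIN);

    /* Callback side (ep_poll_callback): key may be NULL, since not every
     * driver reports events yet, so only filter when it is provided. */
    if (key && !((unsigned long) key & epi->event.events))
            goto out_unlock;        /* uninteresting wakeup, ignore it */
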
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 969
818 /* We have to call this outside the lock */ 970 /* We have to call this outside the lock */
819 if (pwake) 971 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 972 ep_poll_safewake(&ep->poll_wait);
821
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd));
824 973
825 return 0; 974 return 0;
826 975
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
851{ 1000{
852 int pwake = 0; 1001 int pwake = 0;
853 unsigned int revents; 1002 unsigned int revents;
854 unsigned long flags;
855 1003
856 /* 1004 /*
857 * Set the new event interest mask before calling f_op->poll(), otherwise 1005 * Set the new event interest mask before calling f_op->poll();
858 * a potential race might occur. In fact if we do this operation inside 1006 * otherwise we might miss an event that happens between the
859 * the lock, an event might happen between the f_op->poll() call and the 1007 * f_op->poll() call and the new event set registering.
860 * new event set registering.
861 */ 1008 */
862 epi->event.events = event->events; 1009 epi->event.events = event->events;
1010 epi->event.data = event->data; /* protected by mtx */
863 1011
864 /* 1012 /*
865 * Get current event bits. We can safely use the file* here because 1013 * Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
867 */ 1015 */
868 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1016 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
869 1017
870 spin_lock_irqsave(&ep->lock, flags);
871
872 /* Copy the data member from inside the lock */
873 epi->event.data = event->data;
874
875 /* 1018 /*
876 * If the item is "hot" and it is not registered inside the ready 1019 * If the item is "hot" and it is not registered inside the ready
877 * list, push it inside. 1020 * list, push it inside.
878 */ 1021 */
879 if (revents & event->events) { 1022 if (revents & event->events) {
1023 spin_lock_irq(&ep->lock);
880 if (!ep_is_linked(&epi->rdllink)) { 1024 if (!ep_is_linked(&epi->rdllink)) {
881 list_add_tail(&epi->rdllink, &ep->rdllist); 1025 list_add_tail(&epi->rdllink, &ep->rdllist);
882 1026
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
886 if (waitqueue_active(&ep->poll_wait)) 1030 if (waitqueue_active(&ep->poll_wait))
887 pwake++; 1031 pwake++;
888 } 1032 }
1033 spin_unlock_irq(&ep->lock);
889 } 1034 }
890 spin_unlock_irqrestore(&ep->lock, flags);
891 1035
892 /* We have to call this outside the lock */ 1036 /* We have to call this outside the lock */
893 if (pwake) 1037 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1038 ep_poll_safewake(&ep->poll_wait);
895 1039
896 return 0; 1040 return 0;
897} 1041}
898 1042
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1043static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
900 int maxevents) 1044 void *priv)
901{ 1045{
902 int eventcnt, error = -EFAULT, pwake = 0; 1046 struct ep_send_events_data *esed = priv;
1047 int eventcnt;
903 unsigned int revents; 1048 unsigned int revents;
904 unsigned long flags; 1049 struct epitem *epi;
905 struct epitem *epi, *nepi; 1050 struct epoll_event __user *uevent;
906 struct list_head txlist;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1051
929 /* 1052 /*
930 * We can loop without lock because this is a task private list. 1053 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1054 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1055 * holding "mtx" during this call.
933 */ 1056 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1057 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1058 !list_empty(head) && eventcnt < esed->maxevents;) {
1059 epi = list_first_entry(head, struct epitem, rdllink);
936 1060
937 list_del_init(&epi->rdllink); 1061 list_del_init(&epi->rdllink);
938 1062
939 /* 1063 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1064 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee
942 * that both the file and the item will not vanish.
943 */
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
945 revents &= epi->event.events;
946 1065
947 /* 1066 /*
948 * Is the event mask intersect the caller-requested one, 1067 * If the event mask intersects the caller-requested one,
949 * deliver the event to userspace. Again, we are holding 1068 * deliver the event to userspace. Again, ep_scan_ready_list()
950 * "mtx", so no operations coming from userspace can change 1069 * is holding "mtx", so no operations coming from userspace
951 * the item. 1070 * can change the item.
952 */ 1071 */
953 if (revents) { 1072 if (revents) {
954 if (__put_user(revents, 1073 if (__put_user(revents, &uevent->events) ||
955 &events[eventcnt].events) || 1074 __put_user(epi->event.data, &uevent->data)) {
956 __put_user(epi->event.data, 1075 list_add(&epi->rdllink, head);
957 &events[eventcnt].data)) 1076 return eventcnt ? eventcnt : -EFAULT;
958 goto errxit; 1077 }
1078 eventcnt++;
1079 uevent++;
959 if (epi->event.events & EPOLLONESHOT) 1080 if (epi->event.events & EPOLLONESHOT)
960 epi->event.events &= EP_PRIVATE_BITS; 1081 epi->event.events &= EP_PRIVATE_BITS;
961 eventcnt++; 1082 else if (!(epi->event.events & EPOLLET)) {
1083 /*
1084 * If this file has been added with Level
1085 * Trigger mode, we need to insert back inside
1086 * the ready list, so that the next call to
1087 * epoll_wait() will check again the events
1088 * availability. At this point, no one can insert
1089 * into ep->rdllist besides us. The epoll_ctl()
1090 * callers are locked out by
1091 * ep_scan_ready_list() holding "mtx" and the
1092 * poll callback will queue them in ep->ovflist.
1093 */
1094 list_add_tail(&epi->rdllink, &ep->rdllist);
1095 }
962 } 1096 }
963 /*
964 * At this point, noone can insert into ep->rdllist besides
965 * us. The epoll_ctl() callers are locked out by us holding
966 * "mtx" and the poll callback will queue them in ep->ovflist.
967 */
968 if (!(epi->event.events & EPOLLET) &&
969 (revents & epi->event.events))
970 list_add_tail(&epi->rdllink, &ep->rdllist);
971 }
972 error = 0;
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 } 1097 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998 1098
999 /* 1099 return eventcnt;
1000 * In case of error in the event-send loop, or in case the number of 1100}
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1101
1018 mutex_unlock(&ep->mtx); 1102static int ep_send_events(struct eventpoll *ep,
1103 struct epoll_event __user *events, int maxevents)
1104{
1105 struct ep_send_events_data esed;
1019 1106
1020 /* We have to call this outside the lock */ 1107 esed.maxevents = maxevents;
1021 if (pwake) 1108 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1109
1024 return eventcnt == 0 ? error: eventcnt; 1110 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1111}
1026 1112
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1113static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
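
The list_add_tail() back onto ep->rdllist for non-EPOLLET items in ep_send_events_proc() is what implements level-triggered delivery: a still-ready fd is reported again by the next epoll_wait(). The userspace-visible difference, sketched (plain C, error checking omitted):

    #include <stdio.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int pfd[2];
            pipe(pfd);
            write(pfd[1], "x", 1);          /* one unread byte */

            int epfd = epoll_create(1);
            struct epoll_event ev = { .events = EPOLLIN };  /* level-triggered */
            epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);

            struct epoll_event out;
            /* Level-triggered: the unread byte keeps the fd on the ready
             * list, so both waits report it. With EPOLLIN | EPOLLET only
             * the first wait would. */
            printf("first wait:  %d\n", epoll_wait(epfd, &out, 1, 0));
            printf("second wait: %d\n", epoll_wait(epfd, &out, 1, 0));
            return 0;
    }
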
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1119 wait_queue_t wait;
1034 1120
1035 /* 1121 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1122 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1123 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that's why (t * HZ) / 1000. 1124 * that's why (t * HZ) / 1000.
1039 */ 1125 */
@@ -1076,9 +1162,8 @@ retry:
1076 1162
1077 set_current_state(TASK_RUNNING); 1163 set_current_state(TASK_RUNNING);
1078 } 1164 }
1079
1080 /* Is it worth trying to dig for events? */ 1165 /* Is it worth trying to dig for events? */
1081 eavail = !list_empty(&ep->rdllist); 1166 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1167
1083 spin_unlock_irqrestore(&ep->lock, flags); 1168 spin_unlock_irqrestore(&ep->lock, flags);
1084 1169
@@ -1099,46 +1184,35 @@ retry:
1099 */ 1184 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1185SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1186{
1102 int error, fd = -1; 1187 int error;
1103 struct eventpoll *ep; 1188 struct eventpoll *ep = NULL;
1104 1189
1105 /* Check the EPOLL_* constant for consistency. */ 1190 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1191 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1192
1108 if (flags & ~EPOLL_CLOEXEC) 1193 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL; 1194 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags));
1113
1114 /* 1195 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1196 * Create the internal data structure ("struct eventpoll").
1116 */ 1197 */
1117 error = ep_alloc(&ep); 1198 error = ep_alloc(&ep);
1118 if (error < 0) { 1199 if (error < 0)
1119 fd = error; 1200 return error;
1120 goto error_return;
1121 }
1122
1123 /* 1201 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1202 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1203 * a file structure and a free file descriptor.
1126 */ 1204 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1205 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1206 flags & O_CLOEXEC);
1129 if (fd < 0) 1207 if (error < 0)
1130 ep_free(ep); 1208 ep_free(ep);
1131 1209
1132error_return: 1210 return error;
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd));
1135
1136 return fd;
1137} 1211}
1138 1212
1139SYSCALL_DEFINE1(epoll_create, int, size) 1213SYSCALL_DEFINE1(epoll_create, int, size)
1140{ 1214{
1141 if (size < 0) 1215 if (size <= 0)
1142 return -EINVAL; 1216 return -EINVAL;
1143 1217
1144 return sys_epoll_create1(0); 1218 return sys_epoll_create1(0);
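
Note the tightened epoll_create() check: the historically meaningless size hint must now be strictly positive rather than merely non-negative. A quick userspace confirmation (assuming a kernel with this change):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/epoll.h>

    int main(void)
    {
            /* size is only a hint, but zero is now rejected */
            if (epoll_create(0) == -1 && errno == EINVAL)
                    printf("epoll_create(0) -> EINVAL\n");

            printf("epoll_create(1) -> fd %d\n", epoll_create(1));
            return 0;
    }
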
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1158 struct epitem *epi; 1232 struct epitem *epi;
1159 struct epoll_event epds; 1233 struct epoll_event epds;
1160 1234
1161 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1162 current, epfd, op, fd, event));
1163
1164 error = -EFAULT; 1235 error = -EFAULT;
1165 if (ep_op_has_event(op) && 1236 if (ep_op_has_event(op) &&
1166 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1237 copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1211 case EPOLL_CTL_ADD: 1282 case EPOLL_CTL_ADD:
1212 if (!epi) { 1283 if (!epi) {
1213 epds.events |= POLLERR | POLLHUP; 1284 epds.events |= POLLERR | POLLHUP;
1214
1215 error = ep_insert(ep, &epds, tfile, fd); 1285 error = ep_insert(ep, &epds, tfile, fd);
1216 } else 1286 } else
1217 error = -EEXIST; 1287 error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
1237error_fput: 1307error_fput:
1238 fput(file); 1308 fput(file);
1239error_return: 1309error_return:
1240 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1241 current, epfd, op, fd, event, error));
1242 1310
1243 return error; 1311 return error;
1244} 1312}
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1254 struct file *file; 1322 struct file *file;
1255 struct eventpoll *ep; 1323 struct eventpoll *ep;
1256 1324
1257 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1258 current, epfd, events, maxevents, timeout));
1259
1260 /* The maximum number of events must be greater than zero */ 1325 /* The maximum number of events must be greater than zero */
1261 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1326 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1262 return -EINVAL; 1327 return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1293error_fput: 1358error_fput:
1294 fput(file); 1359 fput(file);
1295error_return: 1360error_return:
1296 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1297 current, epfd, events, maxevents, timeout, error));
1298 1361
1299 return error; 1362 return error;
1300} 1363}
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1422 EP_ITEM_COST;
1360 1423
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1424 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1425 ep_nested_calls_init(&poll_safewake_ncalls);
1426
1427 /* Initialize the structure used to perform file's f_op->poll() calls */
1428 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1429
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1430 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1431 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1366 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, 1432 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1367 NULL);
1368 1433
1369 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 1434 /* Allocates slab cache used to allocate "struct eppoll_entry" */
1370 pwq_cache = kmem_cache_create("eventpoll_pwq", 1435 pwq_cache = kmem_cache_create("eventpoll_pwq",
1371 sizeof(struct eppoll_entry), 0, 1436 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1372 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1373 1437
1374 return 0; 1438 return 0;
1375} 1439}
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..895823d0149d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h> 55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
56 57
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
@@ -68,17 +69,18 @@ int suid_dumpable = 0;
68static LIST_HEAD(formats); 69static LIST_HEAD(formats);
69static DEFINE_RWLOCK(binfmt_lock); 70static DEFINE_RWLOCK(binfmt_lock);
70 71
71int register_binfmt(struct linux_binfmt * fmt) 72int __register_binfmt(struct linux_binfmt * fmt, int insert)
72{ 73{
73 if (!fmt) 74 if (!fmt)
74 return -EINVAL; 75 return -EINVAL;
75 write_lock(&binfmt_lock); 76 write_lock(&binfmt_lock);
76 list_add(&fmt->lh, &formats); 77 insert ? list_add(&fmt->lh, &formats) :
78 list_add_tail(&fmt->lh, &formats);
77 write_unlock(&binfmt_lock); 79 write_unlock(&binfmt_lock);
78 return 0; 80 return 0;
79} 81}
80 82
81EXPORT_SYMBOL(register_binfmt); 83EXPORT_SYMBOL(__register_binfmt);
82 84
83void unregister_binfmt(struct linux_binfmt * fmt) 85void unregister_binfmt(struct linux_binfmt * fmt)
84{ 86{
@@ -103,40 +105,28 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
103SYSCALL_DEFINE1(uselib, const char __user *, library) 105SYSCALL_DEFINE1(uselib, const char __user *, library)
104{ 106{
105 struct file *file; 107 struct file *file;
106 struct nameidata nd;
107 char *tmp = getname(library); 108 char *tmp = getname(library);
108 int error = PTR_ERR(tmp); 109 int error = PTR_ERR(tmp);
109 110
110 if (!IS_ERR(tmp)) { 111 if (IS_ERR(tmp))
111 error = path_lookup_open(AT_FDCWD, tmp, 112 goto out;
112 LOOKUP_FOLLOW, &nd, 113
113 FMODE_READ|FMODE_EXEC); 114 file = do_filp_open(AT_FDCWD, tmp,
114 putname(tmp); 115 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
115 } 116 MAY_READ | MAY_EXEC | MAY_OPEN);
116 if (error) 117 putname(tmp);
118 error = PTR_ERR(file);
119 if (IS_ERR(file))
117 goto out; 120 goto out;
118 121
119 error = -EINVAL; 122 error = -EINVAL;
120 if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) 123 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
121 goto exit; 124 goto exit;
122 125
123 error = -EACCES; 126 error = -EACCES;
124 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 127 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
125 goto exit; 128 goto exit;
126 129
127 error = inode_permission(nd.path.dentry->d_inode,
128 MAY_READ | MAY_EXEC | MAY_OPEN);
129 if (error)
130 goto exit;
131 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error)
133 goto exit;
134
135 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
136 error = PTR_ERR(file);
137 if (IS_ERR(file))
138 goto out;
139
140 fsnotify_open(file->f_path.dentry); 130 fsnotify_open(file->f_path.dentry);
141 131
142 error = -ENOEXEC; 132 error = -ENOEXEC;
@@ -158,13 +148,10 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
158 } 148 }
159 read_unlock(&binfmt_lock); 149 read_unlock(&binfmt_lock);
160 } 150 }
151exit:
161 fput(file); 152 fput(file);
162out: 153out:
163 return error; 154 return error;
164exit:
165 release_open_intent(&nd);
166 path_put(&nd.path);
167 goto out;
168} 155}
169 156
170#ifdef CONFIG_MMU 157#ifdef CONFIG_MMU
@@ -659,47 +646,33 @@ EXPORT_SYMBOL(setup_arg_pages);
659 646
660struct file *open_exec(const char *name) 647struct file *open_exec(const char *name)
661{ 648{
662 struct nameidata nd;
663 struct file *file; 649 struct file *file;
664 int err; 650 int err;
665 651
666 err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, 652 file = do_filp_open(AT_FDCWD, name,
667 FMODE_READ|FMODE_EXEC); 653 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
668 if (err) 654 MAY_EXEC | MAY_OPEN);
655 if (IS_ERR(file))
669 goto out; 656 goto out;
670 657
671 err = -EACCES; 658 err = -EACCES;
672 if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) 659 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
673 goto out_path_put; 660 goto exit;
674
675 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
676 goto out_path_put;
677
678 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
679 if (err)
680 goto out_path_put;
681 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
682 if (err)
683 goto out_path_put;
684 661
685 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 662 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
686 if (IS_ERR(file)) 663 goto exit;
687 return file;
688 664
689 fsnotify_open(file->f_path.dentry); 665 fsnotify_open(file->f_path.dentry);
690 666
691 err = deny_write_access(file); 667 err = deny_write_access(file);
692 if (err) { 668 if (err)
693 fput(file); 669 goto exit;
694 goto out;
695 }
696 670
671out:
697 return file; 672 return file;
698 673
699 out_path_put: 674exit:
700 release_open_intent(&nd); 675 fput(file);
701 path_put(&nd.path);
702 out:
703 return ERR_PTR(err); 676 return ERR_PTR(err);
704} 677}
705EXPORT_SYMBOL(open_exec); 678EXPORT_SYMBOL(open_exec);
@@ -1056,28 +1029,35 @@ EXPORT_SYMBOL(install_exec_creds);
1056 * - the caller must hold current->cred_exec_mutex to protect against 1029 * - the caller must hold current->cred_exec_mutex to protect against
1057 * PTRACE_ATTACH 1030 * PTRACE_ATTACH
1058 */ 1031 */
1059void check_unsafe_exec(struct linux_binprm *bprm) 1032int check_unsafe_exec(struct linux_binprm *bprm)
1060{ 1033{
1061 struct task_struct *p = current, *t; 1034 struct task_struct *p = current, *t;
1062 unsigned long flags; 1035 unsigned n_fs;
1063 unsigned n_fs, n_sighand; 1036 int res = 0;
1064 1037
1065 bprm->unsafe = tracehook_unsafe_exec(p); 1038 bprm->unsafe = tracehook_unsafe_exec(p);
1066 1039
1067 n_fs = 1; 1040 n_fs = 1;
1068 n_sighand = 1; 1041 write_lock(&p->fs->lock);
1069 lock_task_sighand(p, &flags); 1042 rcu_read_lock();
1070 for (t = next_thread(p); t != p; t = next_thread(t)) { 1043 for (t = next_thread(p); t != p; t = next_thread(t)) {
1071 if (t->fs == p->fs) 1044 if (t->fs == p->fs)
1072 n_fs++; 1045 n_fs++;
1073 n_sighand++;
1074 } 1046 }
1047 rcu_read_unlock();
1075 1048
1076 if (atomic_read(&p->fs->count) > n_fs || 1049 if (p->fs->users > n_fs) {
1077 atomic_read(&p->sighand->count) > n_sighand)
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1050 bprm->unsafe |= LSM_UNSAFE_SHARE;
1051 } else {
1052 res = -EAGAIN;
1053 if (!p->fs->in_exec) {
1054 p->fs->in_exec = 1;
1055 res = 1;
1056 }
1057 }
1058 write_unlock(&p->fs->lock);
1079 1059
1080 unlock_task_sighand(p, &flags); 1060 return res;
1081} 1061}
1082 1062
1083/* 1063/*
@@ -1276,6 +1256,7 @@ int do_execve(char * filename,
1276 struct linux_binprm *bprm; 1256 struct linux_binprm *bprm;
1277 struct file *file; 1257 struct file *file;
1278 struct files_struct *displaced; 1258 struct files_struct *displaced;
1259 bool clear_in_exec;
1279 int retval; 1260 int retval;
1280 1261
1281 retval = unshare_files(&displaced); 1262 retval = unshare_files(&displaced);
@@ -1296,12 +1277,16 @@ int do_execve(char * filename,
1296 bprm->cred = prepare_exec_creds(); 1277 bprm->cred = prepare_exec_creds();
1297 if (!bprm->cred) 1278 if (!bprm->cred)
1298 goto out_unlock; 1279 goto out_unlock;
1299 check_unsafe_exec(bprm); 1280
1281 retval = check_unsafe_exec(bprm);
1282 if (retval < 0)
1283 goto out_unlock;
1284 clear_in_exec = retval;
1300 1285
1301 file = open_exec(filename); 1286 file = open_exec(filename);
1302 retval = PTR_ERR(file); 1287 retval = PTR_ERR(file);
1303 if (IS_ERR(file)) 1288 if (IS_ERR(file))
1304 goto out_unlock; 1289 goto out_unmark;
1305 1290
1306 sched_exec(); 1291 sched_exec();
1307 1292
@@ -1344,6 +1329,7 @@ int do_execve(char * filename,
1344 goto out; 1329 goto out;
1345 1330
1346 /* execve succeeded */ 1331 /* execve succeeded */
1332 current->fs->in_exec = 0;
1347 current->in_execve = 0; 1333 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1334 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1335 acct_update_integrals(current);
@@ -1362,6 +1348,10 @@ out_file:
1362 fput(bprm->file); 1348 fput(bprm->file);
1363 } 1349 }
1364 1350
1351out_unmark:
1352 if (clear_in_exec)
1353 current->fs->in_exec = 0;
1354
1365out_unlock: 1355out_unlock:
1366 current->in_execve = 0; 1356 current->in_execve = 0;
1367 mutex_unlock(&current->cred_exec_mutex); 1357 mutex_unlock(&current->cred_exec_mutex);
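
The do_execve() hunks above depend on a small contract from the reworked
check_unsafe_exec(): a negative return aborts the exec (-EAGAIN when another
thread already holds fs->in_exec), while a return of 1 means this caller set
the mark and owns clearing it on failure. A condensed sketch of the caller
side, with names taken from the patch and unrelated steps elided:

	bool clear_in_exec;
	int retval;

	retval = check_unsafe_exec(bprm);
	if (retval < 0)			/* -EAGAIN: an exec is already in flight */
		goto out_unlock;
	clear_in_exec = retval;		/* 1 => we set fs->in_exec */

	/* ... open_exec(), argument copy, search_binary_handler() ... */

	current->fs->in_exec = 0;	/* success: drop the mark */
	return retval;

out_unmark:
	if (clear_in_exec)
		current->fs->in_exec = 0;	/* failure: undo only our own mark */
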
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2 were written, but writing the inode attributes failed. If the filesystem is
3 then unmounted and remounted, the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - Gets included from the kernel's Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5 EXOFS is a file system that uses an OSD storage device
6 as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
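
For reference, an illustrative .config fragment (not from the patch) that
would build the new filesystem as a module with its debug prints enabled;
SCSI_OSD_ULD is the OSD initiator library the Kconfig above depends on:

	CONFIG_SCSI_OSD_ULD=m
	CONFIG_EXOFS_FS=m
	CONFIG_EXOFS_DEBUG=y
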
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* umm...file type */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
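
A stand-alone sketch (names local to the sketch, mirroring the definitions
above) of the two arithmetic rules this header fixes: object ID = inode# +
EXOFS_OBJ_OFF, and record lengths rounded up to 4 bytes. With the 12-byte
fixed header of struct exofs_dir_entry (8 + 2 + 1 + 1 bytes before name),
EXOFS_DIR_REC_LEN(1) works out to 16.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct dir_entry_hdr {			/* mirrors the on-disk layout above */
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

#define OBJ_OFF		0x10000
#define DIR_PAD		4
#define DIR_ROUND	(DIR_PAD - 1)
#define DIR_REC_LEN(nl) \
	(((nl) + offsetof(struct dir_entry_hdr, name) + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* inode 2 maps to object 0x10002, i.e. EXOFS_ROOT_ID */
	printf("object ID for inode 2: 0x%lx\n", (unsigned long)(2 + OBJ_OFF));
	printf("rec_len for 1-char name: %zu\n", DIR_REC_LEN(1));	/* 16 */
	printf("rec_len for 6-char name: %zu\n", DIR_REC_LEN(6));	/* 20 */
	return 0;
}
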
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
 49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156 "entry in directory #%lu spans the page boundary"
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
 425 EXOFS_ERR("exofs_set_link: exofs_write_begin failed => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477 "zero-length directory entry");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
544 while (de < dir) {
545 if (de->rec_len == 0) {
 546 EXOFS_ERR("ERROR: exofs_delete_entry: "
547 "zero-length directory entry");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
 561 EXOFS_ERR("exofs_delete_entry: exofs_write_begin failed => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639 EXOFS_ERR("ERROR: exofs_empty_dir: "
640 "zero-length directory entry"
641 "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
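
exofs_delete_entry() above never shifts directory data: it zeroes the
victim's inode_no and, when a previous entry exists in the same chunk, grows
that entry's rec_len to swallow the freed record. A stand-alone sketch of
that merge rule on a fake in-memory chunk (simplified layout, no page or OSD
machinery):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct dent {				/* 16 bytes with the 4-byte name */
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[4];
};

int main(void)
{
	unsigned char chunk[64] = { 0 };
	struct dent *a = (struct dent *)chunk;
	struct dent *b = (struct dent *)(chunk + 16);

	*a = (struct dent){ .inode_no = 11, .rec_len = 16, .name_len = 1 };
	memcpy(a->name, "x", 1);
	*b = (struct dent){ .inode_no = 12, .rec_len = 48, .name_len = 1 };
	memcpy(b->name, "y", 1);

	/* "delete" b: the previous entry absorbs the freed record */
	b->inode_no = 0;
	a->rec_len += b->rec_len;

	printf("a now spans %u bytes of the 64-byte chunk\n",
	       (unsigned)a->rec_len);
	return 0;
}
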
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
 53/* u64 has problems with printk; this will cast it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED 0 /* object will be created soon*/
88#define OBJ_CREATED 1 /* object has been created on the osd*/
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
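
exofs_i() above is the standard embedded-inode trick: the VFS only ever sees
the struct inode member, and container_of() recovers the surrounding
exofs_i_info by subtracting the member's offset. A stand-alone illustration
(simplified container_of, without the type checking the kernel macro adds):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };
struct fs_inode { int fs_private; struct vfs_inode vfs; };

int main(void)
{
	struct fs_inode fi = { .fs_private = 42, .vfs = { .i_ino = 7 } };
	struct vfs_inode *seen_by_vfs = &fi.vfs;

	/* recover the embedding structure from the embedded member */
	struct fs_inode *back = container_of(seen_by_vfs, struct fs_inode, vfs);
	printf("private=%d ino=%lu\n", back->fs_private, back->vfs.i_ino);
	return 0;
}
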
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
 55 /* Note: file_fsync below also calls sync_blockdev, which is a no-op
 56 * for exofs, but other than that it does sync_inode and
 57 * sync_superblock, which is what we need here.
58 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
 55 loff_t pg_first; /* keep 64 bits, even on 32-bit arches */
56};
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
 89 /* this is probably the end of the loop, but in writes
 90 * it might not end here. Don't be left with nothing.
 91 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
 106 EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
110
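
pcol_try_alloc() above degrades gracefully: if a bio with room for all
expected pages cannot be allocated, it halves the request until even a
single-page bio fails. A stand-alone sketch of the same back-off pattern;
try_alloc() is a hypothetical stand-in for bio_alloc():

#include <stdio.h>
#include <stdlib.h>

/* stand-in allocator: pretend anything over 64 pages fails */
static void *try_alloc(int pages)
{
	return pages > 64 ? NULL : malloc((size_t)pages * 8);
}

static void *alloc_backoff(int pages)
{
	for (; pages; pages >>= 1) {	/* halve on failure, as above */
		void *p = try_alloc(pages);
		if (p) {
			printf("got room for %d pages\n", pages);
			return p;
		}
	}
	return NULL;			/* caller turns this into -ENOMEM */
}

int main(void)
{
	free(alloc_backoff(256));	/* succeeds once halved to 64 */
	return 0;
}
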
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
 141 * locked) */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* pages ownership was passed to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
 307 * collect as many contiguous pages as possible. If a discontinuity is
 308 * encountered, or it runs out of resources, it will submit the previous segment
 309 * and will start a new collection. Eventually the caller must submit the last
 310 * segment if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* this will be out of bounds, or doesn't exist yet.
336 * Current page is cleared and the request is split
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
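
readpage_strip() above builds runs by one rule: a page joins the current
collection only if its index is exactly pg_first + nr_pages; anything else
submits the collection and starts a new one. A stand-alone sketch of the
rule; flush() is a hypothetical stand-in for read_exec():

#include <stdio.h>

static long pg_first = -1;
static unsigned nr_pages;

static void flush(void)
{
	if (nr_pages)
		printf("submit [%ld..%ld]\n", pg_first,
		       pg_first + nr_pages - 1);
	pg_first = -1;
	nr_pages = 0;
}

static void add_page(long index)
{
	if (pg_first == -1)
		pg_first = index;
	else if (pg_first + nr_pages != index) {	/* discontinuity */
		flush();
		pg_first = index;
	}
	nr_pages++;
}

int main(void)
{
	long idx[] = { 3, 4, 5, 9, 10, 20 };

	for (unsigned i = 0; i < 6; i++)
		add_page(idx[i]);
	flush();	/* the caller must submit the last segment */
	return 0;
}
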
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
 422 /* readpage_strip might call read_exec(, async) at several places inside,
 423 * but this is safe for the sync case, since read_exec will not do
 424 * anything when we have a single page.
425 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
 443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
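
/* Worked example of the good_bytes accounting above, assuming a
 * four-page, 0x4000-byte collection (illustrative values):
 *
 *	ret == 0                    => good_bytes = 0x4000 (all clean)
 *	ret != 0 && resid == 0      => good_bytes = 0      (nothing written)
 *	ret != 0 && resid == 0x1000 => good_bytes = 0x3000 (3 pages ok)
 *
 * Pages whose running offset stays below good_bytes get status 0; the
 * rest inherit the error code in ret.
 */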
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* pages ownership was passed to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually the caller must submit the last segment, if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623 EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
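
/* Worked example of the expected_pages estimate above, assuming 4K
 * pages (illustrative values): range_start == 0 and range_end == 0x7fff
 * give start == 0, end == 7, so expected_pages = min(8, 32) = 8. For
 * range_end == LLONG_MAX the estimate is derived from mapping->nrpages
 * instead, still capped at 32 preallocated pages.
 */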
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702 EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709 /* read-modify-write: a partial write into a not-uptodate page needs the old contents first */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713 /*SetPageError was done by _readpage. Is it ok?*/
714 unlock_page(page);
715 EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case of create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
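
/* A sketch of the object-creation handshake this helper relies on
 * (illustrative summary; the flag helpers are defined elsewhere,
 * presumably in exofs.h):
 *
 *	exofs_new_inode()  -> set_obj_2bcreated(oi), async CREATE queued
 *	create_done()      -> set_obj_created(oi); wake_up(&oi->i_wq)
 *	everyone else      -> wait_event(oi->i_wq, obj_created(oi))
 *
 * so no I/O path touches an object the OSD has not acknowledged yet.
 */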
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035 _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
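
/* For illustration: every exofs inode maps to the OSD object
 * {sbi->s_pid, i_ino + EXOFS_OBJ_OFF}, as set up above. If
 * EXOFS_OBJ_OFF were 0x10000 (illustrative value), inode 5 would live
 * in object id 0x10005 of the mounted partition.
 */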
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292 EXOFS_ERR(
1293 "ERROR: @exofs_delete_inode exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
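
/* exofs_add_nondir() follows the usual ext2-style "link or undo"
 * pattern: if exofs_add_link() fails, the new inode's link count is
 * dropped and its last reference released, so the create/mknod/symlink
 * callers below only need to propagate err.
 */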
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41 else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51 /* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
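
/* Summary of the sense-to-errno mapping implemented above:
 *
 *	invalid field in CDB, starting byte  => -EFAULT
 *	invalid field in CDB, object id      => -ENOENT
 *	invalid field in CDB, anything else  => -EINVAL
 *	quota error                          => -ENOSPC
 *	any other failed check condition     => -EIO
 */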
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
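
/* Minimal usage sketch for the helpers above (hypothetical caller,
 * error handling trimmed) -- the same pattern exofs_get_inode() and
 * exofs_statfs() use:
 *
 *	struct osd_attr attr = g_attr_logical_length;
 *	struct osd_request *or = osd_start_request(sbi->s_dev, GFP_KERNEL);
 *
 *	osd_req_get_attributes(or, &obj);
 *	osd_req_add_get_attr_list(or, &attr, 1);
 *	exofs_sync_op(or, sbi->s_timeout, cred);
 *	if (!extract_attr_from_req(or, &attr))
 *		use(attr.val_ptr, attr.len);	// hypothetical consumer
 *	osd_end_request(or);
 */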
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
64 * kernel's parsing functions do not currently support that. 32-bit should be
65 * sufficient for most applications now.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103 EXOFS_ERR("Partition ID must be >= %u",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107 s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113 EXOFS_ERR("Timeout must be > 0");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
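
/* Examples of option strings accepted above (illustrative values):
 *
 *	"pid=0x10001,to=30" => opts->pid == 0x10001 (simple_strtoull,
 *	                       base 0, so hex works), timeout == 30 * HZ
 *	"pid=65537"         => decimal works too; default timeout kept
 *	"to=30"             => rejected: pid= is mandatory
 */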
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283 struct exofs_sb_info *sbi; /*extended info */
284 struct exofs_fscb fscb; /*on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}
416
417/*
418 * Return information about the file system state in the buffer. This is used
419 * by the 'df' command, for example.
420 */
421static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
422{
423 struct super_block *sb = dentry->d_sb;
424 struct exofs_sb_info *sbi = sb->s_fs_info;
425 struct osd_obj_id obj = {sbi->s_pid, 0};
426 struct osd_attr attrs[] = {
427 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
428 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
429 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
430 OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
431 };
432 uint64_t capacity = ULLONG_MAX;
433 uint64_t used = ULLONG_MAX;
434 struct osd_request *or;
435 uint8_t cred_a[OSD_CAP_LEN];
436 int ret;
437
438 /* get used/capacity attributes */
439 exofs_make_credential(cred_a, &obj);
440
441 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
442 if (unlikely(!or)) {
443 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
444 return -ENOMEM;
445 }
446
447 osd_req_get_attributes(or, &obj);
448 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
449 ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
450 if (unlikely(ret))
451 goto out;
452
453 ret = extract_attr_from_req(or, &attrs[0]);
454 if (likely(!ret))
455 capacity = get_unaligned_be64(attrs[0].val_ptr);
456 else
457 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
458
459 ret = extract_attr_from_req(or, &attrs[1]);
460 if (likely(!ret))
461 used = get_unaligned_be64(attrs[1].val_ptr);
462 else
463 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
464
465 /* fill in the stats buffer */
466 buf->f_type = EXOFS_SUPER_MAGIC;
467 buf->f_bsize = EXOFS_BLKSIZE;
468 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
469 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
470 buf->f_bavail = buf->f_bfree;
471 buf->f_files = sbi->s_numfiles;
472 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
473 buf->f_namelen = EXOFS_NAME_LEN;
474
475out:
476 osd_end_request(or);
477 return ret;
478}
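
/* Worked example of the conversion above, assuming 4K blocks
 * (EXOFS_BLKSHIFT == 12, illustrative): capacity == 4GiB and
 * used == 1GiB yield f_blocks == 0x100000 and
 * f_bfree == f_bavail == 0xc0000 blocks.
 */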
479
480static const struct super_operations exofs_sops = {
481 .alloc_inode = exofs_alloc_inode,
482 .destroy_inode = exofs_destroy_inode,
483 .write_inode = exofs_write_inode,
484 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super,
486 .write_super = exofs_write_super,
487 .statfs = exofs_statfs,
488};
489
490/******************************************************************************
491 * EXPORT OPERATIONS
492 *****************************************************************************/
493
494struct dentry *exofs_get_parent(struct dentry *child)
495{
496 unsigned long ino = exofs_parent_ino(child);
497
498 if (!ino)
499 return NULL;
500
501 return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
502}
503
504static struct inode *exofs_nfs_get_inode(struct super_block *sb,
505 u64 ino, u32 generation)
506{
507 struct inode *inode;
508
509 inode = exofs_iget(sb, ino);
510 if (IS_ERR(inode))
511 return ERR_CAST(inode);
512 if (generation && inode->i_generation != generation) {
513 /* we didn't find the right inode.. */
514 iput(inode);
515 return ERR_PTR(-ESTALE);
516 }
517 return inode;
518}
519
520static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
521 struct fid *fid, int fh_len, int fh_type)
522{
523 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
524 exofs_nfs_get_inode);
525}
526
527static struct dentry *exofs_fh_to_parent(struct super_block *sb,
528 struct fid *fid, int fh_len, int fh_type)
529{
530 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
531 exofs_nfs_get_inode);
532}
533
534static const struct export_operations exofs_export_ops = {
535 .fh_to_dentry = exofs_fh_to_dentry,
536 .fh_to_parent = exofs_fh_to_parent,
537 .get_parent = exofs_get_parent,
538};
539
540/******************************************************************************
541 * INSMOD/RMMOD
542 *****************************************************************************/
543
544/*
545 * struct that describes this file system
546 */
547static struct file_system_type exofs_type = {
548 .owner = THIS_MODULE,
549 .name = "exofs",
550 .get_sb = exofs_get_sb,
551 .kill_sb = generic_shutdown_super,
552};
553
554static int __init init_exofs(void)
555{
556 int err;
557
558 err = init_inodecache();
559 if (err)
560 goto out;
561
562 err = register_filesystem(&exofs_type);
563 if (err)
564 goto out_d;
565
566 return 0;
567out_d:
568 destroy_inodecache();
569out:
570 return err;
571}
572
573static void __exit exit_exofs(void)
574{
575 unregister_filesystem(&exofs_type);
576 destroy_inodecache();
577}
578
579MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
580MODULE_DESCRIPTION("exofs");
581MODULE_LICENSE("GPL");
582
583module_init(init_exofs)
584module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/namei.h>
37
38#include "exofs.h"
39
40static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
41{
42 struct exofs_i_info *oi = exofs_i(dentry->d_inode);
43
44 nd_set_link(nd, (char *)oi->i_data);
45 return NULL;
46}
47
48const struct inode_operations exofs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52};
53
54const struct inode_operations exofs_fast_symlink_inode_operations = {
55 .readlink = generic_readlink,
56 .follow_link = exofs_follow_link,
57};
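
/* The two symlink flavors wired up above, for reference:
 * exofs_symlink() (namei.c) stores targets longer than
 * sizeof(oi->i_data) through the page cache ("slow" symlinks, resolved
 * by page_follow_link_light), and shorter targets directly in
 * oi->i_data ("fast" symlinks, resolved in-core by exofs_follow_link).
 */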
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
590 590
591 if (depth == 0) 591 if (depth == 0)
592 return (err); 592 return (err);
593reread:
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
595 593
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
596 /* Simplest case - block found, no allocation needed */ 595 /* Simplest case - block found, no allocation needed */
597 if (!partial) { 596 if (!partial) {
598 first_block = le32_to_cpu(chain[depth - 1].key); 597 first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
602 while (count < maxblocks && count <= blocks_to_boundary) { 601 while (count < maxblocks && count <= blocks_to_boundary) {
603 ext2_fsblk_t blk; 602 ext2_fsblk_t blk;
604 603
605 if (!verify_chain(chain, partial)) { 604 if (!verify_chain(chain, chain + depth - 1)) {
606 /* 605 /*
607 * Indirect block might be removed by 606 * Indirect block might be removed by
608 * truncate while we were reading it. 607 * truncate while we were reading it.
609 * Handling of that case: forget what we've 608 * Handling of that case: forget what we've
610 * got now, go to reread. 609 * got now, go to reread.
611 */ 610 */
611 err = -EAGAIN;
612 count = 0; 612 count = 0;
613 goto changed; 613 break;
614 } 614 }
615 blk = le32_to_cpu(*(chain[depth-1].p + count)); 615 blk = le32_to_cpu(*(chain[depth-1].p + count));
616 if (blk == first_block + count) 616 if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
 			else
 				break;
 		}
-		goto got_it;
+		if (err != -EAGAIN)
+			goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
 		goto cleanup;
 
 	mutex_lock(&ei->truncate_mutex);
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
 
 	/*
 	 * Okay, we need to do block allocation.  Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
 		partial--;
 	}
 	return err;
-changed:
-	while (partial > chain) {
-		brelse(partial->bh);
-		partial--;
-	}
-	goto reread;
 }
 
 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
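
Taken together, the ext2_get_blocks() hunks above replace a goto-driven reread loop with a flag-and-revalidate scheme. Condensed, and hedged — error paths and the allocation step are elided, so this is a reading aid rather than a drop-in function body — the new control flow is:

	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
	if (!partial && err != -EAGAIN)
		goto got_it;			/* lockless fast path */

	mutex_lock(&ei->truncate_mutex);
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		/* drop the stale partial chain and re-read it; truncate
		 * and other get_block callers are now excluded */
		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
		if (!partial)
			goto got_it;		/* a racing writer mapped it */
	}
	/* ...allocate the missing blocks while holding truncate_mutex... */

The design point: instead of retrying an unlocked read indefinitely, one failed optimistic pass escalates to a single re-read under the mutex, where the chain can no longer change.
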
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f983225266dc..5c4afe652245 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
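
The ext2_quota_write() fix above closes a lock leak: the early "nothing was written" return path left i_mutex held. The bug class, reduced to its essentials (a hypothetical demo function, not kernel code):

ssize_t demo_write(struct inode *inode, size_t len, size_t towrite, int err)
{
	mutex_lock(&inode->i_mutex);
	/* ... attempt the copy; towrite counts what is still unwritten ... */
	if (len == towrite) {			/* nothing was written */
		mutex_unlock(&inode->i_mutex);	/* this unlock was missing */
		return err;
	}
	/* ... update i_size and i_version ... */
	mutex_unlock(&inode->i_mutex);
	return len;
}
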
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
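
The new EXT3_STATE_FLUSH_ON_CLOSE handling in ext3_release_file() pairs with the ext3_truncate() hunk further down this diff, which sets the flag when a file is truncated to zero in data=writeback mode. Pulled out of the hunks for readability (set side first, consume side second; both lines appear verbatim in this patch):

	/* set side, in ext3_truncate(): */
	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;

	/* consume side, in ext3_release_file(): */
	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
		filemap_flush(inode->i_mapping);
		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
	}

This narrows the window in which a crash leaves a truncated-and-rewritten file empty, flushing the new contents when the last writer closes the file.
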
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff169870..fcfa24361856 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
 	return !buffer_mapped(bh);
 }
+
 /*
  * Note that we always start a transaction even if we're not journalling
  * data.  This is to preserve ordering: any hole instantiation within
@@ -1512,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page,
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
-		return block_write_full_page(page, NULL, wbc);
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
 	}
-	page_bufs = page_buffers(page);
-
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1572,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -2354,6 +2376,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
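
All three write_end variants in the ext3/inode.c hunks above converge on the same crash-safety idiom: if the copy came up short, blocks may have been instantiated beyond i_size, so the inode is put on the orphan list while the transaction is still open, and the excess is trimmed only after the journal handle is closed. The recurring skeleton, lifted straight from the hunks:

	if (pos + len > inode->i_size)
		ext3_orphan_add(handle, inode);	/* journalled before commit */
	ret = ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		vmtruncate(inode, inode->i_size);	/* trim blocks past EOF */

A crash between the two steps is then recoverable: orphan processing at the next mount finishes the truncate.
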
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
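
For reference, the conversion above changes the ioctl entry point's contract. The old ->ioctl hook ran under the Big Kernel Lock and received the inode explicitly; ->unlocked_ioctl does not, so the handler derives the inode itself and takes whatever locks it needs. The prototypes as used by these hunks:

/* before: called with the BKL held */
int ext3_ioctl(struct inode *inode, struct file *filp,
	       unsigned int cmd, unsigned long arg);

/* after: no BKL; the inode comes from the file */
long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);

This is why the SETFLAGS path above gains explicit mutex_lock(&inode->i_mutex) discipline with a single unlock at flags_out: the BKL no longer papers over the ordering.
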
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8b..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		ext3_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext3_orphan_add(handle, new_inode);
+		if (ext3_should_writeback_data(new_inode))
+			flush_file = 1;
 	}
 	retval = 0;
 
2406 2417
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
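
The ext3_lookup() hunk above deserves a note: ext3_iget() reports an on-disk inode with a zero link count as -ESTALE, which is a meaningful answer for NFS file-handle lookups, but when the inode was reached through a live directory entry it indicates directory corruption. The change therefore logs the event and converts the error; in essence:

	inode = ext3_iget(dir->i_sb, ino);
	if (IS_ERR(inode) && PTR_ERR(inode) == -ESTALE) {
		ext3_error(dir->i_sb, __func__,
			   "deleted inode referenced: %lu", ino);
		inode = ERR_PTR(-EIO);		/* corruption, not staleness */
	}
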
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899c..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-					ext4_group_t block_group)
+					ext4_group_t block_group,
+					struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
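
The free_blocks update above is representative of a wider conversion in this series: the per-flexgroup counters stop being __u32 fields guarded by the block-group spinlock and become atomic_t (see the struct flex_groups hunk in ext4.h further down), turning each read-modify-write into a single lock-free operation:

	/* before: three operations under a shared spinlock */
	spin_lock(sb_bgl_lock(sbi, flex_group));
	sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
	spin_unlock(sb_bgl_lock(sbi, flex_group));

	/* after: one atomic add, no lock traffic */
	atomic_add(blocks_freed, &sbi->s_flex_groups[flex_group].free_blocks);
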
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			  unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+							 sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
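
Every rec_len call site above gains a blocksize argument because rec_len is a 16-bit on-disk field: a single directory record spanning one full 64KiB block cannot be stored literally, so it is encoded as the EXT4_MAX_REC_LEN sentinel, and decoding therefore has to know the block size. The real helpers move out of line into namei.c (outside this diff); a plausible minimal decode, shown for orientation only, looks like:

static inline unsigned demo_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
	unsigned len = le16_to_cpu(dlen);

	if (len == EXT4_MAX_REC_LEN || len == 0)
		return blocksize;	/* sentinel: record fills the block */
	return len;
}
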
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057b..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
 #undef	EXT4FS_DEBUG
 
 /*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS	8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS		1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
  * Debug code
  */
 #ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR	1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	__u32 free_inodes;
-	__u32 free_blocks;
+	atomic_t free_inodes;
+	atomic_t free_blocks;
+	atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE	0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	else
+		return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -256,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
 #define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
 	__u8	s_reserved_char_pad2;
 	__le16  s_reserved_pad;
-	__u32	s_reserved[162];        /* Padding to the end of the block */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__u32	s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
 /*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
 					 ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN || len == 0)
-		return 1 << 16;
-	return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT4_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-	return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define EXT4_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_ui_proc_fops, &sbi->s_##var);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
@@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
 
 /* ioctl.c */
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
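
A usage sketch for the new ext4_mask_flags() helper introduced in the ext4.h hunks above. The call site below is hypothetical (the real users are the ioctl and inode-allocation paths, which are outside this diff): flags inherited from a parent directory are first narrowed to EXT4_FL_INHERITED, then filtered by the new inode's type so that, for example, DIRSYNC never lands on a regular file:

	/* dir_ei, new_ei and mode are illustrative names */
	__u32 candidate = dir_ei->i_flags & EXT4_FL_INHERITED;

	new_ei->i_flags = ext4_mask_flags(mode, candidate);
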
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock s_blockgroup_lock;
+	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
-
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f596..e3a55eb8b26a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,70 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 	return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t block = ext_pblock(ext), valid_block;
+	int len = ext4_ext_get_actual_len(ext);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     ((block + len) > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				 struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     (block >= ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+				     struct ext4_extent_header *eh,
+				     int depth)
+{
+	struct ext4_extent *ext;
+	struct ext4_extent_idx *ext_idx;
+	unsigned short entries;
+	if (eh->eh_entries == 0)
+		return 1;
+
+	entries = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		/* leaf entries */
+		ext = EXT_FIRST_EXTENT(eh);
+		while (entries) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+			ext++;
+			entries--;
+		}
+	} else {
+		ext_idx = EXT_FIRST_INDEX(eh);
+		while (entries) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+			ext_idx++;
+			entries--;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -329,11 +415,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
 	return 0;
 
 corrupted:
 	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
+			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +432,13 @@ corrupted:
 	return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)	\
-	__ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +642,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
547 642
548 eh = ext_inode_hdr(inode); 643 eh = ext_inode_hdr(inode);
549 depth = ext_depth(inode); 644 depth = ext_depth(inode);
550 if (ext4_ext_check_header(inode, eh, depth))
551 return ERR_PTR(-EIO);
552
553 645
554 /* account possible depth increase */ 646 /* account possible depth increase */
555 if (!path) { 647 if (!path) {
@@ -565,6 +657,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
565 i = depth; 657 i = depth;
566 /* walk through the tree */ 658 /* walk through the tree */
567 while (i) { 659 while (i) {
660 int need_to_validate = 0;
661
568 ext_debug("depth %d: num %d, max %d\n", 662 ext_debug("depth %d: num %d, max %d\n",
569 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 663 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
570 664
@@ -573,10 +667,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
573 path[ppos].p_depth = i; 667 path[ppos].p_depth = i;
574 path[ppos].p_ext = NULL; 668 path[ppos].p_ext = NULL;
575 669
576 bh = sb_bread(inode->i_sb, path[ppos].p_block); 670 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
577 if (!bh) 671 if (unlikely(!bh))
578 goto err; 672 goto err;
579 673 if (!bh_uptodate_or_lock(bh)) {
674 if (bh_submit_read(bh) < 0) {
675 put_bh(bh);
676 goto err;
677 }
678 /* validate the extent entries */
679 need_to_validate = 1;
680 }
580 eh = ext_block_hdr(bh); 681 eh = ext_block_hdr(bh);
581 ppos++; 682 ppos++;
582 BUG_ON(ppos > depth); 683 BUG_ON(ppos > depth);
@@ -584,7 +685,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
584 path[ppos].p_hdr = eh; 685 path[ppos].p_hdr = eh;
585 i--; 686 i--;
586 687
587 if (ext4_ext_check_header(inode, eh, i)) 688 if (need_to_validate && ext4_ext_check(inode, eh, i))
588 goto err; 689 goto err;
589 } 690 }
590 691
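
Replacing sb_bread() with sb_getblk() plus bh_uptodate_or_lock()/bh_submit_read() lets the walker see whether a buffer actually came from disk, so the relatively costly ext4_ext_check() runs only on freshly read blocks rather than on every descent through already-cached ones. The pattern, sketched with the standard buffer-head calls (sb and blocknr stand for the superblock and the target block):

	struct buffer_head *bh;
	int need_to_validate = 0;

	bh = sb_getblk(sb, blocknr);		/* get buffer, no I/O yet */
	if (unlikely(!bh))
		goto err;
	if (!bh_uptodate_or_lock(bh)) {		/* 0 => not uptodate, now locked */
		if (bh_submit_read(bh) < 0) {	/* issue the read and wait */
			put_bh(bh);
			goto err;
		}
		need_to_validate = 1;		/* fresh from disk: check it */
	}
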
@@ -1181,7 +1282,7 @@ got_index:
1181 return -EIO; 1282 return -EIO;
1182 eh = ext_block_hdr(bh); 1283 eh = ext_block_hdr(bh);
1183 /* subtract from p_depth to get proper eh_depth */ 1284 /* subtract from p_depth to get proper eh_depth */
1184 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1285 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1185 put_bh(bh); 1286 put_bh(bh);
1186 return -EIO; 1287 return -EIO;
1187 } 1288 }
@@ -1194,7 +1295,7 @@ got_index:
1194 if (bh == NULL) 1295 if (bh == NULL)
1195 return -EIO; 1296 return -EIO;
1196 eh = ext_block_hdr(bh); 1297 eh = ext_block_hdr(bh);
1197 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1298 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1198 put_bh(bh); 1299 put_bh(bh);
1199 return -EIO; 1300 return -EIO;
1200 } 1301 }
@@ -1740,11 +1841,13 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1740{ 1841{
1741 struct ext4_ext_cache *cex; 1842 struct ext4_ext_cache *cex;
1742 BUG_ON(len == 0); 1843 BUG_ON(len == 0);
1844 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1743 cex = &EXT4_I(inode)->i_cached_extent; 1845 cex = &EXT4_I(inode)->i_cached_extent;
1744 cex->ec_type = type; 1846 cex->ec_type = type;
1745 cex->ec_block = block; 1847 cex->ec_block = block;
1746 cex->ec_len = len; 1848 cex->ec_len = len;
1747 cex->ec_start = start; 1849 cex->ec_start = start;
1850 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1748} 1851}
1749 1852
1750/* 1853/*
@@ -1801,12 +1904,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1801 struct ext4_extent *ex) 1904 struct ext4_extent *ex)
1802{ 1905{
1803 struct ext4_ext_cache *cex; 1906 struct ext4_ext_cache *cex;
1907 int ret = EXT4_EXT_CACHE_NO;
1804 1908
1909 /*
1910 * We borrow i_block_reservation_lock to protect i_cached_extent
1911 */
1912 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1805 cex = &EXT4_I(inode)->i_cached_extent; 1913 cex = &EXT4_I(inode)->i_cached_extent;
1806 1914
1807 /* has cache valid data? */ 1915 /* has cache valid data? */
1808 if (cex->ec_type == EXT4_EXT_CACHE_NO) 1916 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1809 return EXT4_EXT_CACHE_NO; 1917 goto errout;
1810 1918
1811 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 1919 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1812 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 1920 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1817,11 +1925,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1817 ext_debug("%u cached by %u:%u:%llu\n", 1925 ext_debug("%u cached by %u:%u:%llu\n",
1818 block, 1926 block,
1819 cex->ec_block, cex->ec_len, cex->ec_start); 1927 cex->ec_block, cex->ec_len, cex->ec_start);
1820 return cex->ec_type; 1928 ret = cex->ec_type;
1821 } 1929 }
1822 1930errout:
1823 /* not in cache */ 1931 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1824 return EXT4_EXT_CACHE_NO; 1932 return ret;
1825} 1933}
1826 1934
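
i_cached_extent is a single slot shared by all users of the inode, so an unlocked reader could see a mix of two different cached extents. Rather than grow the inode with a new lock, the patch takes the existing i_block_reservation_lock around both the store and the lookup; in outline:

	struct ext4_ext_cache *cex;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	/* read or update all cex->ec_* fields while the lock is held;
	 * readers copy out what they need before unlocking */
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
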
1827/* 1935/*
@@ -2137,7 +2245,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2137 return -ENOMEM; 2245 return -ENOMEM;
2138 } 2246 }
2139 path[0].p_hdr = ext_inode_hdr(inode); 2247 path[0].p_hdr = ext_inode_hdr(inode);
2140 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { 2248 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2141 err = -EIO; 2249 err = -EIO;
2142 goto out; 2250 goto out;
2143 } 2251 }
@@ -2191,7 +2299,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2191 err = -EIO; 2299 err = -EIO;
2192 break; 2300 break;
2193 } 2301 }
2194 if (ext4_ext_check_header(inode, ext_block_hdr(bh), 2302 if (ext4_ext_check(inode, ext_block_hdr(bh),
2195 depth - i - 1)) { 2303 depth - i - 1)) {
2196 err = -EIO; 2304 err = -EIO;
2197 break; 2305 break;
@@ -2321,8 +2429,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2321 len = ee_len; 2429 len = ee_len;
2322 2430
2323 bio = bio_alloc(GFP_NOIO, len); 2431 bio = bio_alloc(GFP_NOIO, len);
2324 if (!bio)
2325 return -ENOMEM;
2326 bio->bi_sector = ee_pblock; 2432 bio->bi_sector = ee_pblock;
2327 bio->bi_bdev = inode->i_sb->s_bdev; 2433 bio->bi_bdev = inode->i_sb->s_bdev;
2328 2434
@@ -2776,6 +2882,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2776 if (allocated > max_blocks) 2882 if (allocated > max_blocks)
2777 allocated = max_blocks; 2883 allocated = max_blocks;
2778 set_buffer_unwritten(bh_result); 2884 set_buffer_unwritten(bh_result);
2885 bh_result->b_bdev = inode->i_sb->s_bdev;
2886 bh_result->b_blocknr = newblock;
2779 goto out2; 2887 goto out2;
2780 } 2888 }
2781 2889
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
33 */ 33 */
34static int ext4_release_file(struct inode *inode, struct file *filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
37 ext4_alloc_da_blocks(inode);
38 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1) &&
43 !EXT4_I(inode)->i_reserved_data_blocks)
39 { 44 {
40 down_write(&EXT4_I(inode)->i_data_sem); 45 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_preallocations(inode); 46 ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8f..f18e0a08a6b5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count, cleared; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 if (is_directory) { 267 if (is_directory) {
269 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
270 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
275 }
276
271 } 277 }
272 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
273 block_group, gdp); 279 block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
278 284
279 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
280 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
281 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
282 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
283 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 } 290 }
285 } 291 }
286 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
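
The conversion pattern throughout this hunk: each flex-group counter becomes an atomic_t, so the old critical section, which only ever touched one counter, collapses to a single atomic operation. Readers use atomic_read() and tolerate a slightly stale value. Side by side:

	/* before: plain counter under an external lock */
	spin_lock(sb_bgl_lock(sbi, flex_group));
	sbi->s_flex_groups[flex_group].free_inodes++;
	spin_unlock(sb_bgl_lock(sbi, flex_group));

	/* after: lockless read-modify-write on an atomic_t */
	atomic_inc(&sbi->s_flex_groups[flex_group].free_inodes);
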
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
360 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
361 367
362find_close_to_parent: 368find_close_to_parent:
363 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
364 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
365 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
366 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
367 goto found_flexbg; 373 goto found_flexbg;
368 374
@@ -375,24 +381,24 @@ find_close_to_parent:
375 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
376 continue; 382 continue;
377 383
378 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
379 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
380 386
381 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
382 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
383 best_flex = i; 389 best_flex = i;
384 goto found_flexbg; 390 goto found_flexbg;
385 } 391 }
386 392
387 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
388 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
389 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
390 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
391 best_flex = i; 397 best_flex = i;
392 } 398 }
393 399
394 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
395 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
396 return -1; 402 return -1;
397 403
398found_flexbg: 404found_flexbg:
@@ -410,6 +416,42 @@ out:
410 return 0; 416 return 0;
411} 417}
412 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
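
A typical caller (as in the loops below) treats g as a block-group number when flex_size is 1 and as a flex-group number otherwise, then filters candidates on the returned counts; roughly:

	struct orlov_stats stats;
	ext4_group_t i, g;

	for (i = 0; i < ngroups; i++) {
		g = (parent_group + i) % ngroups;
		get_orlov_stats(sb, g, flex_size, &stats);
		if (!stats.free_inodes)
			continue;	/* nothing allocatable here */
		/* otherwise weigh stats.free_blocks and stats.used_dirs
		 * against the thresholds computed by the caller */
	}
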
413/* 455/*
414 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
415 * 457 *
@@ -425,35 +467,34 @@ out:
425 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 474 */
437 475
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 477 ext4_group_t *group, int mode)
443{ 478{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 485 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 488 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
457 498
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -462,71 +503,98 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 505
465 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 510 int ret = -1;
470 511
471 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
477 continue; 518 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
479 continue; 520 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
481 continue; 522 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
483 continue; 524 continue;
484 *group = grp; 525 grp = g;
485 ret = 0; 526 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start at the 2nd block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
487 } 553 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 554 goto fallback;
491 } 555 }
492 556
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
499 560 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
502 if (max_debt * INODE_COST > inodes_per_group) 563 /*
503 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 565 * inode for this parent directory
505 max_debt = 255; 566 */
506 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
508 572
509 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
513 continue; 577 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs) 578 if (stats.free_inodes < min_inodes)
515 continue; 579 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 580 if (stats.free_blocks < min_blocks)
517 continue; 581 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 582 goto found_flex_bg;
519 continue;
520 return 0;
521 } 583 }
522 584
523fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588fallback_retry:
589 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 590 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 591 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 593 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 594 ext4_free_inodes_count(sb, desc) >= avefreei) {
595 *group = grp;
529 return 0; 596 return 0;
597 }
530 } 598 }
531 599
532 if (avefreei) { 600 if (avefreei) {
@@ -535,19 +603,58 @@ fallback:
535 * filesystems the above test can fail to find any blockgroups 603 * filesystems the above test can fail to find any blockgroups
536 */ 604 */
537 avefreei = 0; 605 avefreei = 0;
538 goto fallback; 606 goto fallback_retry;
539 } 607 }
540 608
541 return -1; 609 return -1;
542} 610}
543 611
544static int find_group_other(struct super_block *sb, struct inode *parent, 612static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 613 ext4_group_t *group, int mode)
546{ 614{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 615 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 616 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 617 struct ext4_group_desc *desc;
550 ext4_group_t i; 618 ext4_group_t i, last;
619 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
620
621 /*
622 * Try to place the inode is the same flex group as its
623 * parent. If we can't find space, use the Orlov algorithm to
624 * find another flex group, and store that information in the
625 * parent directory's inode information so that use that flex
626 * group for future allocations.
627 */
628 if (flex_size > 1) {
629 int retry = 0;
630
631 try_again:
632 parent_group &= ~(flex_size-1);
633 last = parent_group + flex_size;
634 if (last > ngroups)
635 last = ngroups;
636 for (i = parent_group; i < last; i++) {
637 desc = ext4_get_group_desc(sb, i, NULL);
638 if (desc && ext4_free_inodes_count(sb, desc)) {
639 *group = i;
640 return 0;
641 }
642 }
643 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
644 retry = 1;
645 parent_group = EXT4_I(parent)->i_last_alloc_group;
646 goto try_again;
647 }
648 /*
649 * If this didn't work, use the Orlov search algorithm
650 * to find a new flex group; we pass in the mode to
651 * avoid the topdir algorithms.
652 */
653 *group = parent_group + flex_size;
654 if (*group > ngroups)
655 *group = 0;
656 return find_group_orlov(sb, parent, group, mode);
657 }
551 658
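
Since flex_size is a power of two, clearing the low bits of a group number gives the first block group of its flex group, and the next flex_size groups form the search window. A worked example, assuming flex_size = 16 (so s_log_groups_per_flex = 4):

	unsigned int flex_size = 16;	/* block groups per flex group */
	unsigned int group = 37;

	unsigned int first = group & ~(flex_size - 1);	/* 37 & ~15 = 32 */
	unsigned int last = first + flex_size;		/* 48, exclusive */
	unsigned int flexno = group >> 4;		/* flex group 2 */
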
552 /* 659 /*
553 * Try to place the inode in its parent directory 660 * Try to place the inode in its parent directory
@@ -665,6 +772,11 @@ static int ext4_claim_inode(struct super_block *sb,
665 if (S_ISDIR(mode)) { 772 if (S_ISDIR(mode)) {
666 count = ext4_used_dirs_count(sb, gdp) + 1; 773 count = ext4_used_dirs_count(sb, gdp) + 1;
667 ext4_used_dirs_set(sb, gdp, count); 774 ext4_used_dirs_set(sb, gdp, count);
775 if (sbi->s_log_groups_per_flex) {
776 ext4_group_t f = ext4_flex_group(sbi, group);
777
778 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
779 }
668 } 780 }
669 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 781 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
670err_ret: 782err_ret:
@@ -716,15 +828,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 828 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 829 es = sbi->s_es;
718 830
719 if (sbi->s_log_groups_per_flex) { 831 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 832 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 833 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 834 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 835 if (ret2 == 0 && once) {
724 once = 0; 836 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 837 printk(KERN_NOTICE "ext4: find_group_flex "
726 "failed, fallback succeeded dir %lu\n", 838 "failed, fallback succeeded dir %lu\n",
727 dir->i_ino); 839 dir->i_ino);
840 }
728 } 841 }
729 goto got_group; 842 goto got_group;
730 } 843 }
@@ -733,11 +846,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 846 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 847 ret2 = find_group_dir(sb, dir, &group);
735 else 848 else
736 ret2 = find_group_orlov(sb, dir, &group); 849 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 850 } else
738 ret2 = find_group_other(sb, dir, &group); 851 ret2 = find_group_other(sb, dir, &group, mode);
739 852
740got_group: 853got_group:
854 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 855 err = -ENOSPC;
742 if (ret2 == -1) 856 if (ret2 == -1)
743 goto out; 857 goto out;
@@ -858,9 +972,7 @@ got:
858 972
859 if (sbi->s_log_groups_per_flex) { 973 if (sbi->s_log_groups_per_flex) {
860 flex_group = ext4_flex_group(sbi, group); 974 flex_group = ext4_flex_group(sbi, group);
861 spin_lock(sb_bgl_lock(sbi, flex_group)); 975 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
862 sbi->s_flex_groups[flex_group].free_inodes--;
863 spin_unlock(sb_bgl_lock(sbi, flex_group));
864 } 976 }
865 977
866 inode->i_uid = current_fsuid(); 978 inode->i_uid = current_fsuid();
@@ -885,19 +997,16 @@ got:
885 ei->i_disksize = 0; 997 ei->i_disksize = 0;
886 998
887 /* 999 /*
888 * Don't inherit extent flag from directory. We set extent flag on 1000 * Don't inherit extent flag from directory, amongst others. We set
889 * newly created directory and file only if -o extent mount option is 1001 * extent flag on newly created directory and file only if -o extent
890 * specified 1002 * mount option is specified
891 */ 1003 */
892 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1004 ei->i_flags =
893 if (S_ISLNK(mode)) 1005 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
894 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
895 /* dirsync only applies to directories */
896 if (!S_ISDIR(mode))
897 ei->i_flags &= ~EXT4_DIRSYNC_FL;
898 ei->i_file_acl = 0; 1006 ei->i_file_acl = 0;
899 ei->i_dtime = 0; 1007 ei->i_dtime = 0;
900 ei->i_block_group = group; 1008 ei->i_block_group = group;
1009 ei->i_last_alloc_group = ~0;
901 1010
902 ext4_set_inode_flags(inode); 1011 ext4_set_inode_flags(inode);
903 if (IS_DIRSYNC(inode)) 1012 if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db79..2a9ffd528dd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
371 return n; 371 return n;
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) {
376
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p;
379 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
381 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) "
383 "in inode #%lu, offset=%d",
384 le32_to_cpu(*bref), maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO;
387 }
388 bref++;
389 }
390 return 0;
391}
392
393
394#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397
398#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS)
401
374/** 402/**
375 * ext4_get_branch - read the chain of indirect blocks leading to data 403 * ext4_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question 404 * @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
415 if (!p->key) 443 if (!p->key)
416 goto no_block; 444 goto no_block;
417 while (--depth) { 445 while (--depth) {
418 bh = sb_bread(sb, le32_to_cpu(p->key)); 446 bh = sb_getblk(sb, le32_to_cpu(p->key));
419 if (!bh) 447 if (unlikely(!bh))
420 goto failure; 448 goto failure;
449
450 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) {
452 put_bh(bh);
453 goto failure;
454 }
455 /* validate block references */
456 if (ext4_check_indirect_blockref(inode, bh)) {
457 put_bh(bh);
458 goto failure;
459 }
460 }
461
421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
422 /* Reader: end */ 463 /* Reader: end */
423 if (!p->key) 464 if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
459 ext4_fsblk_t bg_start; 500 ext4_fsblk_t bg_start;
460 ext4_fsblk_t last_block; 501 ext4_fsblk_t last_block;
461 ext4_grpblk_t colour; 502 ext4_grpblk_t colour;
503 ext4_group_t block_group;
504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 505
463 /* Try to find previous block */ 506 /* Try to find previous block */
464 for (p = ind->p - 1; p >= start; p--) { 507 for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
474 * It is going to be referred to from the inode itself? OK, just put it 517 * It is going to be referred to from the inode itself? OK, just put it
475 * into the same cylinder group then. 518 * into the same cylinder group then.
476 */ 519 */
477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 520 block_group = ei->i_block_group;
521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
522 block_group &= ~(flex_size-1);
523 if (S_ISREG(inode->i_mode))
524 block_group++;
525 }
526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 528
529 /*
530 * If we are doing delayed allocation, we don't need to take
531 * colour into account.
532 */
533 if (test_opt(inode->i_sb, DELALLOC))
534 return bg_start;
535
480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 colour = (current->pid % 16) * 537 colour = (current->pid % 16) *
482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1052 /* 1108 /*
1053 * free those over-booking quota for metadata blocks 1109 * free those over-booking quota for metadata blocks
1054 */ 1110 */
1055
1056 if (mdb_free) 1111 if (mdb_free)
1057 vfs_dq_release_reservation_block(inode, mdb_free); 1112 vfs_dq_release_reservation_block(inode, mdb_free);
1113
1114 /*
1115 * If we have done all the pending block allocations and if
1116 * there aren't any writers on the inode, we can discard the
1117 * inode's preallocations.
1118 */
1119 if (!total && (atomic_read(&inode->i_writecount) == 0))
1120 ext4_discard_preallocations(inode);
1058} 1121}
1059 1122
1060/* 1123/*
@@ -1086,6 +1149,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1086 int retval; 1149 int retval;
1087 1150
1088 clear_buffer_mapped(bh); 1151 clear_buffer_mapped(bh);
1152 clear_buffer_unwritten(bh);
1089 1153
1090 /* 1154 /*
1091 * Try to see if we can get the block without requesting 1155 * Try to see if we can get the block without requesting
@@ -1116,6 +1180,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1116 return retval; 1180 return retval;
1117 1181
1118 /* 1182 /*
1183 * When we call get_blocks without the create flag, the
1184 * BH_Unwritten flag could have gotten set if the blocks
1185 * requested were part of an uninitialized extent. We need to
1186 * clear this flag now that we are committed to convert all or
1187 * part of the uninitialized extent to be an initialized
1188 * extent. This is because we need to avoid the combination
1189 * of BH_Unwritten and BH_Mapped flags being simultaneously
1190 * set on the buffer_head.
1191 */
1192 clear_buffer_unwritten(bh);
1193
1194 /*
1119 * New blocks allocate and/or writing to uninitialized extent 1195 * New blocks allocate and/or writing to uninitialized extent
1120 * will possibly result in updating i_data, so we take 1196 * will possibly result in updating i_data, so we take
1121 * the write lock of i_data_sem, and call get_blocks() 1197 * the write lock of i_data_sem, and call get_blocks()
@@ -1688,9 +1764,10 @@ static void ext4_da_page_release_reservation(struct page *page,
1688 1764
1689struct mpage_da_data { 1765struct mpage_da_data {
1690 struct inode *inode; 1766 struct inode *inode;
1691 struct buffer_head lbh; /* extent of blocks */ 1767 sector_t b_blocknr; /* start block number of extent */
1768 size_t b_size; /* size of extent */
1769 unsigned long b_state; /* state of the extent */
1692 unsigned long first_page, next_page; /* extent of pages */ 1770 unsigned long first_page, next_page; /* extent of pages */
1693 get_block_t *get_block;
1694 struct writeback_control *wbc; 1771 struct writeback_control *wbc;
1695 int io_done; 1772 int io_done;
1696 int pages_written; 1773 int pages_written;
@@ -1704,7 +1781,6 @@ struct mpage_da_data {
1704 * @mpd->inode: inode 1781 * @mpd->inode: inode
1705 * @mpd->first_page: first page of the extent 1782 * @mpd->first_page: first page of the extent
1706 * @mpd->next_page: page after the last page of the extent 1783 * @mpd->next_page: page after the last page of the extent
1707 * @mpd->get_block: the filesystem's block mapper function
1708 * 1784 *
1709 * By the time mpage_da_submit_io() is called we expect all blocks 1785 * By the time mpage_da_submit_io() is called we expect all blocks
1710 * to be allocated. this may be wrong if allocation failed. 1786 * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1800,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1724 /* 1800 /*
1725 * We need to start from the first_page to the next_page - 1 1801 * We need to start from the first_page to the next_page - 1
1726 * to make sure we also write the mapped dirty buffer_heads. 1802 * to make sure we also write the mapped dirty buffer_heads.
1727 * If we look at mpd->lbh.b_blocknr we would only be looking 1803 * If we look at mpd->b_blocknr we would only be looking
1728 * at the currently mapped buffer_heads. 1804 * at the currently mapped buffer_heads.
1729 */ 1805 */
1730 index = mpd->first_page; 1806 index = mpd->first_page;
@@ -1914,68 +1990,111 @@ static void ext4_print_free_blocks(struct inode *inode)
1914 return; 1990 return;
1915} 1991}
1916 1992
1993#define EXT4_DELALLOC_RSVED 1
1994static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1995 struct buffer_head *bh_result, int create)
1996{
1997 int ret;
1998 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1999 loff_t disksize = EXT4_I(inode)->i_disksize;
2000 handle_t *handle = NULL;
2001
2002 handle = ext4_journal_current_handle();
2003 BUG_ON(!handle);
2004 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2005 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2006 if (ret <= 0)
2007 return ret;
2008
2009 bh_result->b_size = (ret << inode->i_blkbits);
2010
2011 if (ext4_should_order_data(inode)) {
2012 int retval;
2013 retval = ext4_jbd2_file_inode(handle, inode);
2014 if (retval)
2015 /*
2016 * Failed to add inode for ordered mode. Don't
2017 * update file size
2018 */
2019 return retval;
2020 }
2021
2022 /*
2023 * Update on-disk size along with block allocation; we don't
2024 * use 'extend_disksize' as size may change within already
2025 * allocated block -bzzz
2026 */
2027 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2028 if (disksize > i_size_read(inode))
2029 disksize = i_size_read(inode);
2030 if (disksize > EXT4_I(inode)->i_disksize) {
2031 ext4_update_i_disksize(inode, disksize);
2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 return ret;
2034 }
2035 return 0;
2036}
2037
1917/* 2038/*
1918 * mpage_da_map_blocks - go through given space 2039 * mpage_da_map_blocks - go through given space
1919 * 2040 *
1920 * @mpd->lbh - bh describing space 2041 * @mpd - bh describing space
1921 * @mpd->get_block - the filesystem's block mapper function
1922 * 2042 *
1923 * The function skips space we know is already mapped to disk blocks. 2043 * The function skips space we know is already mapped to disk blocks.
1924 * 2044 *
1925 */ 2045 */
1926static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2046static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1927{ 2047{
1928 int err = 0; 2048 int err = 0;
1929 struct buffer_head new; 2049 struct buffer_head new;
1930 struct buffer_head *lbh = &mpd->lbh;
1931 sector_t next; 2050 sector_t next;
1932 2051
1933 /* 2052 /*
1934 * We consider only non-mapped and non-allocated blocks 2053 * We consider only non-mapped and non-allocated blocks
1935 */ 2054 */
1936 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 2055 if ((mpd->b_state & (1 << BH_Mapped)) &&
2056 !(mpd->b_state & (1 << BH_Delay)))
1937 return 0; 2057 return 0;
1938 new.b_state = lbh->b_state; 2058 new.b_state = mpd->b_state;
1939 new.b_blocknr = 0; 2059 new.b_blocknr = 0;
1940 new.b_size = lbh->b_size; 2060 new.b_size = mpd->b_size;
1941 next = lbh->b_blocknr; 2061 next = mpd->b_blocknr;
1942 /* 2062 /*
1943 * If we didn't accumulate anything 2063 * If we didn't accumulate anything
1944 * to write simply return 2064 * to write simply return
1945 */ 2065 */
1946 if (!new.b_size) 2066 if (!new.b_size)
1947 return 0; 2067 return 0;
1948 err = mpd->get_block(mpd->inode, next, &new, 1);
1949 if (err) {
1950 2068
1951 /* If get block returns with error 2069 err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
1952 * we simply return. Later writepage 2070 if (err) {
1953 * will redirty the page and writepages 2071 /*
1954 * will find the dirty page again 2072 * If get block returns with error we simply
2073 * return. Later writepage will redirty the page and
2074 * writepages will find the dirty page again
1955 */ 2075 */
1956 if (err == -EAGAIN) 2076 if (err == -EAGAIN)
1957 return 0; 2077 return 0;
1958 2078
1959 if (err == -ENOSPC && 2079 if (err == -ENOSPC &&
1960 ext4_count_free_blocks(mpd->inode->i_sb)) { 2080 ext4_count_free_blocks(mpd->inode->i_sb)) {
1961 mpd->retval = err; 2081 mpd->retval = err;
1962 return 0; 2082 return 0;
1963 } 2083 }
1964 2084
1965 /* 2085 /*
1966 * get block failure will cause us 2086 * get block failure will cause us to loop in
1967 * to loop in writepages. Because 2087 * writepages, because a_ops->writepage won't be able
1968 * a_ops->writepage won't be able to 2088 * to make progress. The page will be redirtied by
1969 * make progress. The page will be redirtied 2089 * writepage and writepages will again try to write
1970 * by writepage and writepages will again 2090 * the same.
1971 * try to write the same.
1972 */ 2091 */
1973 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2092 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1974 "at logical offset %llu with max blocks " 2093 "at logical offset %llu with max blocks "
1975 "%zd with error %d\n", 2094 "%zd with error %d\n",
1976 __func__, mpd->inode->i_ino, 2095 __func__, mpd->inode->i_ino,
1977 (unsigned long long)next, 2096 (unsigned long long)next,
1978 lbh->b_size >> mpd->inode->i_blkbits, err); 2097 mpd->b_size >> mpd->inode->i_blkbits, err);
1979 printk(KERN_EMERG "This should not happen.!! " 2098 printk(KERN_EMERG "This should not happen.!! "
1980 "Data will be lost\n"); 2099 "Data will be lost\n");
1981 if (err == -ENOSPC) { 2100 if (err == -ENOSPC) {
@@ -1983,7 +2102,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1983 } 2102 }
1984 /* invalidate all the pages */ 2103 /* invalidate all the pages */
1985 ext4_da_block_invalidatepages(mpd, next, 2104 ext4_da_block_invalidatepages(mpd, next,
1986 lbh->b_size >> mpd->inode->i_blkbits); 2105 mpd->b_size >> mpd->inode->i_blkbits);
1987 return err; 2106 return err;
1988 } 2107 }
1989 BUG_ON(new.b_size == 0); 2108 BUG_ON(new.b_size == 0);
@@ -1995,7 +2114,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1995 * If blocks are delayed marked, we need to 2114 * If blocks are delayed marked, we need to
1996 * put actual blocknr and drop delayed bit 2115 * put actual blocknr and drop delayed bit
1997 */ 2116 */
1998 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 2117 if ((mpd->b_state & (1 << BH_Delay)) ||
2118 (mpd->b_state & (1 << BH_Unwritten)))
1999 mpage_put_bnr_to_bhs(mpd, next, &new); 2119 mpage_put_bnr_to_bhs(mpd, next, &new);
2000 2120
2001 return 0; 2121 return 0;
@@ -2014,12 +2134,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2014 * the function is used to collect contig. blocks in same state 2134 * the function is used to collect contig. blocks in same state
2015 */ 2135 */
2016static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2136static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2017 sector_t logical, struct buffer_head *bh) 2137 sector_t logical, size_t b_size,
2138 unsigned long b_state)
2018{ 2139{
2019 sector_t next; 2140 sector_t next;
2020 size_t b_size = bh->b_size; 2141 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2021 struct buffer_head *lbh = &mpd->lbh;
2022 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
2023 2142
2024 /* check if the reserved journal credits might overflow */ 2143 /* check if the reserved journal credits might overflow */
2025 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2144 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2165,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2046 /* 2165 /*
2047 * First block in the extent 2166 * First block in the extent
2048 */ 2167 */
2049 if (lbh->b_size == 0) { 2168 if (mpd->b_size == 0) {
2050 lbh->b_blocknr = logical; 2169 mpd->b_blocknr = logical;
2051 lbh->b_size = b_size; 2170 mpd->b_size = b_size;
2052 lbh->b_state = bh->b_state & BH_FLAGS; 2171 mpd->b_state = b_state & BH_FLAGS;
2053 return; 2172 return;
2054 } 2173 }
2055 2174
2056 next = lbh->b_blocknr + nrblocks; 2175 next = mpd->b_blocknr + nrblocks;
2057 /* 2176 /*
2058 * Can we merge the block to our big extent? 2177 * Can we merge the block to our big extent?
2059 */ 2178 */
2060 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2179 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2061 lbh->b_size += b_size; 2180 mpd->b_size += b_size;
2062 return; 2181 return;
2063 } 2182 }
2064 2183
@@ -2087,7 +2206,7 @@ static int __mpage_da_writepage(struct page *page,
2087{ 2206{
2088 struct mpage_da_data *mpd = data; 2207 struct mpage_da_data *mpd = data;
2089 struct inode *inode = mpd->inode; 2208 struct inode *inode = mpd->inode;
2090 struct buffer_head *bh, *head, fake; 2209 struct buffer_head *bh, *head;
2091 sector_t logical; 2210 sector_t logical;
2092 2211
2093 if (mpd->io_done) { 2212 if (mpd->io_done) {
@@ -2129,9 +2248,9 @@ static int __mpage_da_writepage(struct page *page,
2129 /* 2248 /*
2130 * ... and blocks 2249 * ... and blocks
2131 */ 2250 */
2132 mpd->lbh.b_size = 0; 2251 mpd->b_size = 0;
2133 mpd->lbh.b_state = 0; 2252 mpd->b_state = 0;
2134 mpd->lbh.b_blocknr = 0; 2253 mpd->b_blocknr = 0;
2135 } 2254 }
2136 2255
2137 mpd->next_page = page->index + 1; 2256 mpd->next_page = page->index + 1;
@@ -2139,16 +2258,8 @@ static int __mpage_da_writepage(struct page *page,
2139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2258 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2140 2259
2141 if (!page_has_buffers(page)) { 2260 if (!page_has_buffers(page)) {
2142 /* 2261 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2143 * There is no attached buffer heads yet (mmap?) 2262 (1 << BH_Dirty) | (1 << BH_Uptodate));
2144 * we treat the page asfull of dirty blocks
2145 */
2146 bh = &fake;
2147 bh->b_size = PAGE_CACHE_SIZE;
2148 bh->b_state = 0;
2149 set_buffer_dirty(bh);
2150 set_buffer_uptodate(bh);
2151 mpage_add_bh_to_extent(mpd, logical, bh);
2152 if (mpd->io_done) 2263 if (mpd->io_done)
2153 return MPAGE_DA_EXTENT_TAIL; 2264 return MPAGE_DA_EXTENT_TAIL;
2154 } else { 2265 } else {
@@ -2166,8 +2277,10 @@ static int __mpage_da_writepage(struct page *page,
2166 * with the page in ext4_da_writepage 2277 * with the page in ext4_da_writepage
2167 */ 2278 */
2168 if (buffer_dirty(bh) && 2279 if (buffer_dirty(bh) &&
2169 (!buffer_mapped(bh) || buffer_delay(bh))) { 2280 (!buffer_mapped(bh) || buffer_delay(bh))) {
2170 mpage_add_bh_to_extent(mpd, logical, bh); 2281 mpage_add_bh_to_extent(mpd, logical,
2282 bh->b_size,
2283 bh->b_state);
2171 if (mpd->io_done) 2284 if (mpd->io_done)
2172 return MPAGE_DA_EXTENT_TAIL; 2285 return MPAGE_DA_EXTENT_TAIL;
2173 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2286 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2292,8 @@ static int __mpage_da_writepage(struct page *page,
2179 * unmapped buffer_head later we need to 2292 * unmapped buffer_head later we need to
2180 * use the b_state flag of that buffer_head. 2293 * use the b_state flag of that buffer_head.
2181 */ 2294 */
2182 if (mpd->lbh.b_size == 0) 2295 if (mpd->b_size == 0)
2183 mpd->lbh.b_state = 2296 mpd->b_state = bh->b_state & BH_FLAGS;
2184 bh->b_state & BH_FLAGS;
2185 } 2297 }
2186 logical++; 2298 logical++;
2187 } while ((bh = bh->b_this_page) != head); 2299 } while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2303,6 @@ static int __mpage_da_writepage(struct page *page,
2191} 2303}
2192 2304
2193/* 2305/*
2194 * mpage_da_writepages - walk the list of dirty pages of the given
2195 * address space, allocates non-allocated blocks, maps newly-allocated
2196 * blocks to existing bhs and issue IO them
2197 *
2198 * @mapping: address space structure to write
2199 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2200 * @get_block: the filesystem's block mapper function.
2201 *
2202 * This is a library function, which implements the writepages()
2203 * address_space_operation.
2204 */
2205static int mpage_da_writepages(struct address_space *mapping,
2206 struct writeback_control *wbc,
2207 struct mpage_da_data *mpd)
2208{
2209 int ret;
2210
2211 if (!mpd->get_block)
2212 return generic_writepages(mapping, wbc);
2213
2214 mpd->lbh.b_size = 0;
2215 mpd->lbh.b_state = 0;
2216 mpd->lbh.b_blocknr = 0;
2217 mpd->first_page = 0;
2218 mpd->next_page = 0;
2219 mpd->io_done = 0;
2220 mpd->pages_written = 0;
2221 mpd->retval = 0;
2222
2223 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2224 /*
2225 * Handle last extent of pages
2226 */
2227 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2228 if (mpage_da_map_blocks(mpd) == 0)
2229 mpage_da_submit_io(mpd);
2230
2231 mpd->io_done = 1;
2232 ret = MPAGE_DA_EXTENT_TAIL;
2233 }
2234 wbc->nr_to_write -= mpd->pages_written;
2235 return ret;
2236}
2237
2238/*
2239 * this is a special callback for ->write_begin() only 2306 * this is a special callback for ->write_begin() only
2240 * its intention is to return mapped block or reserve space 2307 * its intention is to return mapped block or reserve space
2241 */ 2308 */
@@ -2243,6 +2310,10 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2243 struct buffer_head *bh_result, int create) 2310 struct buffer_head *bh_result, int create)
2244{ 2311{
2245 int ret = 0; 2312 int ret = 0;
2313 sector_t invalid_block = ~((sector_t) 0xffff);
2314
2315 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2316 invalid_block = ~0;
2246 2317
2247 BUG_ON(create == 0); 2318 BUG_ON(create == 0);
2248 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2319 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
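
The sentinel gives a delayed-allocation buffer a recognizable, deliberately bogus block number instead of block 0, so stray I/O against a still-unallocated buffer fails loudly rather than landing on the superblock area. My reading of the fallback, as a sketch (blocks_count stands for ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)):

	sector_t invalid_block = ~((sector_t) 0xffff);	/* huge, distinctive */

	if (invalid_block < blocks_count)	/* fs so big it's a real block */
		invalid_block = ~0;		/* fall back to all-ones */

	map_bh(bh_result, sb, invalid_block);	/* placeholder until delalloc */
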
@@ -2264,59 +2335,21 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2264 /* not enough space to reserve */ 2335 /* not enough space to reserve */
2265 return ret; 2336 return ret;
2266 2337
2267 map_bh(bh_result, inode->i_sb, 0); 2338 map_bh(bh_result, inode->i_sb, invalid_block);
2268 set_buffer_new(bh_result); 2339 set_buffer_new(bh_result);
2269 set_buffer_delay(bh_result); 2340 set_buffer_delay(bh_result);
2270 } else if (ret > 0) { 2341 } else if (ret > 0) {
2271 bh_result->b_size = (ret << inode->i_blkbits); 2342 bh_result->b_size = (ret << inode->i_blkbits);
2272 ret = 0;
2273 }
2274
2275 return ret;
2276}
2277#define EXT4_DELALLOC_RSVED 1
2278static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2279 struct buffer_head *bh_result, int create)
2280{
2281 int ret;
2282 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2283 loff_t disksize = EXT4_I(inode)->i_disksize;
2284 handle_t *handle = NULL;
2285
2286 handle = ext4_journal_current_handle();
2287 BUG_ON(!handle);
2288 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2289 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2290 if (ret > 0) {
2291
2292 bh_result->b_size = (ret << inode->i_blkbits);
2293
2294 if (ext4_should_order_data(inode)) {
2295 int retval;
2296 retval = ext4_jbd2_file_inode(handle, inode);
2297 if (retval)
2298 /*
2299 * Failed to add inode for ordered
2300 * mode. Don't update file size
2301 */
2302 return retval;
2303 }
2304
2305 /* 2343 /*
2306 * Update on-disk size along with block allocation 2344 * With sub-block writes into unwritten extents
2307 * we don't use 'extend_disksize' as size may change 2345 * we also need to mark the buffer as new so that
2308 * within already allocated block -bzzz 2346 * the unwritten parts of the buffer gets correctly zeroed.
2309 */ 2347 */
2310 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2348 if (buffer_unwritten(bh_result))
2311 if (disksize > i_size_read(inode)) 2349 set_buffer_new(bh_result);
2312 disksize = i_size_read(inode);
2313 if (disksize > EXT4_I(inode)->i_disksize) {
2314 ext4_update_i_disksize(inode, disksize);
2315 ret = ext4_mark_inode_dirty(handle, inode);
2316 return ret;
2317 }
2318 ret = 0; 2350 ret = 0;
2319 } 2351 }
2352
2320 return ret; 2353 return ret;
2321} 2354}
2322 2355
@@ -2569,8 +2602,38 @@ retry:
2569 dump_stack(); 2602 dump_stack();
2570 goto out_writepages; 2603 goto out_writepages;
2571 } 2604 }
2572 mpd.get_block = ext4_da_get_block_write; 2605
2573 ret = mpage_da_writepages(mapping, wbc, &mpd); 2606 /*
2607 * Now call __mpage_da_writepage to find the next
2608 * contiguous region of logical blocks that need
2609 * blocks to be allocated by ext4. We don't actually
2610 * submit the blocks for I/O here, even though
2611 * write_cache_pages thinks it will, and will set the
2612 * pages as clean for write before calling
2613 * __mpage_da_writepage().
2614 */
2615 mpd.b_size = 0;
2616 mpd.b_state = 0;
2617 mpd.b_blocknr = 0;
2618 mpd.first_page = 0;
2619 mpd.next_page = 0;
2620 mpd.io_done = 0;
2621 mpd.pages_written = 0;
2622 mpd.retval = 0;
2623 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2624 &mpd);
2625 /*
2626 * If we have a contiguous extent of pages and we
2627 * haven't done the I/O yet, map the blocks and submit
2628 * them for I/O.
2629 */
2630 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2631 if (mpage_da_map_blocks(&mpd) == 0)
2632 mpage_da_submit_io(&mpd);
2633 mpd.io_done = 1;
2634 ret = MPAGE_DA_EXTENT_TAIL;
2635 }
2636 wbc->nr_to_write -= mpd.pages_written;
2574 2637
2575 ext4_journal_stop(handle); 2638 ext4_journal_stop(handle);
2576 2639
@@ -2846,6 +2909,48 @@ out:
2846 return; 2909 return;
2847} 2910}
2848 2911
2912/*
2913 * Force all delayed allocation blocks to be allocated for a given inode.
2914 */
2915int ext4_alloc_da_blocks(struct inode *inode)
2916{
2917 if (!EXT4_I(inode)->i_reserved_data_blocks &&
2918 !EXT4_I(inode)->i_reserved_meta_blocks)
2919 return 0;
2920
2921 /*
2922 * We do something simple for now. The filemap_flush() will
2923 * also start triggering a write of the data blocks, which is
2924 * not strictly speaking necessary (and for users of
2925 * laptop_mode, not even desirable). However, to do otherwise
2926 * would require replicating code paths in:
2927 *
2928 * ext4_da_writepages() ->
2929 * write_cache_pages() ---> (via passed in callback function)
2930 * __mpage_da_writepage() -->
2931 * mpage_add_bh_to_extent()
2932 * mpage_da_map_blocks()
2933 *
2934 * The problem is that write_cache_pages(), located in
2935 * mm/page-writeback.c, marks pages clean in preparation for
2936 * doing I/O, which is not desirable if we're not planning on
2937 * doing I/O at all.
2938 *
2939 * We could call write_cache_pages(), and then redirty all of
2940 * the pages by calling redirty_page_for_writepage() but that
2941 * would be ugly in the extreme. So instead we would need to
2942 * replicate parts of the code in the above functions,
2943 * simplifying them because we wouldn't actually intend to
2944 * write out the pages, but rather only collect contiguous
2945 * logical block extents, call the multi-block allocator, and
2946 * then update the buffer heads with the block allocations.
2947 *
2948 * For now, though, we'll cheat by calling filemap_flush(),
2949 * which will map the blocks, and start the I/O, but not
2950 * actually wait for the I/O to complete.
2951 */
2952 return filemap_flush(inode->i_mapping);
2953}
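
For contrast (a gloss, not part of the patch): filemap_flush() starts asynchronous writeback of the mapping and returns, which is enough to force the pending block allocations, whereas filemap_write_and_wait() would also block until the I/O completes:

	filemap_flush(mapping);			/* WB_SYNC_NONE: start, return */
	filemap_write_and_wait(mapping);	/* start and wait for the I/O */
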
2849 2954
2850/* 2955/*
2851 * bmap() is special. It gets used by applications such as lilo and by 2956 * bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3973,9 @@ void ext4_truncate(struct inode *inode)
3868 if (!ext4_can_truncate(inode)) 3973 if (!ext4_can_truncate(inode))
3869 return; 3974 return;
3870 3975
3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3978
3871 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3872 ext4_ext_truncate(inode); 3980 ext4_ext_truncate(inode);
3873 return; 3981 return;
@@ -4110,12 +4218,7 @@ make_io:
4110 unsigned num; 4218 unsigned num;
4111 4219
4112 table = ext4_inode_table(sb, gdp); 4220 table = ext4_inode_table(sb, gdp);
4113 /* Make sure s_inode_readahead_blks is a power of 2 */ 4221 /* s_inode_readahead_blks is always a power of 2 */
4114 while (EXT4_SB(sb)->s_inode_readahead_blks &
4115 (EXT4_SB(sb)->s_inode_readahead_blks-1))
4116 EXT4_SB(sb)->s_inode_readahead_blks =
4117 (EXT4_SB(sb)->s_inode_readahead_blks &
4118 (EXT4_SB(sb)->s_inode_readahead_blks-1));
4119 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4222 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4120 if (table > b) 4223 if (table > b)
4121 b = table; 4224 b = table;
@@ -4278,15 +4381,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4278 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4381 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4279 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4382 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4280 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4383 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4281 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4384 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4282 cpu_to_le32(EXT4_OS_HURD)) {
4283 ei->i_file_acl |= 4385 ei->i_file_acl |=
4284 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4386 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4285 }
4286 inode->i_size = ext4_isize(raw_inode); 4387 inode->i_size = ext4_isize(raw_inode);
4287 ei->i_disksize = inode->i_size; 4388 ei->i_disksize = inode->i_size;
4288 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4389 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4289 ei->i_block_group = iloc.block_group; 4390 ei->i_block_group = iloc.block_group;
4391 ei->i_last_alloc_group = ~0;
4290 /* 4392 /*
4291 * NOTE! The in-memory inode i_data array is in little-endian order 4393 * NOTE! The in-memory inode i_data array is in little-endian order
4292 * even on big-endian machines: we do NOT byteswap the block numbers! 4394 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4431,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4329 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4431 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4330 } 4432 }
4331 4433
4434 ret = 0;
4435 if (ei->i_file_acl &&
4436 ((ei->i_file_acl <
4437 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4438 EXT4_SB(sb)->s_gdb_count)) ||
4439 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4440 ext4_error(sb, __func__,
4441 "bad extended attribute block %llu in inode #%lu",
4442 ei->i_file_acl, inode->i_ino);
4443 ret = -EIO;
4444 goto bad_inode;
4445 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
4446 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4447 (S_ISLNK(inode->i_mode) &&
4448 !ext4_inode_is_fast_symlink(inode)))
4449 /* Validate extent which is part of inode */
4450 ret = ext4_ext_check_inode(inode);
4451 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4452 (S_ISLNK(inode->i_mode) &&
4453 !ext4_inode_is_fast_symlink(inode))) {
4454 /* Validate block references which are part of inode */
4455 ret = ext4_check_inode_blockref(inode);
4456 }
4457 if (ret) {
4458 brelse(bh);
4459 goto bad_inode;
4460 }
4461
4332 if (S_ISREG(inode->i_mode)) { 4462 if (S_ISREG(inode->i_mode)) {
4333 inode->i_op = &ext4_file_inode_operations; 4463 inode->i_op = &ext4_file_inode_operations;
4334 inode->i_fop = &ext4_file_operations; 4464 inode->i_fop = &ext4_file_operations;
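
The new xattr-block check is a plain range test; restated as a standalone predicate (hypothetical helper name, same bounds as the hunk):

    static int ext4_meta_block_in_range(struct super_block *sb,
                                        ext4_fsblk_t blk)
    {
            struct ext4_sb_info *sbi = EXT4_SB(sb);
            ext4_fsblk_t first = le32_to_cpu(sbi->s_es->s_first_data_block) +
                                 sbi->s_gdb_count;

            /* below the group descriptors or past the end: corrupt */
            return blk >= first && blk < ext4_blocks_count(sbi->s_es);
    }
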
@@ -4345,7 +4475,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4345 inode->i_op = &ext4_symlink_inode_operations; 4475 inode->i_op = &ext4_symlink_inode_operations;
4346 ext4_set_aops(inode); 4476 ext4_set_aops(inode);
4347 } 4477 }
4348 } else { 4478 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4479 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4349 inode->i_op = &ext4_special_inode_operations; 4480 inode->i_op = &ext4_special_inode_operations;
4350 if (raw_inode->i_block[0]) 4481 if (raw_inode->i_block[0])
4351 init_special_inode(inode, inode->i_mode, 4482 init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4484,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4353 else 4484 else
4354 init_special_inode(inode, inode->i_mode, 4485 init_special_inode(inode, inode->i_mode,
4355 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4486 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4487 } else {
4488 brelse(bh);
4489 ret = -EIO;
4490 ext4_error(inode->i_sb, __func__,
4491 "bogus i_mode (%o) for inode=%lu",
4492 inode->i_mode, inode->i_ino);
4493 goto bad_inode;
4356 } 4494 }
4357 brelse(iloc.bh); 4495 brelse(iloc.bh);
4358 ext4_set_inode_flags(inode); 4496 ext4_set_inode_flags(inode);
@@ -5146,8 +5284,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5146 return !buffer_mapped(bh); 5284 return !buffer_mapped(bh);
5147} 5285}
5148 5286
5149int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5287int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5150{ 5288{
5289 struct page *page = vmf->page;
5151 loff_t size; 5290 loff_t size;
5152 unsigned long len; 5291 unsigned long len;
5153 int ret = -EINVAL; 5292 int ret = -EINVAL;
@@ -5199,6 +5338,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
5199 goto out_unlock; 5338 goto out_unlock;
5200 ret = 0; 5339 ret = 0;
5201out_unlock: 5340out_unlock:
5341 if (ret)
5342 ret = VM_FAULT_SIGBUS;
5202 up_read(&inode->i_alloc_sem); 5343 up_read(&inode->i_alloc_sem);
5203 return ret; 5344 return ret;
5204} 5345}
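
The prototype change tracks the VM's switch to handing ->page_mkwrite() a struct vm_fault, with VM_FAULT_* codes replacing raw errnos on the way out; a skeletal handler under the new convention (sketch only, prepare_page_for_write() is a placeholder for the real checks):

    static int sketch_page_mkwrite(struct vm_area_struct *vma,
                                   struct vm_fault *vmf)
    {
            struct page *page = vmf->page;

            if (prepare_page_for_write(page))  /* placeholder checks */
                    return VM_FAULT_SIGBUS;    /* not -EIO/-EINVAL   */
            return 0;                          /* writable in place  */
    }
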
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 if (!S_ISDIR(inode->i_mode)) 51 flags = ext4_mask_flags(inode->i_mode, flags);
52 flags &= ~EXT4_DIRSYNC_FL;
53 52
54 err = -EPERM; 53 err = -EPERM;
55 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
263 return err; 262 return err;
264 } 263 }
265 264
265 case EXT4_IOC_ALLOC_DA_BLKS:
266 {
267 int err;
268 if (!is_owner_or_cap(inode))
269 return -EACCES;
270
271 err = mnt_want_write(filp->f_path.mnt);
272 if (err)
273 return err;
274 err = ext4_alloc_da_blocks(inode);
275 mnt_drop_write(filp->f_path.mnt);
276 return err;
277 }
278
266 default: 279 default:
267 return -ENOTTY; 280 return -ENOTTY;
268 } 281 }
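
From user space the new ioctl takes no argument; a hypothetical caller, assuming EXT4_IOC_ALLOC_DA_BLKS is visible through the ext4 ioctl definitions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    static void flush_delayed_blocks(const char *path)
    {
            int fd = open(path, O_RDONLY);

            if (fd >= 0 && ioctl(fd, EXT4_IOC_ALLOC_DA_BLKS) != 0)
                    perror("EXT4_IOC_ALLOC_DA_BLKS");
    }
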
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd039..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involves a request for multiple blocks 46 * The allocation request involves a request for multiple blocks
47 * near to the goal (block) value specified. 47 * near to the goal (block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 allocation or the current file size which ever is larger. If the size is 52 would have after allocation, or the current file size, whichever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small files use group preallocation is to 59 * The main motivation for having small files use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator(using the buddy cache) supports few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contigous block in 142 * stripe size (sbi->s_stripe), we try to search for contigous block in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicate how long the mballoc __must__ look for a best 146 * min_to_scan indicate how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
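
Condensed, the policy this comment block describes, as a self-contained sketch (sizes in blocks; only s_mb_stream_request is a real field):

    static int use_group_preallocation(unsigned long long cur_size,
                                       unsigned long long alloc_size,
                                       unsigned long stream_request)
    {
            unsigned long long size = cur_size > alloc_size ? cur_size
                                                            : alloc_size;

            /* nonzero: small file, pack it with its locality group */
            return size < stream_request;  /* default threshold: 16 */
    }
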
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726{ 1725{
1727 unsigned free, fragments; 1726 unsigned free, fragments;
1728 unsigned i, bits; 1727 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc; 1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1731
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0; 1748 return 0;
1749 1749
1750 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1752 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1753 ((group % flex_size) == 0))
1754 return 0;
1755
1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1756 bits = ac->ac_sb->s_blocksize_bits + 1;
1751 for (i = ac->ac_2order; i <= bits; i++) 1757 for (i = ac->ac_2order; i <= bits; i++)
1752 if (grp->bb_counters[i] > 0) 1758 if (grp->bb_counters[i] > 0)
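
The added test keeps data allocations out of the first block group of each flexgroup, where the flexgroup's bitmaps and inode tables are concentrated; as a standalone predicate (hypothetical name, real macro):

    static int flex_bg_reserved_for_metadata(ext4_group_t group, int flex_size)
    {
            return flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME &&
                   (group % flex_size) == 0;
    }
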
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1971 /* 1977 /*
1972 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1973 * is greater than equal to the sbi_s_mb_order2_reqs 1979 * is greater than equal to the sbi_s_mb_order2_reqs
1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1975 */ 1981 */
1976 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1977 /* 1983 /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2699 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2700 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2695 if (sbi->s_mb_maxs == NULL) { 2701 if (sbi->s_mb_maxs == NULL) {
2696 kfree(sbi->s_mb_maxs); 2702 kfree(sbi->s_mb_offsets);
2697 return -ENOMEM; 2703 return -ENOMEM;
2698 } 2704 }
2699 2705
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2746 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2747 } 2753 }
2748 2754
2749 ext4_mb_init_per_dev_proc(sb);
2750 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2751 2756
2752 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2829 2834
2830 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2831 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2832 ext4_mb_destroy_per_dev_proc(sb);
2833 2837
2834 return 0; 2838 return 0;
2835} 2839}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2891} 2895}
2892 2896
2893#define EXT4_MB_STATS_NAME "stats"
2894#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2895#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2896#define EXT4_MB_ORDER2_REQ "order2_req"
2897#define EXT4_MB_STREAM_REQ "stream_req"
2898#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2899
2900static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2901{
2902#ifdef CONFIG_PROC_FS
2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2904 struct ext4_sb_info *sbi = EXT4_SB(sb);
2905 struct proc_dir_entry *proc;
2906
2907 if (sbi->s_proc == NULL)
2908 return -EINVAL;
2909
2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2916 return 0;
2917
2918err_out:
2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2925 return -ENOMEM;
2926#else
2927 return 0;
2928#endif
2929}
2930
2931static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2932{
2933#ifdef CONFIG_PROC_FS
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935
2936 if (sbi->s_proc == NULL)
2937 return -EINVAL;
2938
2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2945#endif
2946 return 0;
2947}
2948
2949int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2950{ 2898{
2951 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3096 if (sbi->s_log_groups_per_flex) { 3044 if (sbi->s_log_groups_per_flex) {
3097 ext4_group_t flex_group = ext4_flex_group(sbi, 3045 ext4_group_t flex_group = ext4_flex_group(sbi,
3098 ac->ac_b_ex.fe_group); 3046 ac->ac_b_ex.fe_group);
3099 spin_lock(sb_bgl_lock(sbi, flex_group)); 3047 atomic_sub(ac->ac_b_ex.fe_len,
3100 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 3048 &sbi->s_flex_groups[flex_group].free_blocks);
3101 spin_unlock(sb_bgl_lock(sbi, flex_group));
3102 } 3049 }
3103 3050
3104 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3051 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
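
Here, in the free path below, and in the resize.c hunk, the per-flexgroup counters move from spinlock-protected integers to atomic_t; in isolation the pattern is:

    /* allocation:  atomic_sub(ac->ac_b_ex.fe_len, &fg->free_blocks);
     * free/resize: atomic_add(count,              &fg->free_blocks);
     * no sb_bgl_lock needed: the counter is a plain tally and is never
     * part of a larger read-modify-write invariant
     */
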
@@ -3116,7 +3063,7 @@ out_err:
3116 * here we normalize request for locality group 3063 * here we normalize request for locality group
3117 * Group requests are normalized to s_stripe size if we set the same via mount 3064 * Group requests are normalized to s_stripe size if we set the same via mount
3118 * option. If not we set it to s_mb_group_prealloc which can be configured via 3065 * option. If not we set it to s_mb_group_prealloc which can be configured via
3119 * /proc/fs/ext4/<partition>/group_prealloc 3066 * /sys/fs/ext4/<partition>/mb_group_prealloc
3120 * 3067 *
3121 * XXX: should we try to preallocate more than the group has now? 3068 * XXX: should we try to preallocate more than the group has now?
3122 */ 3069 */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3608 spin_unlock(&pa->pa_lock); 3555 spin_unlock(&pa->pa_lock);
3609 3556
3610 grp_blk = pa->pa_pstart; 3557 grp_blk = pa->pa_pstart;
3611 /* If linear, pa_pstart may be in the next group when pa is used up */ 3558 /*
3612 if (pa->pa_linear) 3559 * If doing group-based preallocation, pa_pstart may be in the
3560 * next group when pa is used up
3561 */
3562 if (pa->pa_type == MB_GROUP_PA)
3613 grp_blk--; 3563 grp_blk--;
3614 3564
3615 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3565 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3704 INIT_LIST_HEAD(&pa->pa_inode_list); 3654 INIT_LIST_HEAD(&pa->pa_inode_list);
3705 INIT_LIST_HEAD(&pa->pa_group_list); 3655 INIT_LIST_HEAD(&pa->pa_group_list);
3706 pa->pa_deleted = 0; 3656 pa->pa_deleted = 0;
3707 pa->pa_linear = 0; 3657 pa->pa_type = MB_INODE_PA;
3708 3658
3709 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3710 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3767 INIT_LIST_HEAD(&pa->pa_inode_list); 3717 INIT_LIST_HEAD(&pa->pa_inode_list);
3768 INIT_LIST_HEAD(&pa->pa_group_list); 3718 INIT_LIST_HEAD(&pa->pa_group_list);
3769 pa->pa_deleted = 0; 3719 pa->pa_deleted = 0;
3770 pa->pa_linear = 1; 3720 pa->pa_type = MB_GROUP_PA;
3771 3721
3772 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3773 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
4021 list_del_rcu(&pa->pa_inode_list); 3971 list_del_rcu(&pa->pa_inode_list);
4022 spin_unlock(pa->pa_obj_lock); 3972 spin_unlock(pa->pa_obj_lock);
4023 3973
4024 if (pa->pa_linear) 3974 if (pa->pa_type == MB_GROUP_PA)
4025 ext4_mb_release_group_pa(&e4b, pa, ac); 3975 ext4_mb_release_group_pa(&e4b, pa, ac);
4026 else 3976 else
4027 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3977 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
4121 spin_unlock(&ei->i_prealloc_lock); 4071 spin_unlock(&ei->i_prealloc_lock);
4122 4072
4123 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4073 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4124 BUG_ON(pa->pa_linear != 0); 4074 BUG_ON(pa->pa_type != MB_INODE_PA);
4125 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4075 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4126 4076
4127 err = ext4_mb_load_buddy(sb, group, &e4b); 4077 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4232 * file is determined by the current size or the resulting size after 4182 * file is determined by the current size or the resulting size after
4233 * allocation whichever is larger 4183 * allocation whichever is larger
4234 * 4184 *
4235 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4185 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4236 */ 4186 */
4237static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4187static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4238{ 4188{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4373 continue; 4323 continue;
4374 } 4324 }
4375 /* only lg prealloc space */ 4325 /* only lg prealloc space */
4376 BUG_ON(!pa->pa_linear); 4326 BUG_ON(pa->pa_type != MB_GROUP_PA);
4377 4327
4378 /* seems this one can be freed ... */ 4328 /* seems this one can be freed ... */
4379 pa->pa_deleted = 1; 4329 pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4442 pa_inode_list) { 4392 pa_inode_list) {
4443 spin_lock(&tmp_pa->pa_lock); 4393 spin_lock(&tmp_pa->pa_lock);
4444 if (tmp_pa->pa_deleted) { 4394 if (tmp_pa->pa_deleted) {
4445 spin_unlock(&pa->pa_lock); 4395 spin_unlock(&tmp_pa->pa_lock);
4446 continue; 4396 continue;
4447 } 4397 }
4448 if (!added && pa->pa_free < tmp_pa->pa_free) { 4398 if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4479{ 4429{
4480 struct ext4_prealloc_space *pa = ac->ac_pa; 4430 struct ext4_prealloc_space *pa = ac->ac_pa;
4481 if (pa) { 4431 if (pa) {
4482 if (pa->pa_linear) { 4432 if (pa->pa_type == MB_GROUP_PA) {
4483 /* see comment in ext4_mb_use_group_pa() */ 4433 /* see comment in ext4_mb_use_group_pa() */
4484 spin_lock(&pa->pa_lock); 4434 spin_lock(&pa->pa_lock);
4485 pa->pa_pstart += ac->ac_b_ex.fe_len; 4435 pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4499 * doesn't grow big. We need to release 4449 * doesn't grow big. We need to release
4500 * alloc_semp before calling ext4_mb_add_n_trim() 4450 * alloc_semp before calling ext4_mb_add_n_trim()
4501 */ 4451 */
4502 if (pa->pa_linear && likely(pa->pa_free)) { 4452 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4503 spin_lock(pa->pa_obj_lock); 4453 spin_lock(pa->pa_obj_lock);
4504 list_del_rcu(&pa->pa_inode_list); 4454 list_del_rcu(&pa->pa_inode_list);
4505 spin_unlock(pa->pa_obj_lock); 4455 spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
4936 4886
4937 if (sbi->s_log_groups_per_flex) { 4887 if (sbi->s_log_groups_per_flex) {
4938 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4888 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4939 spin_lock(sb_bgl_lock(sbi, flex_group)); 4889 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4940 sbi->s_flex_groups[flex_group].free_blocks += count;
4941 spin_unlock(sb_bgl_lock(sbi, flex_group));
4942 } 4890 }
4943 4891
4944 ext4_mb_release_desc(&e4b); 4892 ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
132 ext4_lblk_t pa_lstart; /* log. block */ 132 ext4_lblk_t pa_lstart; /* log. block */
133 unsigned short pa_len; /* len of preallocated chunk */ 133 unsigned short pa_len; /* len of preallocated chunk */
134 unsigned short pa_free; /* how many blocks are free */ 134 unsigned short pa_free; /* how many blocks are free */
135 unsigned short pa_linear; /* consumed in one direction 135 unsigned short pa_type; /* pa type. inode or group */
136 * strictly, for grp prealloc */
137 spinlock_t *pa_obj_lock; 136 spinlock_t *pa_obj_lock;
138 struct inode *pa_inode; /* hack, for history only */ 137 struct inode *pa_inode; /* hack, for history only */
139}; 138};
140 139
140enum {
141 MB_INODE_PA = 0,
142 MB_GROUP_PA = 1
143};
141 144
142struct ext4_free_extent { 145struct ext4_free_extent {
143 ext4_lblk_t fe_logical; 146 ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
247 250
248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 251#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
249 252
250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 253static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
252 struct ext4_free_extent *fex) 254 struct ext4_free_extent *fex)
253{ 255{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3ee..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release(struct dx_frame *frames); 163static void dx_release(struct dx_frame *frames);
164static int dx_make_map(struct ext4_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count, unsigned blocksize);
169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); 169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block(struct dx_frame *frame, 170static void dx_insert_block(struct dx_frame *frame,
171 u32 hash, ext4_lblk_t block); 171 u32 hash, ext4_lblk_t block);
172static int ext4_htree_next_block(struct inode *dir, __u32 hash, 172static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 181 struct inode *inode);
182 182
183unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
184{
185 unsigned len = le16_to_cpu(dlen);
186
187 if (len == EXT4_MAX_REC_LEN || len == 0)
188 return blocksize;
189 return (len & 65532) | ((len & 3) << 16);
190}
191
192__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
193{
194 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
195 BUG();
196 if (len < 65536)
197 return cpu_to_le16(len);
198 if (len == blocksize) {
199 if (blocksize == 65536)
200 return cpu_to_le16(EXT4_MAX_REC_LEN);
201 else
202 return cpu_to_le16(0);
203 }
204 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
205}
206
183/* 207/*
184 * p is at least 6 bytes before the end of page 208 * p is at least 6 bytes before the end of page
185 */ 209 */
186static inline struct ext4_dir_entry_2 * 210static inline struct ext4_dir_entry_2 *
187ext4_next_entry(struct ext4_dir_entry_2 *p) 211ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
188{ 212{
189 return (struct ext4_dir_entry_2 *)((char *)p + 213 return (struct ext4_dir_entry_2 *)((char *)p +
190 ext4_rec_len_from_disk(p->rec_len)); 214 ext4_rec_len_from_disk(p->rec_len, blocksize));
191} 215}
192 216
193/* 217/*
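
The two helpers above squeeze rec_len values for blocks up to 256KB into 16 bits: rec_len is always 4-byte aligned, so the two low bits are free to carry bits 16-17 of the length, and 0/EXT4_MAX_REC_LEN mean "whole block". A user-space round-trip check of the same encoding (sketch; assumes EXT4_MAX_REC_LEN is 65535):

    #include <assert.h>

    static unsigned short to_disk(unsigned len, unsigned bs)
    {
            assert(len <= bs && bs <= (1 << 18) && !(len & 3));
            if (len < 65536)
                    return len;
            if (len == bs)
                    return bs == 65536 ? 65535 : 0;
            return (len & 65532) | ((len >> 16) & 3);
    }

    static unsigned from_disk(unsigned short dlen, unsigned bs)
    {
            unsigned len = dlen;

            if (len == 65535 || len == 0)
                    return bs;
            return (len & 65532) | ((len & 3) << 16);
    }

    int main(void)
    {
            unsigned bs, len;

            for (bs = 1024; bs <= 65536; bs *= 2)
                    for (len = 4; len <= bs; len += 4)
                            assert(from_disk(to_disk(len, bs), bs) == len);
            /* a length that needs bit 16, on a 128KB block */
            assert(from_disk(to_disk(70000, 131072), 131072) == 70000);
            return 0;
    }
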
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
294 space += EXT4_DIR_REC_LEN(de->name_len); 318 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 319 names++;
296 } 320 }
297 de = ext4_next_entry(de); 321 de = ext4_next_entry(de, size);
298 } 322 }
299 printk("(%i)\n", names); 323 printk("(%i)\n", names);
300 return (struct stats) { names, space, 1 }; 324 return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
585 top = (struct ext4_dir_entry_2 *) ((char *) de + 609 top = (struct ext4_dir_entry_2 *) ((char *) de +
586 dir->i_sb->s_blocksize - 610 dir->i_sb->s_blocksize -
587 EXT4_DIR_REC_LEN(0)); 611 EXT4_DIR_REC_LEN(0));
588 for (; de < top; de = ext4_next_entry(de)) { 612 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
589 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 613 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
590 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 614 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
591 +((char *)de - bh->b_data))) { 615 +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
663 } 687 }
664 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 688 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
665 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 689 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
666 de = ext4_next_entry(de); 690 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
667 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 691 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
668 goto errout; 692 goto errout;
669 count++; 693 count++;
@@ -713,15 +737,15 @@ errout:
713 * Create map of hash values, offsets, and sizes, stored at end of block. 737 * Create map of hash values, offsets, and sizes, stored at end of block.
714 * Returns number of entries mapped. 738 * Returns number of entries mapped.
715 */ 739 */
716static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 740static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
717 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 741 struct dx_hash_info *hinfo,
742 struct dx_map_entry *map_tail)
718{ 743{
719 int count = 0; 744 int count = 0;
720 char *base = (char *) de; 745 char *base = (char *) de;
721 struct dx_hash_info h = *hinfo; 746 struct dx_hash_info h = *hinfo;
722 747
723 while ((char *) de < base + size) 748 while ((char *) de < base + blocksize) {
724 {
725 if (de->name_len && de->inode) { 749 if (de->name_len && de->inode) {
726 ext4fs_dirhash(de->name, de->name_len, &h); 750 ext4fs_dirhash(de->name, de->name_len, &h);
727 map_tail--; 751 map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
732 cond_resched(); 756 cond_resched();
733 } 757 }
734 /* XXX: do we need to check rec_len == 0 case? -Chris */ 758 /* XXX: do we need to check rec_len == 0 case? -Chris */
735 de = ext4_next_entry(de); 759 de = ext4_next_entry(de, blocksize);
736 } 760 }
737 return count; 761 return count;
738} 762}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
832 return 1; 856 return 1;
833 } 857 }
834 /* prevent looping on a bad block */ 858 /* prevent looping on a bad block */
835 de_len = ext4_rec_len_from_disk(de->rec_len); 859 de_len = ext4_rec_len_from_disk(de->rec_len,
860 dir->i_sb->s_blocksize);
836 if (de_len <= 0) 861 if (de_len <= 0)
837 return -1; 862 return -1;
838 offset += de_len; 863 offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
996 de = (struct ext4_dir_entry_2 *) bh->b_data; 1021 de = (struct ext4_dir_entry_2 *) bh->b_data;
997 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 1022 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
998 EXT4_DIR_REC_LEN(0)); 1023 EXT4_DIR_REC_LEN(0));
999 for (; de < top; de = ext4_next_entry(de)) { 1024 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1000 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 1025 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1001 + ((char *) de - bh->b_data); 1026 + ((char *) de - bh->b_data);
1002 1027
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1052 return ERR_PTR(-EIO); 1077 return ERR_PTR(-EIO);
1053 } 1078 }
1054 inode = ext4_iget(dir->i_sb, ino); 1079 inode = ext4_iget(dir->i_sb, ino);
1055 if (IS_ERR(inode)) 1080 if (unlikely(IS_ERR(inode))) {
1056 return ERR_CAST(inode); 1081 if (PTR_ERR(inode) == -ESTALE) {
1082 ext4_error(dir->i_sb, __func__,
1083 "deleted inode referenced: %u",
1084 ino);
1085 return ERR_PTR(-EIO);
1086 } else {
1087 return ERR_CAST(inode);
1088 }
1089 }
1057 } 1090 }
1058 return d_splice_alias(inode, dentry); 1091 return d_splice_alias(inode, dentry);
1059} 1092}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
1109 * Returns pointer to last entry moved. 1142 * Returns pointer to last entry moved.
1110 */ 1143 */
1111static struct ext4_dir_entry_2 * 1144static struct ext4_dir_entry_2 *
1112dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) 1145dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1146 unsigned blocksize)
1113{ 1147{
1114 unsigned rec_len = 0; 1148 unsigned rec_len = 0;
1115 1149
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1118 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1119 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1120 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
1121 ext4_rec_len_to_disk(rec_len); 1155 ext4_rec_len_to_disk(rec_len, blocksize);
1122 de->inode = 0; 1156 de->inode = 0;
1123 map++; 1157 map++;
1124 to += rec_len; 1158 to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1130 * Compact each dir entry in the range to the minimal rec_len. 1164 * Compact each dir entry in the range to the minimal rec_len.
1131 * Returns pointer to last entry in range. 1165 * Returns pointer to last entry in range.
1132 */ 1166 */
1133static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) 1167static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1134{ 1168{
1135 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1169 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1136 unsigned rec_len = 0; 1170 unsigned rec_len = 0;
1137 1171
1138 prev = to = de; 1172 prev = to = de;
1139 while ((char*)de < base + size) { 1173 while ((char*)de < base + blocksize) {
1140 next = ext4_next_entry(de); 1174 next = ext4_next_entry(de, blocksize);
1141 if (de->inode && de->name_len) { 1175 if (de->inode && de->name_len) {
1142 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1176 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1143 if (de > to) 1177 if (de > to)
1144 memmove(to, de, rec_len); 1178 memmove(to, de, rec_len);
1145 to->rec_len = ext4_rec_len_to_disk(rec_len); 1179 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1146 prev = to; 1180 prev = to;
1147 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1181 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1148 } 1182 }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1215 hash2, split, count-split)); 1249 hash2, split, count-split));
1216 1250
1217 /* Fancy dance to stay within two buffers */ 1251 /* Fancy dance to stay within two buffers */
1218 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1252 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1219 de = dx_pack_dirents(data1, blocksize); 1253 de = dx_pack_dirents(data1, blocksize);
1220 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1254 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1221 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1255 blocksize);
1256 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
1257 blocksize);
1222 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1258 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1223 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1259 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1224 1260
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1268 const char *name = dentry->d_name.name; 1304 const char *name = dentry->d_name.name;
1269 int namelen = dentry->d_name.len; 1305 int namelen = dentry->d_name.len;
1270 unsigned int offset = 0; 1306 unsigned int offset = 0;
1307 unsigned int blocksize = dir->i_sb->s_blocksize;
1271 unsigned short reclen; 1308 unsigned short reclen;
1272 int nlen, rlen, err; 1309 int nlen, rlen, err;
1273 char *top; 1310 char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1275 reclen = EXT4_DIR_REC_LEN(namelen); 1312 reclen = EXT4_DIR_REC_LEN(namelen);
1276 if (!de) { 1313 if (!de) {
1277 de = (struct ext4_dir_entry_2 *)bh->b_data; 1314 de = (struct ext4_dir_entry_2 *)bh->b_data;
1278 top = bh->b_data + dir->i_sb->s_blocksize - reclen; 1315 top = bh->b_data + blocksize - reclen;
1279 while ((char *) de <= top) { 1316 while ((char *) de <= top) {
1280 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1281 bh, offset)) { 1318 bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1287 return -EEXIST; 1324 return -EEXIST;
1288 } 1325 }
1289 nlen = EXT4_DIR_REC_LEN(de->name_len); 1326 nlen = EXT4_DIR_REC_LEN(de->name_len);
1290 rlen = ext4_rec_len_from_disk(de->rec_len); 1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1291 if ((de->inode? rlen - nlen: rlen) >= reclen) 1328 if ((de->inode? rlen - nlen: rlen) >= reclen)
1292 break; 1329 break;
1293 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1330 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1306 1343
1307 /* By now the buffer is marked for journaling */ 1344 /* By now the buffer is marked for journaling */
1308 nlen = EXT4_DIR_REC_LEN(de->name_len); 1345 nlen = EXT4_DIR_REC_LEN(de->name_len);
1309 rlen = ext4_rec_len_from_disk(de->rec_len); 1346 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1310 if (de->inode) { 1347 if (de->inode) {
1311 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1348 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1312 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); 1349 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1313 de->rec_len = ext4_rec_len_to_disk(nlen); 1350 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1314 de = de1; 1351 de = de1;
1315 } 1352 }
1316 de->file_type = EXT4_FT_UNKNOWN; 1353 de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1380 /* The 0th block becomes the root, move the dirents out */ 1417 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot; 1418 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde + 1419 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len)); 1420 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1384 if ((char *) de >= (((char *) root) + blocksize)) { 1421 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__, 1422 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu", 1423 "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1402 memcpy (data1, de, len); 1439 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1440 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1441 top = data1 + len;
1405 while ((char *)(de2 = ext4_next_entry(de)) < top) 1442 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1406 de = de2; 1443 de = de2;
1407 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1444 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1445 blocksize);
1408 /* Initialize the root; the dot dirents already exist */ 1446 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1447 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); 1448 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1449 blocksize);
1411 memset (&root->info, 0, sizeof(root->info)); 1450 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1451 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1452 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1488 return retval; 1527 return retval;
1489 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1490 de->inode = 0; 1529 de->inode = 0;
1491 de->rec_len = ext4_rec_len_to_disk(blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1492 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1493} 1532}
1494 1533
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1551 goto cleanup; 1590 goto cleanup;
1552 node2 = (struct dx_node *)(bh2->b_data); 1591 node2 = (struct dx_node *)(bh2->b_data);
1553 entries2 = node2->entries; 1592 entries2 = node2->entries;
1554 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); 1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize);
1555 node2->fake.inode = 0; 1595 node2->fake.inode = 0;
1556 BUFFER_TRACE(frame->bh, "get_write_access"); 1596 BUFFER_TRACE(frame->bh, "get_write_access");
1557 err = ext4_journal_get_write_access(handle, frame->bh); 1597 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
1639 struct buffer_head *bh) 1679 struct buffer_head *bh)
1640{ 1680{
1641 struct ext4_dir_entry_2 *de, *pde; 1681 struct ext4_dir_entry_2 *de, *pde;
1682 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1683 int i;
1643 1684
1644 i = 0; 1685 i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 ext4_journal_get_write_access(handle, bh); 1693 ext4_journal_get_write_access(handle, bh);
1653 if (pde) 1694 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1695 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len) + 1696 ext4_rec_len_from_disk(pde->rec_len,
1656 ext4_rec_len_from_disk(de->rec_len)); 1697 blocksize) +
1698 ext4_rec_len_from_disk(de->rec_len,
1699 blocksize),
1700 blocksize);
1657 else 1701 else
1658 de->inode = 0; 1702 de->inode = 0;
1659 dir->i_version++; 1703 dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
1661 ext4_handle_dirty_metadata(handle, dir, bh); 1705 ext4_handle_dirty_metadata(handle, dir, bh);
1662 return 0; 1706 return 0;
1663 } 1707 }
1664 i += ext4_rec_len_from_disk(de->rec_len); 1708 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
1665 pde = de; 1709 pde = de;
1666 de = ext4_next_entry(de); 1710 de = ext4_next_entry(de, blocksize);
1667 } 1711 }
1668 return -ENOENT; 1712 return -ENOENT;
1669} 1713}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1793 struct inode *inode; 1837 struct inode *inode;
1794 struct buffer_head *dir_block; 1838 struct buffer_head *dir_block;
1795 struct ext4_dir_entry_2 *de; 1839 struct ext4_dir_entry_2 *de;
1840 unsigned int blocksize = dir->i_sb->s_blocksize;
1796 int err, retries = 0; 1841 int err, retries = 0;
1797 1842
1798 if (EXT4_DIR_LINK_MAX(dir)) 1843 if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1869 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1870 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1871 de->name_len = 1;
1827 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1872 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
1873 blocksize);
1828 strcpy(de->name, "."); 1874 strcpy(de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1875 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = ext4_next_entry(de); 1876 de = ext4_next_entry(de, blocksize);
1831 de->inode = cpu_to_le32(dir->i_ino); 1877 de->inode = cpu_to_le32(dir->i_ino);
1832 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1878 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
1833 EXT4_DIR_REC_LEN(1)); 1879 blocksize);
1834 de->name_len = 2; 1880 de->name_len = 2;
1835 strcpy(de->name, ".."); 1881 strcpy(de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1882 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
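
For reference, the block these lines build (EXT4_DIR_REC_LEN(n) rounds 8 + n up to a multiple of four, so both dot entries need 12 bytes):

    /* fresh directory block, blocksize bs:
     *
     *   offset  0: "."   inode = self    rec_len = 12
     *   offset 12: ".."  inode = parent  rec_len = bs - 12  (rest of block)
     */
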
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
1885 return 1; 1931 return 1;
1886 } 1932 }
1887 de = (struct ext4_dir_entry_2 *) bh->b_data; 1933 de = (struct ext4_dir_entry_2 *) bh->b_data;
1888 de1 = ext4_next_entry(de); 1934 de1 = ext4_next_entry(de, sb->s_blocksize);
1889 if (le32_to_cpu(de->inode) != inode->i_ino || 1935 if (le32_to_cpu(de->inode) != inode->i_ino ||
1890 !le32_to_cpu(de1->inode) || 1936 !le32_to_cpu(de1->inode) ||
1891 strcmp(".", de->name) || 1937 strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
1896 brelse(bh); 1942 brelse(bh);
1897 return 1; 1943 return 1;
1898 } 1944 }
1899 offset = ext4_rec_len_from_disk(de->rec_len) + 1945 offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
1900 ext4_rec_len_from_disk(de1->rec_len); 1946 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
1901 de = ext4_next_entry(de1); 1947 de = ext4_next_entry(de1, sb->s_blocksize);
1902 while (offset < inode->i_size) { 1948 while (offset < inode->i_size) {
1903 if (!bh || 1949 if (!bh ||
1904 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1950 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
1927 brelse(bh); 1973 brelse(bh);
1928 return 0; 1974 return 0;
1929 } 1975 }
1930 offset += ext4_rec_len_from_disk(de->rec_len); 1976 offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
1931 de = ext4_next_entry(de); 1977 de = ext4_next_entry(de, sb->s_blocksize);
1932 } 1978 }
1933 brelse(bh); 1979 brelse(bh);
1934 return 1; 1980 return 1;
@@ -2297,8 +2343,8 @@ retry:
2297 return err; 2343 return err;
2298} 2344}
2299 2345
2300#define PARENT_INO(buffer) \ 2346#define PARENT_INO(buffer, size) \
2301 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) 2347 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
2302 2348
2303/* 2349/*
2304 * Anybody can rename anything with this: the permission checks are left to the 2350 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2311 struct inode *old_inode, *new_inode; 2357 struct inode *old_inode, *new_inode;
2312 struct buffer_head *old_bh, *new_bh, *dir_bh; 2358 struct buffer_head *old_bh, *new_bh, *dir_bh;
2313 struct ext4_dir_entry_2 *old_de, *new_de; 2359 struct ext4_dir_entry_2 *old_de, *new_de;
2314 int retval; 2360 int retval, force_da_alloc = 0;
2315 2361
2316 old_bh = new_bh = dir_bh = NULL; 2362 old_bh = new_bh = dir_bh = NULL;
2317 2363
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2404 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2359 if (!dir_bh) 2405 if (!dir_bh)
2360 goto end_rename; 2406 goto end_rename;
2361 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2407 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2408 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2362 goto end_rename; 2409 goto end_rename;
2363 retval = -EMLINK; 2410 retval = -EMLINK;
2364 if (!new_inode && new_dir != old_dir && 2411 if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2430 if (dir_bh) { 2477 if (dir_bh) {
2431 BUFFER_TRACE(dir_bh, "get_write_access"); 2478 BUFFER_TRACE(dir_bh, "get_write_access");
2432 ext4_journal_get_write_access(handle, dir_bh); 2479 ext4_journal_get_write_access(handle, dir_bh);
2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2480 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2481 cpu_to_le32(new_dir->i_ino);
2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2482 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2483 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2436 ext4_dec_count(handle, old_dir); 2484 ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2449 ext4_mark_inode_dirty(handle, new_inode); 2497 ext4_mark_inode_dirty(handle, new_inode);
2450 if (!new_inode->i_nlink) 2498 if (!new_inode->i_nlink)
2451 ext4_orphan_add(handle, new_inode); 2499 ext4_orphan_add(handle, new_inode);
2500 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2501 force_da_alloc = 1;
2452 } 2502 }
2453 retval = 0; 2503 retval = 0;
2454 2504
@@ -2457,6 +2507,8 @@ end_rename:
2457 brelse(old_bh); 2507 brelse(old_bh);
2458 brelse(new_bh); 2508 brelse(new_bh);
2459 ext4_journal_stop(handle); 2509 ext4_journal_stop(handle);
2510 if (retval == 0 && force_da_alloc)
2511 ext4_alloc_da_blocks(old_inode);
2460 return retval; 2512 return retval;
2461} 2513}
2462 2514
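
The force_da_alloc path above covers the second classic replace idiom, write-to-temp-then-rename; a user-space sketch (paths illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static void replace_config(const char *buf, size_t len)
    {
            int fd = open("app.conf.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);

            write(fd, buf, len);
            close(fd);
            /* with auto_da_alloc (the default), the incoming inode's delayed
             * blocks are flushed, so a crash can't leave "app.conf" empty */
            rename("app.conf.tmp", "app.conf");
    }
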
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
939 ext4_group_t flex_group; 939 ext4_group_t flex_group;
940 flex_group = ext4_flex_group(sbi, input->group); 940 flex_group = ext4_flex_group(sbi, input->group);
941 sbi->s_flex_groups[flex_group].free_blocks += 941 atomic_add(input->free_blocks_count,
942 input->free_blocks_count; 942 &sbi->s_flex_groups[flex_group].free_blocks);
943 sbi->s_flex_groups[flex_group].free_inodes += 943 atomic_add(EXT4_INODES_PER_GROUP(sb),
944 EXT4_INODES_PER_GROUP(sb); 944 &sbi->s_flex_groups[flex_group].free_inodes);
945 } 945 }
946 946
947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923d..2958f4e6f222 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/ctype.h>
38#include <linux/marker.h> 39#include <linux/marker.h>
39#include <linux/log2.h> 40#include <linux/log2.h>
40#include <linux/crc16.h> 41#include <linux/crc16.h>
@@ -48,6 +49,7 @@
48#include "group.h" 49#include "group.h"
49 50
50struct proc_dir_entry *ext4_proc_root; 51struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset;
51 53
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 55 unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
577 ext4_commit_super(sb, es, 1); 579 ext4_commit_super(sb, es, 1);
578 } 580 }
579 if (sbi->s_proc) { 581 if (sbi->s_proc) {
580 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
581 remove_proc_entry(sb->s_id, ext4_proc_root); 582 remove_proc_entry(sb->s_id, ext4_proc_root);
582 } 583 }
584 kobject_del(&sbi->s_kobj);
583 585
584 for (i = 0; i < sbi->s_gdb_count; i++) 586 for (i = 0; i < sbi->s_gdb_count; i++)
585 brelse(sbi->s_group_desc[i]); 587 brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
615 ext4_blkdev_remove(sbi); 617 ext4_blkdev_remove(sbi);
616 } 618 }
617 sb->s_fs_info = NULL; 619 sb->s_fs_info = NULL;
620 /*
621 * Now that we are completely done shutting down the
622 * superblock, we need to actually destroy the kobject.
623 */
624 unlock_kernel();
625 unlock_super(sb);
626 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock);
618 kfree(sbi); 631 kfree(sbi);
619 return; 632 return;
620} 633}
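
The wait_for_completion() above pairs with the kobject's release hook, which this hunk does not show; the counterpart would look roughly like (sketch, assuming s_kobj and s_kobj_unregister live in ext4_sb_info):

    static void ext4_sb_release(struct kobject *kobj)
    {
            struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
                                                    s_kobj);

            complete(&sbi->s_kobj_unregister);  /* lets put_super finish */
    }
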
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
803 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 816 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
804 seq_puts(seq, ",noacl"); 817 seq_puts(seq, ",noacl");
805#endif 818#endif
806 if (!test_opt(sb, RESERVATION))
807 seq_puts(seq, ",noreservation");
808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 819 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
809 seq_printf(seq, ",commit=%u", 820 seq_printf(seq, ",commit=%u",
810 (unsigned) (sbi->s_commit_interval / HZ)); 821 (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
855 if (test_opt(sb, DATA_ERR_ABORT)) 866 if (test_opt(sb, DATA_ERR_ABORT))
856 seq_puts(seq, ",data_err=abort"); 867 seq_puts(seq, ",data_err=abort");
857 868
869 if (test_opt(sb, NO_AUTO_DA_ALLOC))
870 seq_puts(seq, ",noauto_da_alloc");
871
858 ext4_show_quota_options(seq, sb); 872 ext4_show_quota_options(seq, sb);
859 return 0; 873 return 0;
860} 874}
@@ -1004,7 +1018,7 @@ enum {
1004 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1018 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1005 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1019 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
1006 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1020 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1007 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1021 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
1008 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1022 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1009 Opt_journal_update, Opt_journal_dev, 1023 Opt_journal_update, Opt_journal_dev,
1010 Opt_journal_checksum, Opt_journal_async_commit, 1024 Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
1012 Opt_data_err_abort, Opt_data_err_ignore, 1026 Opt_data_err_abort, Opt_data_err_ignore,
1013 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1014 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1015 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1016 Opt_grpquota, Opt_i_version, 1030 Opt_usrquota, Opt_grpquota, Opt_i_version,
1017 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1018 Opt_inode_readahead_blks, Opt_journal_ioprio 1032 Opt_inode_readahead_blks, Opt_journal_ioprio
1019}; 1033};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
1039 {Opt_nouser_xattr, "nouser_xattr"}, 1053 {Opt_nouser_xattr, "nouser_xattr"},
1040 {Opt_acl, "acl"}, 1054 {Opt_acl, "acl"},
1041 {Opt_noacl, "noacl"}, 1055 {Opt_noacl, "noacl"},
1042 {Opt_reservation, "reservation"},
1043 {Opt_noreservation, "noreservation"},
1044 {Opt_noload, "noload"}, 1056 {Opt_noload, "noload"},
1045 {Opt_nobh, "nobh"}, 1057 {Opt_nobh, "nobh"},
1046 {Opt_bh, "bh"}, 1058 {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
1068 {Opt_quota, "quota"}, 1080 {Opt_quota, "quota"},
1069 {Opt_usrquota, "usrquota"}, 1081 {Opt_usrquota, "usrquota"},
1070 {Opt_barrier, "barrier=%u"}, 1082 {Opt_barrier, "barrier=%u"},
1083 {Opt_barrier, "barrier"},
1084 {Opt_nobarrier, "nobarrier"},
1071 {Opt_i_version, "i_version"}, 1085 {Opt_i_version, "i_version"},
1072 {Opt_stripe, "stripe=%u"}, 1086 {Opt_stripe, "stripe=%u"},
1073 {Opt_resize, "resize"}, 1087 {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
1075 {Opt_nodelalloc, "nodelalloc"}, 1089 {Opt_nodelalloc, "nodelalloc"},
1076 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1077 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1091 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1093 {Opt_auto_da_alloc, "auto_da_alloc"},
1094 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1078 {Opt_err, NULL}, 1095 {Opt_err, NULL},
1079}; 1096};
1080 1097
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1207 "not supported\n"); 1224 "not supported\n");
1208 break; 1225 break;
1209#endif 1226#endif
1210 case Opt_reservation:
1211 set_opt(sbi->s_mount_opt, RESERVATION);
1212 break;
1213 case Opt_noreservation:
1214 clear_opt(sbi->s_mount_opt, RESERVATION);
1215 break;
1216 case Opt_journal_update: 1227 case Opt_journal_update:
1217 /* @@@ FIXME */ 1228 /* @@@ FIXME */
1218 /* Eventually we will want to be able to create 1229 /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
1415 case Opt_abort: 1426 case Opt_abort:
1416 set_opt(sbi->s_mount_opt, ABORT); 1427 set_opt(sbi->s_mount_opt, ABORT);
1417 break; 1428 break;
1429 case Opt_nobarrier:
1430 clear_opt(sbi->s_mount_opt, BARRIER);
1431 break;
1418 case Opt_barrier: 1432 case Opt_barrier:
1419 if (match_int(&args[0], &option)) 1433 if (match_int(&args[0], &option)) {
1420 return 0; 1434 set_opt(sbi->s_mount_opt, BARRIER);
1435 break;
1436 }
1421 if (option) 1437 if (option)
1422 set_opt(sbi->s_mount_opt, BARRIER); 1438 set_opt(sbi->s_mount_opt, BARRIER);
1423 else 1439 else
@@ -1463,6 +1479,11 @@ set_qf_format:
1463 return 0; 1479 return 0;
1464 if (option < 0 || option > (1 << 30)) 1480 if (option < 0 || option > (1 << 30))
1465 return 0; 1481 return 0;
1482 if (option & (option - 1)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
1484 " must be a power of 2\n");
1485 return 0;
1486 }
1466 sbi->s_inode_readahead_blks = option; 1487 sbi->s_inode_readahead_blks = option;
1467 break; 1488 break;
1468 case Opt_journal_ioprio: 1489 case Opt_journal_ioprio:
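The new `option & (option - 1)` test above is the standard bit trick for rejecting non-power-of-two values: a power of two has exactly one bit set, so clearing its lowest set bit leaves zero. Note the kernel test lets 0 through, since 0 & (0 - 1) is also 0; the sketch below rejects it explicitly for strictness:

#include <stdbool.h>
#include <stdio.h>

/* a power of two has exactly one bit set, so v & (v - 1) clears it to 0 */
static bool is_power_of_two(unsigned int v)
{
	return v != 0 && (v & (v - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n", is_power_of_two(32),   /* 1 */
			     is_power_of_two(48),   /* 0 */
			     is_power_of_two(0));   /* 0 */
	return 0;
}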
@@ -1473,6 +1494,19 @@ set_qf_format:
1473 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 1494 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1474 option); 1495 option);
1475 break; 1496 break;
1497 case Opt_noauto_da_alloc:
1498 			set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1499 break;
1500 case Opt_auto_da_alloc:
1501 if (match_int(&args[0], &option)) {
1502 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1503 break;
1504 }
1505 if (option)
1506 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1507 else
1508 			set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1509 break;
1476 default: 1510 default:
1477 printk(KERN_ERR 1511 printk(KERN_ERR
1478 "EXT4-fs: Unrecognized mount option \"%s\" " 1512 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1612 gdp = ext4_get_group_desc(sb, i, &bh); 1646 gdp = ext4_get_group_desc(sb, i, &bh);
1613 1647
1614 flex_group = ext4_flex_group(sbi, i); 1648 flex_group = ext4_flex_group(sbi, i);
1615 sbi->s_flex_groups[flex_group].free_inodes += 1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
1616 ext4_free_inodes_count(sb, gdp); 1650 ext4_free_inodes_count(sb, gdp));
1617 sbi->s_flex_groups[flex_group].free_blocks += 1651 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
1618 ext4_free_blks_count(sb, gdp); 1652 ext4_free_blks_count(sb, gdp));
1653 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
1654 ext4_used_dirs_count(sb, gdp));
1619 } 1655 }
1620 1656
1621 return 1; 1657 return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1991 return 0; 2027 return 0;
1992} 2028}
1993 2029
2030/* sysfs support */
2031
2032struct ext4_attr {
2033 struct attribute attr;
2034 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2035 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2036 const char *, size_t);
2037 int offset;
2038};
2039
2040static int parse_strtoul(const char *buf,
2041 unsigned long max, unsigned long *value)
2042{
2043 char *endp;
2044
2045 while (*buf && isspace(*buf))
2046 buf++;
2047 *value = simple_strtoul(buf, &endp, 0);
2048 while (*endp && isspace(*endp))
2049 endp++;
2050 if (*endp || *value > max)
2051 return -EINVAL;
2052
2053 return 0;
2054}
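parse_strtoul() above trims surrounding whitespace, parses in whatever base simple_strtoul() detects (0 means auto-detect 0x/0 prefixes), and rejects trailing junk or values above max. A hedged userspace analogue using strtoul(), for illustration only:

#include <ctype.h>
#include <stdlib.h>

/* illustrative userspace analogue of parse_strtoul() */
static int parse_strtoul_user(const char *buf, unsigned long max,
			      unsigned long *value)
{
	char *endp;

	while (*buf && isspace((unsigned char)*buf))
		buf++;
	*value = strtoul(buf, &endp, 0);
	while (*endp && isspace((unsigned char)*endp))
		endp++;
	if (*endp || *value > max)
		return -1;	/* -EINVAL in the kernel version */
	return 0;
}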
2055
2056static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2057 struct ext4_sb_info *sbi,
2058 char *buf)
2059{
2060 return snprintf(buf, PAGE_SIZE, "%llu\n",
2061 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2062}
2063
2064static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2065 struct ext4_sb_info *sbi, char *buf)
2066{
2067 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2068
2069 return snprintf(buf, PAGE_SIZE, "%lu\n",
2070 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2071 sbi->s_sectors_written_start) >> 1);
2072}
2073
2074static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2075 struct ext4_sb_info *sbi, char *buf)
2076{
2077 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2078
2079 return snprintf(buf, PAGE_SIZE, "%llu\n",
2080 sbi->s_kbytes_written +
2081 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2082 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2083}
2084
2085static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2086 struct ext4_sb_info *sbi,
2087 const char *buf, size_t count)
2088{
2089 unsigned long t;
2090
2091 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL;
2093
2094 /* inode_readahead_blks must be a power of 2 */
2095 if (t & (t-1))
2096 return -EINVAL;
2097
2098 sbi->s_inode_readahead_blks = t;
2099 return count;
2100}
2101
2102static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf)
2104{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106
2107 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2108}
2109
2110static ssize_t sbi_ui_store(struct ext4_attr *a,
2111 struct ext4_sb_info *sbi,
2112 const char *buf, size_t count)
2113{
2114 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2115 unsigned long t;
2116
2117 if (parse_strtoul(buf, 0xffffffff, &t))
2118 return -EINVAL;
2119 *ui = t;
2120 return count;
2121}
2122
2123#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2124static struct ext4_attr ext4_attr_##_name = { \
2125 .attr = {.name = __stringify(_name), .mode = _mode }, \
2126 .show = _show, \
2127 .store = _store, \
2128 .offset = offsetof(struct ext4_sb_info, _elname), \
2129}
2130#define EXT4_ATTR(name, mode, show, store) \
2131static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2132
2133#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2134#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2135#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2136 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2137#define ATTR_LIST(name) &ext4_attr_##name.attr
2138
2139EXT4_RO_ATTR(delayed_allocation_blocks);
2140EXT4_RO_ATTR(session_write_kbytes);
2141EXT4_RO_ATTR(lifetime_write_kbytes);
2142EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2143 inode_readahead_blks_store, s_inode_readahead_blks);
2144EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2145EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2146EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2147EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2148EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2149EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2150
2151static struct attribute *ext4_attrs[] = {
2152 ATTR_LIST(delayed_allocation_blocks),
2153 ATTR_LIST(session_write_kbytes),
2154 ATTR_LIST(lifetime_write_kbytes),
2155 ATTR_LIST(inode_readahead_blks),
2156 ATTR_LIST(mb_stats),
2157 ATTR_LIST(mb_max_to_scan),
2158 ATTR_LIST(mb_min_to_scan),
2159 ATTR_LIST(mb_order2_req),
2160 ATTR_LIST(mb_stream_req),
2161 ATTR_LIST(mb_group_prealloc),
2162 NULL,
2163};
2164
2165static ssize_t ext4_attr_show(struct kobject *kobj,
2166 struct attribute *attr, char *buf)
2167{
2168 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2169 s_kobj);
2170 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2171
2172 return a->show ? a->show(a, sbi, buf) : 0;
2173}
2174
2175static ssize_t ext4_attr_store(struct kobject *kobj,
2176 struct attribute *attr,
2177 const char *buf, size_t len)
2178{
2179 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2180 s_kobj);
2181 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2182
2183 return a->store ? a->store(a, sbi, buf, len) : 0;
2184}
2185
2186static void ext4_sb_release(struct kobject *kobj)
2187{
2188 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2189 s_kobj);
2190 complete(&sbi->s_kobj_unregister);
2191}
2192
2193
2194static struct sysfs_ops ext4_attr_ops = {
2195 .show = ext4_attr_show,
2196 .store = ext4_attr_store,
2197};
2198
2199static struct kobj_type ext4_ktype = {
2200 .default_attrs = ext4_attrs,
2201 .sysfs_ops = &ext4_attr_ops,
2202 .release = ext4_sb_release,
2203};
2204
1994static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2205static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1995 __releases(kernel_lock) 2206 __releases(kernel_lock)
1996 __acquires(kernel_lock) 2207 __acquires(kernel_lock)
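The block above wires per-filesystem tunables into sysfs: each file under /sys/fs/ext4/<dev>/ is an ext4_attr, and the generic ext4_attr_show/store handlers recover the owning ext4_sb_info with container_of(), because s_kobj is embedded inside it. A stripped-down, compilable sketch of the same embed-and-recover idiom (names are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj    { const char *name; };
struct sb_info { unsigned int readahead; struct kobj kobj; };

static void show(struct kobj *k)
{
	/* recover the enclosing sb_info from the embedded kobject */
	struct sb_info *sbi = container_of(k, struct sb_info, kobj);
	printf("%s: %u\n", k->name, sbi->readahead);
}

int main(void)
{
	struct sb_info sbi = { .readahead = 32, .kobj = { "sda1" } };
	show(&sbi.kobj);
	return 0;
}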
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2232 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2022 if (!sbi) 2233 if (!sbi)
2023 return -ENOMEM; 2234 return -ENOMEM;
2235
2236 sbi->s_blockgroup_lock =
2237 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2238 if (!sbi->s_blockgroup_lock) {
2239 kfree(sbi);
2240 return -ENOMEM;
2241 }
2024 sb->s_fs_info = sbi; 2242 sb->s_fs_info = sbi;
2025 sbi->s_mount_opt = 0; 2243 sbi->s_mount_opt = 0;
2026 sbi->s_resuid = EXT4_DEF_RESUID; 2244 sbi->s_resuid = EXT4_DEF_RESUID;
2027 sbi->s_resgid = EXT4_DEF_RESGID; 2245 sbi->s_resgid = EXT4_DEF_RESGID;
2028 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2246 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2029 sbi->s_sb_block = sb_block; 2247 sbi->s_sb_block = sb_block;
2248 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
2249 sectors[1]);
2030 2250
2031 unlock_kernel(); 2251 unlock_kernel();
2032 2252
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2064 sb->s_magic = le16_to_cpu(es->s_magic); 2284 sb->s_magic = le16_to_cpu(es->s_magic);
2065 if (sb->s_magic != EXT4_SUPER_MAGIC) 2285 if (sb->s_magic != EXT4_SUPER_MAGIC)
2066 goto cantfind_ext4; 2286 goto cantfind_ext4;
2287 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
2067 2288
2068 /* Set defaults before we parse the mount options */ 2289 /* Set defaults before we parse the mount options */
2069 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2290 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2101 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2102 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2103 2324
2104 set_opt(sbi->s_mount_opt, RESERVATION);
2105 set_opt(sbi->s_mount_opt, BARRIER); 2325 set_opt(sbi->s_mount_opt, BARRIER);
2106 2326
2107 /* 2327 /*
@@ -2288,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2288 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2508 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2289 goto cantfind_ext4; 2509 goto cantfind_ext4;
2290 2510
2511 /* check blocks count against device size */
2512 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2513 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
2514 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
2515 "exceeds size of device (%llu blocks)\n",
2516 ext4_blocks_count(es), blocks_count);
2517 goto failed_mount;
2518 }
2519
2291 /* 2520 /*
2292 * It makes no sense for the first data block to be beyond the end 2521 * It makes no sense for the first data block to be beyond the end
2293 * of the filesystem. 2522 * of the filesystem.
@@ -2325,14 +2554,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2325#ifdef CONFIG_PROC_FS 2554#ifdef CONFIG_PROC_FS
2326 if (ext4_proc_root) 2555 if (ext4_proc_root)
2327 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2556 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2328
2329 if (sbi->s_proc)
2330 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2331 &ext4_ui_proc_fops,
2332 &sbi->s_inode_readahead_blks);
2333#endif 2557#endif
2334 2558
2335 bgl_lock_init(&sbi->s_blockgroup_lock); 2559 bgl_lock_init(sbi->s_blockgroup_lock);
2336 2560
2337 for (i = 0; i < db_count; i++) { 2561 for (i = 0; i < db_count; i++) {
2338 block = descriptor_loc(sb, logical_sb_block, i); 2562 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2788,16 @@ no_journal:
2564 goto failed_mount4; 2788 goto failed_mount4;
2565 } 2789 }
2566 2790
2791 sbi->s_kobj.kset = ext4_kset;
2792 init_completion(&sbi->s_kobj_unregister);
2793 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
2794 "%s", sb->s_id);
2795 if (err) {
2796 ext4_mb_release(sb);
2797 ext4_ext_release(sb);
2798 goto failed_mount4;
2799 	}
2800
2567 /* 2801 /*
2568 * akpm: core read_super() calls in here with the superblock locked. 2802 * akpm: core read_super() calls in here with the superblock locked.
2569 * That deadlocks, because orphan cleanup needs to lock the superblock 2803 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2852,6 @@ failed_mount2:
2618 kfree(sbi->s_group_desc); 2852 kfree(sbi->s_group_desc);
2619failed_mount: 2853failed_mount:
2620 if (sbi->s_proc) { 2854 if (sbi->s_proc) {
2621 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2622 remove_proc_entry(sb->s_id, ext4_proc_root); 2855 remove_proc_entry(sb->s_id, ext4_proc_root);
2623 } 2856 }
2624#ifdef CONFIG_QUOTA 2857#ifdef CONFIG_QUOTA
@@ -2913,6 +3146,10 @@ static int ext4_commit_super(struct super_block *sb,
2913 set_buffer_uptodate(sbh); 3146 set_buffer_uptodate(sbh);
2914 } 3147 }
2915 es->s_wtime = cpu_to_le32(get_seconds()); 3148 es->s_wtime = cpu_to_le32(get_seconds());
3149 es->s_kbytes_written =
3150 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3151 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3152 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2916 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3153 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeblocks_counter)); 3154 &EXT4_SB(sb)->s_freeblocks_counter));
2918 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3155 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
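The sectors[1] counter holds 512-byte sectors written to the device, so shifting the since-mount delta right by one converts it to KiB before adding it to the s_kbytes_written value carried over from the on-disk superblock. The arithmetic, as a sketch:

/* lifetime KiB written = value stored in the superblock at mount time
 * plus half the 512-byte sectors written since mount */
static unsigned long long lifetime_kbytes(unsigned long long stored_kb,
					  unsigned long long sectors_now,
					  unsigned long long sectors_at_mount)
{
	return stored_kb + ((sectors_now - sectors_at_mount) >> 1);
}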
@@ -3647,45 +3884,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3647 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3884 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3648} 3885}
3649 3886
3650#ifdef CONFIG_PROC_FS
3651static int ext4_ui_proc_show(struct seq_file *m, void *v)
3652{
3653 unsigned int *p = m->private;
3654
3655 seq_printf(m, "%u\n", *p);
3656 return 0;
3657}
3658
3659static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3660{
3661 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3662}
3663
3664static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3665 size_t cnt, loff_t *ppos)
3666{
3667 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3668 char str[32];
3669
3670 if (cnt >= sizeof(str))
3671 return -EINVAL;
3672 if (copy_from_user(str, buf, cnt))
3673 return -EFAULT;
3674
3675 *p = simple_strtoul(str, NULL, 0);
3676 return cnt;
3677}
3678
3679const struct file_operations ext4_ui_proc_fops = {
3680 .owner = THIS_MODULE,
3681 .open = ext4_ui_proc_open,
3682 .read = seq_read,
3683 .llseek = seq_lseek,
3684 .release = single_release,
3685 .write = ext4_ui_proc_write,
3686};
3687#endif
3688
3689static struct file_system_type ext4_fs_type = { 3887static struct file_system_type ext4_fs_type = {
3690 .owner = THIS_MODULE, 3888 .owner = THIS_MODULE,
3691 .name = "ext4", 3889 .name = "ext4",
@@ -3719,6 +3917,9 @@ static int __init init_ext4_fs(void)
3719{ 3917{
3720 int err; 3918 int err;
3721 3919
3920 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3921 if (!ext4_kset)
3922 return -ENOMEM;
3722 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3923 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3723 err = init_ext4_mballoc(); 3924 err = init_ext4_mballoc();
3724 if (err) 3925 if (err)
@@ -3760,6 +3961,7 @@ static void __exit exit_ext4_fs(void)
3760 exit_ext4_xattr(); 3961 exit_ext4_xattr();
3761 exit_ext4_mballoc(); 3962 exit_ext4_mballoc();
3762 remove_proc_entry("fs/ext4", NULL); 3963 remove_proc_entry("fs/ext4", NULL);
3964 kset_unregister(ext4_kset);
3763} 3965}
3764 3966
3765MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3967MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index d0a69ff25375..182f9ffe2b51 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET
95 Note that "utf8" is not recommended for FAT filesystems. 95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here. 96 If unsure, you shouldn't set "utf8" here.
97 See <file:Documentation/filesystems/vfat.txt> for more information. 97 See <file:Documentation/filesystems/vfat.txt> for more information.
98
99 Enable any character sets you need in File Systems/Native Language
100 Support.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e00..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
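huge_encode_dev() folds the backing device's major/minor numbers into a single 64-bit value, which statfs then exposes by splitting it across the two 32-bit words of f_fsid. The split itself, as a sketch (the sample id is made up):

#include <stdio.h>

int main(void)
{
	unsigned long long id = 0x0000000800000003ULL; /* illustrative */
	unsigned int val0 = (unsigned int)id;          /* low 32 bits  */
	unsigned int val1 = (unsigned int)(id >> 32);  /* high 32 bits */

	printf("f_fsid = { %#x, %#x }\n", val0, val1);
	return 0;
}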
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cc8e4de2fee5..1ad703150dee 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -117,11 +117,13 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
117{ 117{
118 if (unlikely(newfd == oldfd)) { /* corner case */ 118 if (unlikely(newfd == oldfd)) { /* corner case */
119 struct files_struct *files = current->files; 119 struct files_struct *files = current->files;
120 int retval = oldfd;
121
120 rcu_read_lock(); 122 rcu_read_lock();
121 if (!fcheck_files(files, oldfd)) 123 if (!fcheck_files(files, oldfd))
122 oldfd = -EBADF; 124 retval = -EBADF;
123 rcu_read_unlock(); 125 rcu_read_unlock();
124 return oldfd; 126 return retval;
125 } 127 }
126 return sys_dup3(oldfd, newfd, 0); 128 return sys_dup3(oldfd, newfd, 0);
127} 129}
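The new signed retval local matters because oldfd is an unsigned int: storing -EBADF in it and returning it through the syscall's return value yields a large positive number on 64-bit rather than a negative errno. A userspace sketch of the truncation this presumably guards against:

#include <stdio.h>

static long via_unsigned(void)
{
	unsigned int fd = -9;	/* -EBADF forced into an unsigned int */
	return fd;		/* widens to 4294967287, not -9 */
}

static long via_signed(void)
{
	int retval = -9;	/* a signed local keeps the sign */
	return retval;		/* widens to -9 as intended */
}

int main(void)
{
	printf("%ld %ld\n", via_unsigned(), via_signed());
	return 0;
}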
diff --git a/fs/file_table.c b/fs/file_table.c
index b74a8e1da913..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
169 fmode_t mode, const struct file_operations *fop) 169 fmode_t mode, const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 struct path;
173 172
174 file = get_empty_filp(); 173 file = get_empty_filp();
175 if (!file) 174 if (!file)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa70260e6d1..a24c58e181db 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
199 return retval; 199 return retval;
200} 200}
201 201
202int get_filesystem_list(char * buf) 202int __init get_filesystem_list(char *buf)
203{ 203{
204 int len = 0; 204 int len = 0;
205 struct file_system_type * tmp; 205 struct file_system_type * tmp;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faaf..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
196 struct inode *tail_inode; 196 struct inode *tail_inode;
197 197
198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
199 if (!time_after_eq(inode->dirtied_when, 199 if (time_before(inode->dirtied_when,
200 tail_inode->dirtied_when)) 200 tail_inode->dirtied_when))
201 inode->dirtied_when = jiffies; 201 inode->dirtied_when = jiffies;
202 } 202 }
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
220 wake_up_bit(&inode->i_state, __I_SYNC); 220 wake_up_bit(&inode->i_state, __I_SYNC);
221} 221}
222 222
223static bool inode_dirtied_after(struct inode *inode, unsigned long t)
224{
225 bool ret = time_after(inode->dirtied_when, t);
226#ifndef CONFIG_64BIT
227 /*
228 * For inodes being constantly redirtied, dirtied_when can get stuck.
229 * It _appears_ to be in the future, but is actually in distant past.
230 * This test is necessary to prevent such wrapped-around relative times
231 * from permanently stopping the whole pdflush writeback.
232 */
233 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
234#endif
235 return ret;
236}
237
223/* 238/*
224 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 239 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
225 */ 240 */
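inode_dirtied_after() guards against 32-bit jiffies wrap: time_after() only answers correctly while the two stamps are less than half the counter range apart, so a dirtied_when left untouched for about 2^31 ticks appears to be in the future indefinitely. A small sketch of the failure on unsigned 32-bit counters (mirroring the kernel's time_after macro):

#include <stdint.h>
#include <stdio.h>

/* mirrors the kernel's time_after(): the signed difference is immune
 * to wrap only while the two stamps are less than 2^31 ticks apart */
static int time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t now = 100;
	uint32_t ancient = now + 0x80000000u; /* dirtied ~2^31 ticks ago */

	/* the ancient stamp wrongly reads as being in the future, which
	 * is the stuck case the extra time_before_eq() clamp catches */
	printf("%d\n", time_after32(ancient, now)); /* prints 1 */
	return 0;
}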
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
231 struct inode *inode = list_entry(delaying_queue->prev, 246 struct inode *inode = list_entry(delaying_queue->prev,
232 struct inode, i_list); 247 struct inode, i_list);
233 if (older_than_this && 248 if (older_than_this &&
234 time_after(inode->dirtied_when, *older_than_this)) 249 inode_dirtied_after(inode, *older_than_this))
235 break; 250 break;
236 list_move(&inode->i_list, dispatch_queue); 251 list_move(&inode->i_list, dispatch_queue);
237 } 252 }
@@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
420 * If older_than_this is non-NULL, then only write out inodes which 435 * If older_than_this is non-NULL, then only write out inodes which
421 * had their first dirtying at a time earlier than *older_than_this. 436 * had their first dirtying at a time earlier than *older_than_this.
422 * 437 *
423 * If we're a pdlfush thread, then implement pdflush collision avoidance 438 * If we're a pdflush thread, then implement pdflush collision avoidance
424 * against the entire list. 439 * against the entire list.
425 * 440 *
426 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 441 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
492 continue; /* blockdev has wrong queue */ 507 continue; /* blockdev has wrong queue */
493 } 508 }
494 509
495 /* Was this inode dirtied after sync_sb_inodes was called? */ 510 /*
496 if (time_after(inode->dirtied_when, start)) 511 * Was this inode dirtied after sync_sb_inodes was called?
512 * This keeps sync from extra jobs and livelock.
513 */
514 if (inode_dirtied_after(inode, start))
497 break; 515 break;
498 516
499 /* Is another pdflush already flushing this queue? */ 517 /* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 556 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
539 struct address_space *mapping; 557 struct address_space *mapping;
540 558
541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 559 if (inode->i_state &
560 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
542 continue; 561 continue;
543 mapping = inode->i_mapping; 562 mapping = inode->i_mapping;
544 if (mapping->nrpages == 0) 563 if (mapping->nrpages == 0)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103 /* We don't need to lock fs - think why ;-) */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
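exit_fs(), unshare_fs_struct() and daemonize_fs_struct() above all share one teardown idiom: decrement fs->users under the lock, record in a local `kill` whether it reached zero, drop the lock, and only then free, so free_fs_struct() (which can block in path_put()) never runs with the spinning write lock held. A pthread sketch of that shape (illustrative types):

#include <pthread.h>
#include <stdlib.h>

struct shared {
	pthread_rwlock_t lock;
	int users;
};

/* drop one reference; the free happens outside the lock, as in exit_fs() */
static void put_shared(struct shared *s)
{
	int kill;

	pthread_rwlock_wrlock(&s->lock);
	kill = (--s->users == 0);
	pthread_rwlock_unlock(&s->lock);

	if (kill) {
		pthread_rwlock_destroy(&s->lock);
		free(s);
	}
}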
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
24 execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
54 enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
47 /* return a dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
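__fscache_lookup_cache_tag() above uses the classic optimistic lookup-then-insert pattern: search under the read lock, allocate a candidate with no lock held, then re-search under the write lock before inserting, discarding the candidate if it lost the race. A compact pthread sketch of the same shape (illustrative names, error handling trimmed; note the kernel bumps usage with atomic_inc(), since a shared read lock alone does not serialize the counter):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct tag { struct tag *next; int usage; char name[32]; };

static struct tag *tags;
static pthread_rwlock_t tags_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct tag *find(const char *name)
{
	struct tag *t;
	for (t = tags; t; t = t->next)
		if (!strcmp(t->name, name))
			return t;
	return NULL;
}

struct tag *lookup_tag(const char *name)
{
	struct tag *t, *cand;

	pthread_rwlock_rdlock(&tags_lock);	/* fast path: read lock */
	t = find(name);
	if (t)
		t->usage++;	/* would be atomic_inc() in the kernel */
	pthread_rwlock_unlock(&tags_lock);
	if (t)
		return t;

	cand = calloc(1, sizeof(*cand));	/* allocate unlocked */
	if (!cand)
		return NULL;
	cand->usage = 1;
	strncpy(cand->name, name, sizeof(cand->name) - 1);

	pthread_rwlock_wrlock(&tags_lock);	/* recheck under write lock */
	t = find(name);
	if (t) {
		t->usage++;			/* lost the race: reuse */
	} else {
		cand->next = tags;
		tags = cand;
		t = cand;
		cand = NULL;
	}
	pthread_rwlock_unlock(&tags_lock);
	free(cand);				/* discard the loser, if any */
	return t;
}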
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs's to use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290	_leave(" = -EEXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154	/* now we need to see whether the backing objects for this cookie
155	 * exist yet; if not, there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
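/*
 * Illustrative sketch (not part of this file): a netfs calls the
 * fscache_update_cookie() wrapper after changing whatever its get_aux()
 * function reports, so that each backing cache rewrites its index entry.
 * The "vnode" naming below is hypothetical:
 *
 *	vnode->status.mtime = new_mtime;
 *	fscache_update_cookie(vnode->cache);
 */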
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
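/*
 * Illustrative sketch (not part of this file): a netfs normally calls the
 * fscache_relinquish_cookie() wrapper when it evicts the corresponding
 * inode, passing a non-zero retire argument if the cached data should be
 * discarded as well.  The "vnode" naming below is hypothetical:
 *
 *	fscache_relinquish_cookie(vnode->cache, 0);
 *	vnode->cache = NULL;
 */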
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
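/*
 * Illustrative sketch (not part of this file): a netfs creates its entry
 * under FSDEF by registering a struct fscache_netfs through the
 * fscache_register_netfs() wrapper; the name and version used here are
 * hypothetical:
 *
 *	static struct fscache_netfs example_netfs = {
 *		.name		= "examplefs",
 *		.version	= 1,
 *	};
 *
 *	ret = fscache_register_netfs(&example_netfs);
 *	...
 *	fscache_unregister_netfs(&example_netfs);
 */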
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
99 * structure version number of the netfs for which this entry is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64	if ((unsigned long long)*_pos > HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
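/*
 * A sketch of the rendered /proc/fs/fscache/histogram output (the counts
 * are illustrative and HZ is assumed to be 250):
 *
 *	JIFS  SECS  OBJ INST  OP RUNS  OBJ RUNS  RETRV DLY RETRIEVLS
 *	===== ===== ========= ========= ========= ========= =========
 *	   1  0.004         3        17         9         0         5
 *	   2  0.008         1         6         4         1         2
 */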
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..1c341304621f
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
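/*
 * For example (sketch only), code that needs both a cookie's lock and its
 * parent's lock takes them in the order above, using the _nested variant
 * for the inner lock so that lockdep does not report a false recursion:
 *
 *	spin_lock(&cookie->lock);
 *	spin_lock_nested(&cookie->parent->lock, 1);
 *	...
 *	spin_unlock(&cookie->parent->lock);
 *	spin_unlock(&cookie->lock);
 */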
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
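/*
 * Typical use of fscache_hist() (this exact pattern appears in object.c):
 *
 *	start = jiffies;
 *	fscache_object_state_machine(object);
 *	fscache_hist(fscache_objs_histogram, start);
 */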
76
77/*
78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
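/*
 * Worked example: to log the entry points of the COOKIE-level debugging
 * statements, set bit (FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3))
 * in the fscache_debug mask, i.e. 1 << 3 = 8; enabling all three point
 * types for the PAGE level would add 7 << 6 = 0x1c0.
 */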
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
27MODULE_PARM_DESC(defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
33MODULE_PARM_DESC(defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
39MODULE_PARM_DESC(debug,
40 "FS-Cache debugging mask");
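/*
 * Sketch of runtime use (the values written are illustrative): as the
 * parameters above are declared S_IWUSR | S_IRUGO, root can adjust them
 * through sysfs, e.g.:
 *
 *	echo 8 >/sys/module/fscache/parameters/debug
 *	echo 0 >/sys/module/fscache/parameters/defer_lookup
 */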
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70
71 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
72 if (!fscache_root) {
	ret = -ENOMEM;	/* 'ret' would otherwise still be 0 here */
73 goto error_kobj;
	}
74
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
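/*
 * Both functions are handed to wait_on_bit() as its action routine; for
 * example, this pattern appears in cookie.c:
 *
 *	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
 *		    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
 */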
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
	/* drop the refs taken on the parent index above */
	atomic_dec(&netfs->primary_index->parent->n_children);
	atomic_dec(&netfs->primary_index->parent->usage);
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
55 * we need to notify the parent when an op that we had outstanding on it
56 * completes
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
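/*
 * In sketch form, the usual lifecycle driven by the handlers below is:
 *
 *	INIT -> LOOKING_UP -> [CREATING ->] AVAILABLE -> ACTIVE <-> UPDATING
 *
 * where negative lookups pass through CREATING and positive lookups go
 * straight to AVAILABLE.  Any of the terminal events (release, retire,
 * withdraw, error) diverts into DYING (or LC_DYING during lookup) and
 * thence into RELEASING, RECYCLING or WITHDRAWING before the object ends
 * up DEAD.
 */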
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
110 * operations and queue dependent operations for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377		 * add ourselves to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
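/*
 * Illustrative sketch (the example_* function names are hypothetical) of
 * how a cache backend's ->lookup_object() op is expected to use the two
 * helpers above:
 *
 *	static void example_lookup_object(struct fscache_object *object)
 *	{
 *		if (!example_find_on_disk(object)) {
 *			fscache_object_lookup_negative(object);
 *			example_create_on_disk(object);
 *		}
 *		fscache_obtained_object(object);
 *	}
 */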
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
658 * - need to break the links to a cached object cookie
659 * - called under two situations:
660 * (1) recycler decides to reclaim an in-use object
661 * (2) a cache is unmounted
662 * - have to take care as the netfs may be relinquishing the cookie at the
663 *   same time
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
705	object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the caller must hold the object's lock
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
774 * This function consults the netfs about the coherency state of an object
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
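/*
 * Illustrative sketch: a cache backend revalidates an on-disk object by
 * handing back the auxiliary data it previously stored (in an xattr,
 * say); the example_* helpers and the buffer are hypothetical:
 *
 *	switch (fscache_check_aux(object, auxbuf, auxlen)) {
 *	case FSCACHE_CHECKAUX_OKAY:
 *		break;					use the object as-is
 *	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
 *		example_update_xattr(object);		refresh stored aux
 *		break;
 *	case FSCACHE_CHECKAUX_OBSOLETE:
 *		example_delete_object(object);		discard and recreate
 *		break;
 *	}
 */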
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
54			printk(KERN_ERR "FS-Cache: Unexpected op type %lx\n",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
100		if (object->n_ops > 1) {
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
164			kdebug("%p %p", p->processor, p->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - ops may be submitted only while the object is in one of the following states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
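/*
 * Illustrative sketch (not part of this patch): together, the two helpers
 * above let a netfs gate page release on cache writes still in flight. The
 * my_* names are hypothetical, and the fscache_check_page_write(),
 * fscache_wait_on_page_write() and fscache_uncache_page() wrappers are
 * assumed to come from the public header.
 */
static int my_releasepage(struct page *page, gfp_t gfp)
{
	struct fscache_cookie *cookie = my_page_cookie(page);	/* hypothetical */

	if (PageFsCache(page)) {
		if (fscache_check_page_write(cookie, page)) {
			if (!(gfp & __GFP_WAIT))
				return 0;	/* caller may not sleep */
			fscache_wait_on_page_write(cookie, page);
		}
		fscache_uncache_page(cookie, page);
	}
	return 1;				/* page may be released */
}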
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
172 /* allocate a retrieval operation and attempt to submit it */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
232int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
233 struct page *page,
234 fscache_rw_complete_t end_io_func,
235 void *context,
236 gfp_t gfp)
237{
238 struct fscache_retrieval *op;
239 struct fscache_object *object;
240 int ret;
241
242 _enter("%p,%p,,,", cookie, page);
243
244 fscache_stat(&fscache_n_retrievals);
245
246 if (hlist_empty(&cookie->backing_objects))
247 goto nobufs;
248
249 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
250 ASSERTCMP(page, !=, NULL);
251
252 if (fscache_wait_for_deferred_lookup(cookie) < 0)
253 return -ERESTARTSYS;
254
255 op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
256 if (!op) {
257 _leave(" = -ENOMEM");
258 return -ENOMEM;
259 }
260
261 spin_lock(&cookie->lock);
262
263 if (hlist_empty(&cookie->backing_objects))
264 goto nobufs_unlock;
265 object = hlist_entry(cookie->backing_objects.first,
266 struct fscache_object, cookie_link);
267
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269
270 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock;
272 spin_unlock(&cookie->lock);
273
274 fscache_stat(&fscache_n_retrieval_ops);
275
276 /* pin the netfs read context in case we need to do the actual netfs
277 * read because we've encountered a cache read failure */
278 fscache_get_context(object->cookie, op->context);
279
280 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
283 _debug(">>> WT");
284 fscache_stat(&fscache_n_retrieval_op_waits);
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
287 _debug("<<< GO");
288 }
289
290 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
292 ret = object->cache->ops->allocate_page(op, page, gfp);
293 if (ret == 0)
294 ret = -ENODATA;
295 } else {
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
297 }
298
299 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS)
302 fscache_stat(&fscache_n_retrievals_intr);
303 else if (ret == -ENODATA)
304 fscache_stat(&fscache_n_retrievals_nodata);
305 else if (ret < 0)
306 fscache_stat(&fscache_n_retrievals_nobufs);
307 else
308 fscache_stat(&fscache_n_retrievals_ok);
309
310 fscache_put_retrieval(op);
311 _leave(" = %d", ret);
312 return ret;
313
314nobufs_unlock:
315 spin_unlock(&cookie->lock);
316 kfree(op);
317nobufs:
318 fscache_stat(&fscache_n_retrievals_nobufs);
319 _leave(" = -ENOBUFS");
320 return -ENOBUFS;
321}
322EXPORT_SYMBOL(__fscache_read_or_alloc_page);
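/*
 * Illustrative sketch (not part of this patch): the return-value contract
 * above maps directly onto a netfs readpage implementation. 0 means the
 * cache took the read and end_io_func() will complete the page; -ENODATA
 * and -ENOBUFS mean the netfs must fetch the data itself. The my_* names
 * are hypothetical.
 */
static int my_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = my_file_cookie(file);	/* hypothetical */
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page,
					 my_read_complete,	/* end_io_func */
					 NULL,			/* context */
					 GFP_KERNEL);
	switch (ret) {
	case 0:			/* read dispatched to the cache */
		return 0;
	case -ENOBUFS:		/* no backing object available */
	case -ENODATA:		/* block present but no data yet */
		return my_read_from_server(file, page);
	default:
		return ret;	/* -ENOMEM or -ERESTARTSYS */
	}
}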
323
324/*
325 * read a list of pages from the cache or allocate blocks in which to store
326 * them
327 * - we return:
328 * -ENOMEM - out of memory, some pages may be being read
329 * -ERESTARTSYS - interrupted, some pages may be being read
330 * -ENOBUFS - no backing object or space available in which to cache any
331 * pages not being read
332 * -ENODATA - no data available in the backing object for some or all of
333 * the pages
334 * 0 - dispatched a read on all pages
335 *
336 * end_io_func() will be called for each page read from the cache as it
337 * finishes being read
338 *
339 * any pages for which a read is dispatched will be removed from *pages and
340 * deducted from *nr_pages
341 */
342int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned *nr_pages,
346 fscache_rw_complete_t end_io_func,
347 void *context,
348 gfp_t gfp)
349{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op;
352 struct fscache_object *object;
353 int ret;
354
355 _enter("%p,,%d,,,", cookie, *nr_pages);
356
357 fscache_stat(&fscache_n_retrievals);
358
359 if (hlist_empty(&cookie->backing_objects))
360 goto nobufs;
361
362 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
363 ASSERTCMP(*nr_pages, >, 0);
364 ASSERT(!list_empty(pages));
365
366 if (fscache_wait_for_deferred_lookup(cookie) < 0)
367 return -ERESTARTSYS;
368
369 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op)
371 return -ENOMEM;
372
373 spin_lock(&cookie->lock);
374
375 if (hlist_empty(&cookie->backing_objects))
376 goto nobufs_unlock;
377 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link);
379
380 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock;
382 spin_unlock(&cookie->lock);
383
384 fscache_stat(&fscache_n_retrieval_ops);
385
386 /* pin the netfs read context in case we need to do the actual netfs
387 * read because we've encountered a cache read failure */
388 fscache_get_context(object->cookie, op->context);
389
390 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
393 _debug(">>> WT");
394 fscache_stat(&fscache_n_retrieval_op_waits);
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
397 _debug("<<< GO");
398 }
399
400 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
402 func = object->cache->ops->allocate_pages;
403 else
404 func = object->cache->ops->read_or_alloc_pages;
405 ret = func(op, pages, nr_pages, gfp);
406
407 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS)
410 fscache_stat(&fscache_n_retrievals_intr);
411 else if (ret == -ENODATA)
412 fscache_stat(&fscache_n_retrievals_nodata);
413 else if (ret < 0)
414 fscache_stat(&fscache_n_retrievals_nobufs);
415 else
416 fscache_stat(&fscache_n_retrievals_ok);
417
418 fscache_put_retrieval(op);
419 _leave(" = %d", ret);
420 return ret;
421
422nobufs_unlock:
423 spin_unlock(&cookie->lock);
424 kfree(op);
425nobufs:
426 fscache_stat(&fscache_n_retrievals_nobufs);
427 _leave(" = -ENOBUFS");
428 return -ENOBUFS;
429}
430EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
431
432/*
433 * allocate a block in the cache on which to store a page
434 * - we return:
435 * -ENOMEM - out of memory, nothing done
436 * -ERESTARTSYS - interrupted
437 * -ENOBUFS - no backing object available in which to cache the block
438 * 0 - block allocated
439 */
440int __fscache_alloc_page(struct fscache_cookie *cookie,
441 struct page *page,
442 gfp_t gfp)
443{
444 struct fscache_retrieval *op;
445 struct fscache_object *object;
446 int ret;
447
448 _enter("%p,%p,,,", cookie, page);
449
450 fscache_stat(&fscache_n_allocs);
451
452 if (hlist_empty(&cookie->backing_objects))
453 goto nobufs;
454
455 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
456 ASSERTCMP(page, !=, NULL);
457
458 if (fscache_wait_for_deferred_lookup(cookie) < 0)
459 return -ERESTARTSYS;
460
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op)
463 return -ENOMEM;
464
465 spin_lock(&cookie->lock);
466
467 if (hlist_empty(&cookie->backing_objects))
468 goto nobufs_unlock;
469 object = hlist_entry(cookie->backing_objects.first,
470 struct fscache_object, cookie_link);
471
472 if (fscache_submit_op(object, &op->op) < 0)
473 goto nobufs_unlock;
474 spin_unlock(&cookie->lock);
475
476 fscache_stat(&fscache_n_alloc_ops);
477
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
479 _debug(">>> WT");
480 fscache_stat(&fscache_n_alloc_op_waits);
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
483 _debug("<<< GO");
484 }
485
486 /* ask the cache to honour the operation */
487 ret = object->cache->ops->allocate_page(op, page, gfp);
488
489 if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs);
491 else
492 fscache_stat(&fscache_n_allocs_ok);
493
494 fscache_put_retrieval(op);
495 _leave(" = %d", ret);
496 return ret;
497
498nobufs_unlock:
499 spin_unlock(&cookie->lock);
500 kfree(op);
501nobufs:
502 fscache_stat(&fscache_n_allocs_nobufs);
503 _leave(" = -ENOBUFS");
504 return -ENOBUFS;
505}
506EXPORT_SYMBOL(__fscache_alloc_page);
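/*
 * Illustrative sketch (not part of this patch): fscache_alloc_page() is the
 * reserve-only variant of the retrieval interface, useful when a page is
 * about to be rewritten in full and reading the old cache contents would be
 * wasted work:
 *
 *	ret = fscache_alloc_page(cookie, page, GFP_KERNEL);
 *	if (ret < 0 && ret != -ENOBUFS)
 *		return ret;		-ENOMEM or -ERESTARTSYS
 *	-ENOBUFS just means this page won't be cached this time
 */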
507
508/*
509 * release a write op reference
510 */
511static void fscache_release_write_op(struct fscache_operation *_op)
512{
513 _enter("{OP%x}", _op->debug_id);
514}
515
516/*
517 * perform the background storage of a page into the cache
518 */
519static void fscache_write_op(struct fscache_operation *_op)
520{
521 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie;
525 struct page *page;
526 unsigned n;
527 void *results[1];
528 int ret;
529
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531
532 spin_lock(&cookie->lock);
533 spin_lock(&object->lock);
534
535 if (!fscache_object_is_active(object)) {
536 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave("");
539 return;
540 }
541
542 fscache_stat(&fscache_n_store_calls);
543
544 /* find a page to store */
545 page = NULL;
546 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
547 FSCACHE_COOKIE_PENDING_TAG);
548 if (n != 1)
549 goto superseded;
550 page = results[0];
551 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit)
553 goto superseded;
554
555 radix_tree_tag_clear(&cookie->stores, page->index,
556 FSCACHE_COOKIE_PENDING_TAG);
557
558 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560
561 if (page) {
562 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page);
564 page_cache_release(page);
565 if (ret < 0)
566 fscache_abort_object(object);
567 else
568 fscache_enqueue_operation(&op->op);
569 }
570
571 _leave("");
572 return;
573
574superseded:
575 /* this writer is going away and there aren't any more things to
576 * write */
577 _debug("cease");
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave("");
582}
583
584/*
585 * request a page be stored in the cache
586 * - returns:
587 * -ENOMEM - out of memory, nothing done
588 * -ENOBUFS - no backing object available in which to cache the page
589 * 0 - dispatched a write - it'll call end_io_func() when finished
590 *
591 * if the cookie still has a backing object at this point, that object can be
592 * in one of a few states with respect to storage processing:
593 *
594 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
595 * set)
596 *
597 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
598 * fill op)
599 *
600 * (b) writes deferred till post-creation (mark page for writing and
601 * return immediately)
602 *
603 * (2) negative lookup, object created, initial fill being made from netfs
604 * (FSCACHE_COOKIE_INITIAL_FILL is set)
605 *
606 * (a) fill point not yet reached this page (mark page for writing and
607 * return)
608 *
609 * (b) fill point passed this page (queue op to store this page)
610 *
611 * (3) object extant (queue op to store this page)
612 *
613 * any other state is invalid
614 */
615int __fscache_write_page(struct fscache_cookie *cookie,
616 struct page *page,
617 gfp_t gfp)
618{
619 struct fscache_storage *op;
620 struct fscache_object *object;
621 int ret;
622
623 _enter("%p,%x,", cookie, (u32) page->flags);
624
625 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
626 ASSERT(PageFsCache(page));
627
628 fscache_stat(&fscache_n_stores);
629
630 op = kzalloc(sizeof(*op), GFP_NOIO);
631 if (!op)
632 goto nomem;
633
634 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
637
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0)
640 goto nomem_free;
641
642 ret = -ENOBUFS;
643 spin_lock(&cookie->lock);
644
645 if (hlist_empty(&cookie->backing_objects))
646 goto nobufs;
647 object = hlist_entry(cookie->backing_objects.first,
648 struct fscache_object, cookie_link);
649 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
650 goto nobufs;
651
652 /* add the page to the pending-storage radix tree on the backing
653 * object */
654 spin_lock(&object->lock);
655
656 _debug("store limit %llx", (unsigned long long) object->store_limit);
657
658 ret = radix_tree_insert(&cookie->stores, page->index, page);
659 if (ret < 0) {
660 if (ret == -EEXIST)
661 goto already_queued;
662 _debug("insert failed %d", ret);
663 goto nobufs_unlock_obj;
664 }
665
666 radix_tree_tag_set(&cookie->stores, page->index,
667 FSCACHE_COOKIE_PENDING_TAG);
668 page_cache_get(page);
669
670 /* we only want one writer at a time, but we do need to queue new
671 * writers after exclusive ops */
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending;
674
675 spin_unlock(&object->lock);
676
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
678 op->store_limit = object->store_limit;
679
680 if (fscache_submit_op(object, &op->op) < 0)
681 goto submit_failed;
682
683 spin_unlock(&cookie->lock);
684 radix_tree_preload_end();
685 fscache_stat(&fscache_n_store_ops);
686 fscache_stat(&fscache_n_stores_ok);
687
688 /* the slow work queue now carries its own ref on the object */
689 fscache_put_operation(&op->op);
690 _leave(" = 0");
691 return 0;
692
693already_queued:
694 fscache_stat(&fscache_n_stores_again);
695already_pending:
696 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock);
698 radix_tree_preload_end();
699 kfree(op);
700 fscache_stat(&fscache_n_stores_ok);
701 _leave(" = 0");
702 return 0;
703
704submit_failed:
705 radix_tree_delete(&cookie->stores, page->index);
706 page_cache_release(page);
707 ret = -ENOBUFS;
708 goto nobufs;
709
710nobufs_unlock_obj:
711 spin_unlock(&object->lock);
712nobufs:
713 spin_unlock(&cookie->lock);
714 radix_tree_preload_end();
715 kfree(op);
716 fscache_stat(&fscache_n_stores_nobufs);
717 _leave(" = -ENOBUFS");
718 return -ENOBUFS;
719
720nomem_free:
721 kfree(op);
722nomem:
723 fscache_stat(&fscache_n_stores_oom);
724 _leave(" = -ENOMEM");
725 return -ENOMEM;
726}
727EXPORT_SYMBOL(__fscache_write_page);
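/*
 * Illustrative sketch (not part of this patch): a netfs calls the
 * fscache_write_page() wrapper once a marked page is uptodate, and must
 * drop the PG_fscache mark itself if the store could not be queued:
 *
 *	if (fscache_write_page(cookie, page, GFP_KERNEL) != 0)
 *		fscache_uncache_page(cookie, page);	store was rejected
 */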
728
729/*
730 * remove a page from the cache
731 */
732void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
733{
734 struct fscache_object *object;
735
736 _enter(",%p", page);
737
738 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
739 ASSERTCMP(page, !=, NULL);
740
741 fscache_stat(&fscache_n_uncaches);
742
743 /* cache withdrawal may beat us to it */
744 if (!PageFsCache(page))
745 goto done;
746
747 /* get the object */
748 spin_lock(&cookie->lock);
749
750 if (hlist_empty(&cookie->backing_objects)) {
751 ClearPageFsCache(page);
752 goto done_unlock;
753 }
754
755 object = hlist_entry(cookie->backing_objects.first,
756 struct fscache_object, cookie_link);
757
758 /* there might now be stuff on disk we could read */
759 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
760
761 /* only invoke the cache backend if we managed to mark the page
762 * uncached here; this deals with synchronisation vs withdrawal */
763 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */
766 object->cache->ops->uncache_page(object, page);
767 goto done;
768 }
769
770done_unlock:
771 spin_unlock(&cookie->lock);
772done:
773 _leave("");
774}
775EXPORT_SYMBOL(__fscache_uncache_page);
776
777/**
778 * fscache_mark_pages_cached - Mark pages as being cached
779 * @op: The retrieval op pages are being marked for
780 * @pagevec: The pages to be marked
781 *
782 * Mark a bunch of netfs pages as being cached. After this is called,
783 * the netfs must call fscache_uncache_page() to remove the mark.
784 */
785void fscache_mark_pages_cached(struct fscache_retrieval *op,
786 struct pagevec *pagevec)
787{
788 struct fscache_cookie *cookie = op->op.object->cookie;
789 unsigned long loop;
790
791#ifdef CONFIG_FSCACHE_STATS
792 atomic_add(pagevec->nr, &fscache_n_marks);
793#endif
794
795 for (loop = 0; loop < pagevec->nr; loop++) {
796 struct page *page = pagevec->pages[loop];
797
798 _debug("- mark %p{%lx}", page, page->index);
799 if (TestSetPageFsCache(page)) {
800 static bool once_only;
801 if (!once_only) {
802 once_only = true;
803 printk(KERN_WARNING "FS-Cache:"
804 " Cookie type %s marked page %lx"
805 " multiple times\n",
806 cookie->def->name, page->index);
807 }
808 }
809 }
810
811 if (cookie->def->mark_pages_cached)
812 cookie->def->mark_pages_cached(cookie->netfs_data,
813 op->mapping, pagevec);
814 pagevec_reinit(pagevec);
815}
816EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
1/* FS-Cache statistics viewing interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL OPERATION
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * initialise the /proc/fs/fscache/ directory
20 */
21int __init fscache_proc_init(void)
22{
23 _enter("");
24
25 if (!proc_mkdir("fs/fscache", NULL))
26 goto error_dir;
27
28#ifdef CONFIG_FSCACHE_STATS
29 if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
30 &fscache_stats_fops))
31 goto error_stats;
32#endif
33
34#ifdef CONFIG_FSCACHE_HISTOGRAM
35 if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
36 &fscache_histogram_fops))
37 goto error_histogram;
38#endif
39
40 _leave(" = 0");
41 return 0;
42
43#ifdef CONFIG_FSCACHE_HISTOGRAM
44error_histogram:
45#endif
46#ifdef CONFIG_FSCACHE_STATS
47 remove_proc_entry("fs/fscache/stats", NULL);
48error_stats:
49#endif
50 remove_proc_entry("fs/fscache", NULL);
51error_dir:
52 _leave(" = -ENOMEM");
53 return -ENOMEM;
54}
55
56/*
57 * clean up the /proc/fs/fscache/ directory
58 */
59void fscache_proc_cleanup(void)
60{
61#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif
64#ifdef CONFIG_FSCACHE_STATS
65 remove_proc_entry("fs/fscache/stats", NULL);
66#endif
67 remove_proc_entry("fs/fscache", NULL);
68}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
1/* FS-Cache statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * operation counters
20 */
21atomic_t fscache_n_op_pend;
22atomic_t fscache_n_op_run;
23atomic_t fscache_n_op_enqueue;
24atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc;
28
29atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok;
31atomic_t fscache_n_attr_changed_nobufs;
32atomic_t fscache_n_attr_changed_nomem;
33atomic_t fscache_n_attr_changed_calls;
34
35atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs;
39atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits;
41
42atomic_t fscache_n_retrievals;
43atomic_t fscache_n_retrievals_ok;
44atomic_t fscache_n_retrievals_wait;
45atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem;
49atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits;
51
52atomic_t fscache_n_stores;
53atomic_t fscache_n_stores_ok;
54atomic_t fscache_n_stores_again;
55atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls;
59
60atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches;
62
63atomic_t fscache_n_acquires;
64atomic_t fscache_n_acquires_null;
65atomic_t fscache_n_acquires_no_cache;
66atomic_t fscache_n_acquires_ok;
67atomic_t fscache_n_acquires_nobufs;
68atomic_t fscache_n_acquires_oom;
69
70atomic_t fscache_n_updates;
71atomic_t fscache_n_updates_null;
72atomic_t fscache_n_updates_run;
73
74atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt;
77
78atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data;
80atomic_t fscache_n_cookie_special;
81
82atomic_t fscache_n_object_alloc;
83atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive;
87atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead;
90
91atomic_t fscache_n_checkaux_none;
92atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete;
95
96/*
97 * display the general statistics
98 */
99static int fscache_stats_show(struct seq_file *m, void *v)
100{
101 seq_puts(m, "FS-Cache statistics\n");
102
103 seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
104 atomic_read(&fscache_n_cookie_index),
105 atomic_read(&fscache_n_cookie_data),
106 atomic_read(&fscache_n_cookie_special));
107
108 seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
109 atomic_read(&fscache_n_object_alloc),
110 atomic_read(&fscache_n_object_no_alloc),
111 atomic_read(&fscache_n_object_avail),
112 atomic_read(&fscache_n_object_dead));
113 seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
114 atomic_read(&fscache_n_checkaux_none),
115 atomic_read(&fscache_n_checkaux_okay),
116 atomic_read(&fscache_n_checkaux_update),
117 atomic_read(&fscache_n_checkaux_obsolete));
118
119 seq_printf(m, "Pages : mrk=%u unc=%u\n",
120 atomic_read(&fscache_n_marks),
121 atomic_read(&fscache_n_uncaches));
122
123 seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
124 " oom=%u\n",
125 atomic_read(&fscache_n_acquires),
126 atomic_read(&fscache_n_acquires_null),
127 atomic_read(&fscache_n_acquires_no_cache),
128 atomic_read(&fscache_n_acquires_ok),
129 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom));
131
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
133 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive),
136 atomic_read(&fscache_n_object_created));
137
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
139 atomic_read(&fscache_n_updates),
140 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run));
142
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
144 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt));
147
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed),
150 atomic_read(&fscache_n_attr_changed_ok),
151 atomic_read(&fscache_n_attr_changed_nobufs),
152 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls));
154
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
156 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs));
160 seq_printf(m, "Allocs : ops=%u owt=%u\n",
161 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits));
163
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n",
166 atomic_read(&fscache_n_retrievals),
167 atomic_read(&fscache_n_retrievals_ok),
168 atomic_read(&fscache_n_retrievals_wait),
169 atomic_read(&fscache_n_retrievals_nodata),
170 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits));
176
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores),
179 atomic_read(&fscache_n_stores_ok),
180 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n",
184 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls));
186
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
188 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc));
195 return 0;
196}
197
198/*
199 * open "/proc/fs/fscache/stats" to provide a statistical summary
200 */
201static int fscache_stats_open(struct inode *inode, struct file *file)
202{
203 return single_open(file, fscache_stats_show, NULL);
204}
205
206const struct file_operations fscache_stats_fops = {
207 .owner = THIS_MODULE,
208 .open = fscache_stats_open,
209 .read = seq_read,
210 .llseek = seq_lseek,
211 .release = seq_release,
212};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06da05261e04..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719bd..06f30e965676 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       size_t *nbytesp, int write)
 {
+	size_t nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
-	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
 				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1234,8 +1252,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
@@ -1273,6 +1292,17 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	invalidate_inode_pages2(file->f_mapping);
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1907,6 +1937,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1916,7 +1947,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 459b73dd45e1..91f7c85f1ffd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,6 +19,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/smp_lock.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -259,7 +260,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 static void fuse_umount_begin(struct super_block *sb)
 {
+	lock_kernel();
 	fuse_abort_conn(get_fuse_conn_super(sb));
+	unlock_kernel();
 }
 
 static void fuse_send_destroy(struct fuse_conn *fc)
@@ -908,6 +911,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
  err_put_root:
 	dput(root_dentry);
  err_put_conn:
+	bdi_destroy(&fc->bdi);
 	fuse_conn_put(fc);
  err_fput:
 	fput(file);
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa763..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3984e47d1d33..ff4981090489 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -597,7 +597,6 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
-	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
@@ -614,15 +613,14 @@ __acquires(&gl->gl_spin)
 		if (ret == 0)
 			goto out_unlock;
 		if (ret == 2)
-			goto out_sem;
+			goto out;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
-out_sem:
-	up_read(&gfs2_umount_flush_sem);
+out:
 	return;
 
 out_sched:
@@ -631,7 +629,7 @@ out_sched:
 	gfs2_glock_put(gl);
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	goto out_sem;
+	goto out;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work)
641 639
642 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 640 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
643 finish_xmote(gl, gl->gl_reply); 641 finish_xmote(gl, gl->gl_reply);
642 down_read(&gfs2_umount_flush_sem);
644 spin_lock(&gl->gl_spin); 643 spin_lock(&gl->gl_spin);
645 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 644 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
646 gl->gl_state != LM_ST_UNLOCKED && 645 gl->gl_state != LM_ST_UNLOCKED &&
@@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work)
653 } 652 }
654 run_queue(gl, 0); 653 run_queue(gl, 0);
655 spin_unlock(&gl->gl_spin); 654 spin_unlock(&gl->gl_spin);
655 up_read(&gfs2_umount_flush_sem);
656 if (!delay || 656 if (!delay ||
657 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 657 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
658 gfs2_glock_put(gl); 658 gfs2_glock_put(gl);
@@ -1304,6 +1304,7 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1304 nr--; 1304 nr--;
1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1306 gfs2_glock_put(gl); 1306 gfs2_glock_put(gl);
1307 got_ref = 0;
1307 } 1308 }
1308 spin_lock(&lru_lock); 1309 spin_lock(&lru_lock);
1309 if (may_demote) 1310 if (may_demote)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62aa925..70f87f43afa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	error = filemap_fdatawait(metamapping);
 	mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7b277d449155..5a31d426116f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode)
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_file_fops_nolock;
+			inode->i_fop = &gfs2_file_fops_nolock;
 		else
-			inode->i_fop = gfs2_file_fops;
+			inode->i_fop = &gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_dir_fops_nolock;
+			inode->i_fop = &gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = gfs2_dir_fops;
+			inode->i_fop = &gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index dca4fee3078b..c30be2b66580 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations *gfs2_file_fops_nolock;
-extern const struct file_operations *gfs2_dir_fops_nolock;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
-extern const struct file_operations *gfs2_file_fops;
-extern const struct file_operations *gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return sdp->sd_args.ar_localflocks;
 }
 #else /* Single node only */
-#define gfs2_file_fops NULL
-#define gfs2_dir_fops NULL
+#define gfs2_file_fops gfs2_file_fops_nolock
+#define gfs2_dir_fops gfs2_dir_fops_nolock
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return 1;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500b..5d82e91887e3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,10 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
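The two hunks above track this release's ->page_mkwrite prototype change: the target page now arrives in vmf->page, and errno-style results must be folded into VM_FAULT_* codes. A hedged sketch of the converted shape, where fs_make_page_writable() is a hypothetical stand-in for the filesystem's real work:

    static int example_page_mkwrite(struct vm_area_struct *vma,
    				struct vm_fault *vmf)
    {
    	struct page *page = vmf->page;
    	int ret = fs_make_page_writable(vma, page);	/* hypothetical */

    	if (ret == -ENOMEM)
    		return VM_FAULT_OOM;	/* errno -> fault code */
    	if (ret)
    		return VM_FAULT_SIGBUS;
    	return 0;			/* page is now writable */
    }
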
417 422
@@ -702,7 +707,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
702 } 707 }
703} 708}
704 709
705const struct file_operations *gfs2_file_fops = &(const struct file_operations){ 710const struct file_operations gfs2_file_fops = {
706 .llseek = gfs2_llseek, 711 .llseek = gfs2_llseek,
707 .read = do_sync_read, 712 .read = do_sync_read,
708 .aio_read = generic_file_aio_read, 713 .aio_read = generic_file_aio_read,
@@ -720,7 +725,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){
720 .setlease = gfs2_setlease, 725 .setlease = gfs2_setlease,
721}; 726};
722 727
723const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ 728const struct file_operations gfs2_dir_fops = {
724 .readdir = gfs2_readdir, 729 .readdir = gfs2_readdir,
725 .unlocked_ioctl = gfs2_ioctl, 730 .unlocked_ioctl = gfs2_ioctl,
726 .open = gfs2_open, 731 .open = gfs2_open,
@@ -732,7 +737,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
732 737
733#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 738#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
734 739
735const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ 740const struct file_operations gfs2_file_fops_nolock = {
736 .llseek = gfs2_llseek, 741 .llseek = gfs2_llseek,
737 .read = do_sync_read, 742 .read = do_sync_read,
738 .aio_read = generic_file_aio_read, 743 .aio_read = generic_file_aio_read,
@@ -748,7 +753,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat
748 .setlease = generic_setlease, 753 .setlease = generic_setlease,
749}; 754};
750 755
751const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ 756const struct file_operations gfs2_dir_fops_nolock = {
752 .readdir = gfs2_readdir, 757 .readdir = gfs2_readdir,
753 .unlocked_ioctl = gfs2_ioctl, 758 .unlocked_ioctl = gfs2_ioctl,
754 .open = gfs2_open, 759 .open = gfs2_open,
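The four tables above stop being pointers to anonymous compound literals and become ordinary const objects, so the single-node #defines in inode.h can alias them and use sites take their address. A stripped-down model of the change, with illustrative names:

    struct example_ops {
    	int (*open)(void);
    };

    static int example_open(void) { return 0; }

    /* old style (pointer to a compound literal):
     *   const struct example_ops *example_fops =
     *	&(const struct example_ops){ .open = example_open };
     */

    /* new style: the table itself is the exported const object */
    const struct example_ops example_fops = {
    	.open = example_open,
    };

    /* use sites now take the address: inode->i_fop = &example_fops; */
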
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 51883b3ad89c..1ff9473ea753 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	lock_page(page);
 
 	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
 	bio->bi_sector = sector * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -1287,21 +1282,21 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 static struct super_block *get_gfs2_sb(const char *dev_name)
 {
 	struct super_block *sb;
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
 		return NULL;
 	}
-	sb = nd.path.dentry->d_inode->i_sb;
+	sb = path.dentry->d_inode->i_sb;
 	if (sb && (sb->s_type == &gfs2_fs_type))
 		atomic_inc(&sb->s_active);
 	else
 		sb = NULL;
-	path_put(&nd.path);
+	path_put(&path);
 	return sb;
 }
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index abd5429ae285..1c70fa5168d6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_disksize = size;
+	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8d53f66b5bcc..152e6c4a0dca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -81,7 +81,7 @@ struct gfs2_quota_change_host {
 
 static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qd_lru_lock);
 
 int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
 {
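SPIN_LOCK_UNLOCKED initializers were being phased out at this point because one shared initializer gives every such lock the same lockdep class; DEFINE_SPINLOCK declares the lock and gives it its own key. A minimal sketch of the replacement pattern:

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);	/* declared and initialized */

    static void example_critical_section(void)
    {
    	spin_lock(&example_lock);
    	/* ... protected work ... */
    	spin_unlock(&example_lock);
    }
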
@@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data)
 			refrigerator();
 		t = min(quotad_timeo, statfs_timeo);
 
-		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_lock(&sdp->sd_trunc_lock);
 		empty = list_empty(&sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f03d024038ea..565038243fa2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	if (tmp == 0)
 		return BFITNOENT;
 	ptr--;
-	bit = fls64(tmp);
-	bit--;		/* fls64 always adds one to the bit count */
+	bit = __ffs64(tmp);
 	bit /= 2;	/* two bits per entry in the bitmap */
 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
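gfs2 bitmaps hold two bits per block, so the entry index is the bit position divided by two. __ffs64() returns the lowest set bit directly, which is the end this search actually wants, whereas fls64() returned the highest bit plus one and needed the decrement deleted above. A small userspace model of the arithmetic, using __builtin_ctzll() in place of the kernel's __ffs64():

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t tmp = 0x10;			/* lowest set bit: bit 4 */
    	unsigned int bit = __builtin_ctzll(tmp);

    	printf("bitmap entry %u\n", bit / 2);	/* two bits/entry -> 2 */
    	return 0;
    }
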
@@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
+	int error;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
-
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error == 0) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
+		brelse(dibh);
+	}
 	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
 	rgd->rd_free -= *n;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e0..a1cbff2b4d99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		BUG();
 		return 0;
 	}
+
+	if (!tree)
+		return 0;
+
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
 		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa3..7b6165f25fbe 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
 	if (HFS_SB(sb)->nls_disk)
 		unload_nls(HFS_SB(sb)->nls_disk);
 
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
 static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = HFS_SB(sb)->fs_ablocks;
 	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFS_NAMELEN;
 
 	return 0;
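hfs here, and hfsplus, hpfs and isofs below, all gain the same statfs hunk: the 64-bit encoded block device number is split across the two 32-bit halves of f_fsid. A userspace model of the split (the sample id is invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t id = 0x0000001200000034ULL;	/* stand-in for huge_encode_dev() */
    	uint32_t val0 = (uint32_t)id;		/* low half  -> f_fsid.val[0] */
    	uint32_t val1 = (uint32_t)(id >> 32);	/* high half -> f_fsid.val[1] */

    	printf("val[0]=%08x val[1]=%08x\n", val0, val1);
    	return 0;
    }
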
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
 	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
 
 	return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fc77965be841 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	lock_kernel();
 
 	/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->sb_n_free;
 	buf->f_files = sbi->sb_dirband_size / 4;
 	buf->f_ffree = sbi->sb_n_free_dnodes;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
 	unlock_kernel();
@@ -420,8 +423,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	if (!(*flags & MS_RDONLY)) mark_dirty(s);
 
-	kfree(s->s_options);
-	s->s_options = new_opts;
+	replace_mount_options(s, new_opts);
 
 	return 0;
 
@@ -477,7 +479,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
 			       "errno = %d\n", err);
 			return err;
 		}
-		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
 		if (count > 0)
 			*ppos += count;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a687..c1462d43e721 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -313,16 +312,6 @@ out:
 	return retval;
 }
 
-/*
- * Read a page. Again trivial. If it didn't already exist
- * in the page cache, it is zero-filled.
- */
-static int hugetlbfs_readpage(struct file *file, struct page * page)
-{
-	unlock_page(page);
-	return -EINVAL;
-}
-
 static int hugetlbfs_write_begin(struct file *file,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -702,7 +691,6 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
-	.readpage	= hugetlbfs_readpage,
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
@@ -842,7 +830,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
 	printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
 	       args[0].from, p);
-	return 1;
+	return -EINVAL;
 }
 
 static int
@@ -943,14 +931,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
 
 static int can_do_hugetlb_shm(void)
 {
-	return likely(capable(CAP_IPC_LOCK) ||
-			in_group_p(sysctl_hugetlb_shm_group) ||
-			can_do_mlock());
+	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
+	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
@@ -960,11 +947,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm())
-		return ERR_PTR(-EPERM);
-
-	if (!user_shm_lock(size, user))
-		return ERR_PTR(-ENOMEM);
+	if (!can_do_hugetlb_shm()) {
+		if (user_shm_lock(size, user)) {
+			unlock_shm = 1;
+			WARN_ONCE(1,
+				  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+		} else
+			return ERR_PTR(-EPERM);
+	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
 	quick_string.name = name;
@@ -1004,7 +994,8 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, user);
+	if (unlock_shm)
+		user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
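The hugetlb_file_setup() rework changes who may create SHM_HUGETLB segments: CAP_IPC_LOCK or hugetlb_shm_group membership passes outright, and other callers are grandfathered through the mlock rlimit with a one-time deprecation warning; the unlock_shm flag remembers whether the rlimit charge must be undone on the error path. A compressed, hedged model of that decision with illustrative names:

    /* Returns 0 if allowed, -EPERM otherwise. *need_unlock tells the
     * caller whether a later user_shm_unlock() must balance the charge.
     */
    static int example_hugetlb_perm(int privileged, int rlimit_charge_ok,
    				int *need_unlock)
    {
    	*need_unlock = 0;
    	if (privileged)
    		return 0;
    	if (rlimit_charge_ok) {
    		*need_unlock = 1;	/* deprecated mlock-rlimit fallback */
    		return 0;
    	}
    	return -EPERM;
    }
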
diff --git a/fs/inode.c b/fs/inode.c
index d06d6d268de9..bca0c618fdb3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -99,7 +99,7 @@ static DEFINE_MUTEX(iprune_mutex);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct kmem_cache * inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __read_mostly;
 
 static void wake_up_inode(struct inode *inode)
 {
@@ -124,7 +124,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 
-	struct address_space * const mapping = &inode->i_data;
+	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
 	inode->i_blkbits = sb->s_blocksize_bits;
@@ -216,9 +216,10 @@ static struct inode *alloc_inode(struct super_block *sb)
 	return NULL;
 }
 
 void destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	ima_inode_free(inode);
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
@@ -252,12 +253,11 @@ void inode_init_once(struct inode *inode)
 	mutex_init(&inode->inotify_mutex);
 #endif
 }
-
 EXPORT_SYMBOL(inode_init_once);
 
 static void init_once(void *foo)
 {
-	struct inode * inode = (struct inode *) foo;
+	struct inode *inode = (struct inode *) foo;
 
 	inode_init_once(inode);
 }
@@ -265,7 +265,7 @@ static void init_once(void *foo)
 /*
  * inode_lock must be held
  */
-void __iget(struct inode * inode)
+void __iget(struct inode *inode)
 {
 	if (atomic_read(&inode->i_count)) {
 		atomic_inc(&inode->i_count);
@@ -289,7 +289,7 @@ void clear_inode(struct inode *inode)
 {
 	might_sleep();
 	invalidate_inode_buffers(inode);
-       
+
 	BUG_ON(inode->i_data.nrpages);
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
@@ -303,7 +303,6 @@ void clear_inode(struct inode *inode)
 		cd_forget(inode);
 	inode->i_state = I_CLEAR;
 }
-
EXPORT_SYMBOL(clear_inode);
 
 /*
@@ -351,8 +350,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 
 	next = head->next;
 	for (;;) {
-		struct list_head * tmp = next;
-		struct inode * inode;
+		struct list_head *tmp = next;
+		struct inode *inode;
 
 		/*
 		 * We can reschedule here without worrying about the list's
@@ -391,7 +390,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
  * fails because there are busy inodes then a non zero value is returned.
  * If the discard is successful all the inodes have been discarded.
  */
-int invalidate_inodes(struct super_block * sb)
+int invalidate_inodes(struct super_block *sb)
 {
 	int busy;
 	LIST_HEAD(throw_away);
@@ -407,7 +406,6 @@ int invalidate_inodes(struct super_block * sb)
 
 	return busy;
 }
-
 EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
@@ -504,7 +502,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
 		 * Nasty deadlock avoidance. We may hold various FS locks,
 		 * and we don't want to recurse into the FS that called us
 		 * in clear_inode() and friends..
-	 	 */
+		 */
 	if (!(gfp_mask & __GFP_FS))
 		return -1;
 	prune_icache(nr);
@@ -524,10 +522,13 @@ static void __wait_on_freeing_inode(struct inode *inode);
 * by hand after calling find_inode now! This simplifies iunique and won't
 * add any additional branch in the common code.
 */
-static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+static struct inode *find_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				void *data)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -548,10 +549,11 @@ repeat:
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
-static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+static struct inode *find_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -631,10 +633,10 @@ struct inode *new_inode(struct super_block *sb)
 	 * here to attempt to avoid that.
 	 */
 	static unsigned int last_ino;
-	struct inode * inode;
+	struct inode *inode;
 
 	spin_lock_prefetch(&inode_lock);
-	
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
@@ -645,7 +647,6 @@ struct inode *new_inode(struct super_block *sb)
 	}
 	return inode;
 }
-
 EXPORT_SYMBOL(new_inode);
 
 void unlock_new_inode(struct inode *inode)
@@ -674,7 +675,6 @@ void unlock_new_inode(struct inode *inode)
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
 }
-
 EXPORT_SYMBOL(unlock_new_inode);
 
 /*
@@ -683,13 +683,17 @@ EXPORT_SYMBOL(unlock_new_inode);
 * We no longer cache the sb_flags in i_flags - see fs.h
 *	-- rmk@arm.uk.linux.org
 */
-static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+static struct inode *get_new_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				int (*set)(struct inode *, void *),
+				void *data)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -731,13 +735,14 @@ set_failed:
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
-static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+static struct inode *get_new_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -823,7 +828,6 @@ struct inode *igrab(struct inode *inode)
 	spin_unlock(&inode_lock);
 	return inode;
 }
-
 EXPORT_SYMBOL(igrab);
 
 /**
@@ -924,7 +928,6 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 0);
 }
-
 EXPORT_SYMBOL(ilookup5_nowait);
 
 /**
@@ -953,7 +956,6 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 1);
 }
-
 EXPORT_SYMBOL(ilookup5);
 
 /**
@@ -976,7 +978,6 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 
 	return ifind_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(ilookup);
 
 /**
@@ -1015,7 +1016,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
 	 */
 	return get_new_inode(sb, head, test, set, data);
 }
-
 EXPORT_SYMBOL(iget5_locked);
 
 /**
@@ -1047,7 +1047,6 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 	 */
 	return get_new_inode_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(iget_locked);
 
 int insert_inode_locked(struct inode *inode)
@@ -1055,13 +1054,22 @@ int insert_inode_locked(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
 		spin_lock(&inode_lock);
-		old = find_inode_fast(sb, head, ino);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_ino != ino)
+				continue;
+			if (old->i_sb != sb)
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
@@ -1076,7 +1084,6 @@ int insert_inode_locked(struct inode *inode)
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked);
 
 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
@@ -1084,14 +1091,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 {
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
+
 		spin_lock(&inode_lock);
-		old = find_inode(sb, head, test, data);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_sb != sb)
+				continue;
+			if (!test(old, data))
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
@@ -1106,7 +1123,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked4);
 
 /**
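Both rewritten insert paths above depend on a subtlety of the four-argument hlist_for_each_entry() of this era: when the walk falls off the chain the node cursor is NULL while the entry cursor is stale, so "nothing found" must be tested through node rather than old. Schematically (inode_matches() is a hypothetical predicate):

    struct hlist_node *node;
    struct inode *old = NULL;

    hlist_for_each_entry(old, node, head, i_hash) {
    	if (inode_matches(old))
    		break;		/* found: node is non-NULL here */
    }
    if (!node) {
    	/* walked the whole chain without a live match: safe to insert */
    }
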
@@ -1124,7 +1140,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 	hlist_add_head(&inode->i_hash, head);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(__insert_inode_hash);
 
 /**
@@ -1139,7 +1154,6 @@ void remove_inode_hash(struct inode *inode)
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(remove_inode_hash);
 
 /*
@@ -1187,7 +1201,6 @@ void generic_delete_inode(struct inode *inode)
 	BUG_ON(inode->i_state != I_CLEAR);
 	destroy_inode(inode);
 }
-
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
@@ -1237,12 +1250,11 @@ void generic_drop_inode(struct inode *inode)
 	else
 		generic_forget_inode(inode);
 }
-
 EXPORT_SYMBOL_GPL(generic_drop_inode);
 
 /*
  * Called when we're dropping the last reference
  * to an inode.
  *
  * Call the FS "drop()" function, defaulting to
  * the legacy UNIX filesystem behaviour..
@@ -1262,7 +1274,7 @@ static inline void iput_final(struct inode *inode)
 }
 
 /**
- *	iput	- put an inode 
+ *	iput	- put an inode
 *	@inode: inode to put
 *
 *	Puts an inode, dropping its usage count. If the inode use count hits
@@ -1279,7 +1291,6 @@ void iput(struct inode *inode)
 		iput_final(inode);
 	}
 }
-
 EXPORT_SYMBOL(iput);
 
 /**
@@ -1290,10 +1301,10 @@ EXPORT_SYMBOL(iput);
 *	Returns the block number on the device holding the inode that
 *	is the disk block number for the block of the file requested.
 *	That is, asked for block 4 of inode 1 the function will return the
 *	disk block relative to the disk start that holds that block of the
 *	file.
 */
-sector_t bmap(struct inode * inode, sector_t block)
+sector_t bmap(struct inode *inode, sector_t block)
 {
 	sector_t res = 0;
 	if (inode->i_mapping->a_ops->bmap)
@@ -1425,7 +1436,6 @@ void file_update_time(struct file *file)
 	mark_inode_dirty_sync(inode);
 	mnt_drop_write(file->f_path.mnt);
 }
-
 EXPORT_SYMBOL(file_update_time);
 
 int inode_needs_sync(struct inode *inode)
@@ -1436,7 +1446,6 @@ int inode_needs_sync(struct inode *inode)
 		return 1;
 	return 0;
 }
-
 EXPORT_SYMBOL(inode_needs_sync);
 
 int inode_wait(void *word)
@@ -1470,42 +1479,6 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	spin_lock(&inode_lock);
 }
 
-/*
- * We rarely want to lock two inodes that do not have a parent/child
- * relationship (such as directory, child inode) simultaneously. The
- * vast majority of file systems should be able to get along fine
- * without this. Do not use these functions except as a last resort.
- */
-void inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
-	}
-
-	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
-	}
-}
-EXPORT_SYMBOL(inode_double_lock);
-
-void inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1)
-		mutex_unlock(&inode1->i_mutex);
-
-	if (inode2 && inode2 != inode1)
-		mutex_unlock(&inode2->i_mutex);
-}
-EXPORT_SYMBOL(inode_double_unlock);
-
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
 {
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index ac2d47e43926..82d9c42b8bac 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -258,7 +258,7 @@ int __generic_block_fiemap(struct inode *inode,
 	long long length = 0, map_len = 0;
 	u64 logical = 0, phys = 0, size = 0;
 	u32 flags = FIEMAP_EXTENT_MERGED;
-	int ret = 0;
+	int ret = 0, past_eof = 0, whole_file = 0;
 
 	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
 		return ret;
@@ -266,6 +266,9 @@ int __generic_block_fiemap(struct inode *inode,
 	start_blk = logical_to_blk(inode, start);
 
 	length = (long long)min_t(u64, len, i_size_read(inode));
+	if (length < len)
+		whole_file = 1;
+
 	map_len = length;
 
 	do {
@@ -282,11 +285,26 @@ int __generic_block_fiemap(struct inode *inode,
 
 		/* HOLE */
 		if (!buffer_mapped(&tmp)) {
+			length -= blk_to_logical(inode, 1);
+			start_blk++;
+
+			/*
+			 * we want to handle the case where there is an
+			 * allocated block at the front of the file, and then
+			 * nothing but holes up to the end of the file properly,
+			 * to make sure that extent at the front gets properly
+			 * marked with FIEMAP_EXTENT_LAST
+			 */
+			if (!past_eof &&
+			    blk_to_logical(inode, start_blk) >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
+
 			/*
 			 * first hole after going past the EOF, this is our
 			 * last extent
 			 */
-			if (length <= 0) {
+			if (past_eof && size) {
 				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
@@ -294,15 +312,37 @@ int __generic_block_fiemap(struct inode *inode,
 				break;
 			}
 
-			length -= blk_to_logical(inode, 1);
-
 			/* if we have holes up to/past EOF then we're done */
-			if (length <= 0)
+			if (length <= 0 || past_eof)
 				break;
-
-			start_blk++;
 		} else {
-			if (length <= 0 && size) {
+			/*
+			 * we have gone over the length of what we wanted to
+			 * map, and it wasn't the entire file, so add the extent
+			 * we got last time and exit.
+			 *
+			 * This is for the case where say we want to map all the
+			 * way up to the second to the last block in a file, but
+			 * the last block is a hole, making the second to last
+			 * block FIEMAP_EXTENT_LAST. In this case we want to
+			 * see if there is a hole after the second to last block
+			 * so we can mark it properly. If we found data after
+			 * we exceeded the length we were requesting, then we
+			 * are good to go, just add the extent to the fieinfo
+			 * and break
+			 */
+			if (length <= 0 && !whole_file) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			/*
+			 * if size != 0 then we know we already have an extent
+			 * to add, so add it.
+			 */
+			if (size) {
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
 							      flags);
@@ -319,19 +359,14 @@ int __generic_block_fiemap(struct inode *inode,
 			start_blk += logical_to_blk(inode, size);
 
 			/*
-			 * if we are past the EOF we need to loop again to see
-			 * if there is a hole so we can mark this extent as the
-			 * last one, and if not keep mapping things until we
-			 * find a hole, or we run out of slots in the extent
-			 * array
+			 * If we are past the EOF, then we need to make sure as
+			 * soon as we find a hole that the last extent we found
+			 * is marked with FIEMAP_EXTENT_LAST
 			 */
-			if (length <= 0)
-				continue;
-
-			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
-						      size, flags);
-			if (ret)
-				break;
+			if (!past_eof &&
+			    logical+size >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
 		}
 		cond_resched();
 	} while (1);
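A worked example of the new past_eof bookkeeping: an 8192-byte file with 4096-byte blocks where block 0 is mapped and block 1 is a hole. When the hole branch advances past i_size, the pending extent for block 0 must be flushed with FIEMAP_EXTENT_LAST rather than silently dropped. The userspace arithmetic, hedged as a toy model:

    #include <stdio.h>

    int main(void)
    {
    	long blksz = 4096, i_size = 8192;
    	long start_blk = 1;		/* block 0 (data) already consumed */
    	int past_eof = 0, have_pending_extent = 1;

    	/* hole branch: step over the hole, then test against EOF */
    	start_blk++;
    	if (blksz * start_blk >= i_size)
    		past_eof = 1;

    	if (past_eof && have_pending_extent)
    		printf("emit pending extent with FIEMAP_EXTENT_LAST\n");
    	return 0;
    }
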
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd0692..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
 static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = ISOFS_SB(sb)->s_ninodes;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..618e21c0b7a3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bio.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
171 return (ret == -EIO); 172 return (ret == -EIO);
172} 173}
173 174
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176 int write_op)
175{ 177{
176 int i; 178 int i;
177 179
178 for (i = 0; i < bufs; i++) { 180 for (i = 0; i < bufs; i++) {
179 wbuf[i]->b_end_io = end_buffer_write_sync; 181 wbuf[i]->b_end_io = end_buffer_write_sync;
180 /* We use-up our safety reference in submit_bh() */ 182 /* We use-up our safety reference in submit_bh() */
181 submit_bh(WRITE, wbuf[i]); 183 submit_bh(write_op, wbuf[i]);
182 } 184 }
183} 185}
184 186
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
186 * Submit all the data buffers to disk 188 * Submit all the data buffers to disk
187 */ 189 */
188static int journal_submit_data_buffers(journal_t *journal, 190static int journal_submit_data_buffers(journal_t *journal,
189 transaction_t *commit_transaction) 191 transaction_t *commit_transaction,
192 int write_op)
190{ 193{
191 struct journal_head *jh; 194 struct journal_head *jh;
192 struct buffer_head *bh; 195 struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
225 BUFFER_TRACE(bh, "needs blocking lock"); 228 BUFFER_TRACE(bh, "needs blocking lock");
226 spin_unlock(&journal->j_list_lock); 229 spin_unlock(&journal->j_list_lock);
227 /* Write out all data to prevent deadlocks */ 230 /* Write out all data to prevent deadlocks */
228 journal_do_submit_data(wbuf, bufs); 231 journal_do_submit_data(wbuf, bufs, write_op);
229 bufs = 0; 232 bufs = 0;
230 lock_buffer(bh); 233 lock_buffer(bh);
231 spin_lock(&journal->j_list_lock); 234 spin_lock(&journal->j_list_lock);
@@ -238,7 +241,7 @@ write_out_data:
238 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
239 } 242 }
240 /* Someone already cleaned up the buffer? */ 243 /* Someone already cleaned up the buffer? */
241 if (!buffer_jbd(bh) 244 if (!buffer_jbd(bh) || bh2jh(bh) != jh
242 || jh->b_transaction != commit_transaction 245 || jh->b_transaction != commit_transaction
243 || jh->b_jlist != BJ_SyncData) { 246 || jh->b_jlist != BJ_SyncData) {
244 jbd_unlock_bh_state(bh); 247 jbd_unlock_bh_state(bh);
@@ -256,7 +259,7 @@ write_out_data:
256 jbd_unlock_bh_state(bh); 259 jbd_unlock_bh_state(bh);
257 if (bufs == journal->j_wbufsize) { 260 if (bufs == journal->j_wbufsize) {
258 spin_unlock(&journal->j_list_lock); 261 spin_unlock(&journal->j_list_lock);
259 journal_do_submit_data(wbuf, bufs); 262 journal_do_submit_data(wbuf, bufs, write_op);
260 bufs = 0; 263 bufs = 0;
261 goto write_out_data; 264 goto write_out_data;
262 } 265 }
@@ -286,7 +289,7 @@ write_out_data:
286 } 289 }
287 } 290 }
288 spin_unlock(&journal->j_list_lock); 291 spin_unlock(&journal->j_list_lock);
289 journal_do_submit_data(wbuf, bufs); 292 journal_do_submit_data(wbuf, bufs, write_op);
290 293
291 return err; 294 return err;
292} 295}
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
315 int first_tag = 0; 318 int first_tag = 0;
316 int tag_flag; 319 int tag_flag;
317 int i; 320 int i;
321 int write_op = WRITE;
318 322
319 /* 323 /*
320 * First job: lock down the current transaction and wait for 324 * First job: lock down the current transaction and wait for
@@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
347 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
348 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
349 353
354 /*
355 * Use plugged writes here, since we want to submit several before
356 * we unplug the device. We don't do explicit unplugging in here,
357 * instead we rely on sync_buffer() doing the unplug for us.
358 */
359 if (commit_transaction->t_synchronous_commit)
360 write_op = WRITE_SYNC_PLUG;
350 spin_lock(&commit_transaction->t_handle_lock); 361 spin_lock(&commit_transaction->t_handle_lock);
351 while (commit_transaction->t_updates) { 362 while (commit_transaction->t_updates) {
352 DEFINE_WAIT(wait); 363 DEFINE_WAIT(wait);
@@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal)
431 * Now start flushing things to disk, in the order they appear 442 * Now start flushing things to disk, in the order they appear
432 * on the transaction lists. Data blocks go first. 443 * on the transaction lists. Data blocks go first.
433 */ 444 */
434 err = journal_submit_data_buffers(journal, commit_transaction); 445 err = journal_submit_data_buffers(journal, commit_transaction,
446 write_op);
435 447
436 /* 448 /*
437 * Wait for all previously submitted IO to complete. 449 * Wait for all previously submitted IO to complete.
@@ -466,7 +478,9 @@ void journal_commit_transaction(journal_t *journal)
466 spin_lock(&journal->j_list_lock); 478 spin_lock(&journal->j_list_lock);
467 continue; 479 continue;
468 } 480 }
469 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { 481 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
482 jh->b_transaction == commit_transaction &&
483 jh->b_jlist == BJ_Locked) {
470 __journal_unfile_buffer(jh); 484 __journal_unfile_buffer(jh);
471 jbd_unlock_bh_state(bh); 485 jbd_unlock_bh_state(bh);
472 journal_remove_journal_head(bh); 486 journal_remove_journal_head(bh);
@@ -490,7 +504,7 @@ void journal_commit_transaction(journal_t *journal)
490 err = 0; 504 err = 0;
491 } 505 }
492 506
493 journal_write_revoke_records(journal, commit_transaction); 507 journal_write_revoke_records(journal, commit_transaction, write_op);
494 508
495 /* 509 /*
496 * If we found any dirty or locked buffers, then we should have 510 * If we found any dirty or locked buffers, then we should have
@@ -660,7 +674,7 @@ start_journal_io:
660 clear_buffer_dirty(bh); 674 clear_buffer_dirty(bh);
661 set_buffer_uptodate(bh); 675 set_buffer_uptodate(bh);
662 bh->b_end_io = journal_end_buffer_io_sync; 676 bh->b_end_io = journal_end_buffer_io_sync;
663 submit_bh(WRITE, bh); 677 submit_bh(write_op, bh);
664 } 678 }
665 cond_resched(); 679 cond_resched();
666 680
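
The hunks above replace hard-coded WRITE submissions with a write_op chosen once per commit: if any handle in the transaction was synchronous, data buffers, revoke records and metadata are all submitted with WRITE_SYNC_PLUG, which marks the I/O synchronous while leaving the queue plugged so several buffers can be batched before sync_buffer() unplugs. A minimal userspace sketch of the selection (the OP_* values are illustrative stand-ins for the kernel's WRITE/WRITE_SYNC_PLUG flags, not the real constants):

#include <stdio.h>

enum { OP_WRITE, OP_WRITE_SYNC_PLUG };  /* stand-ins, values illustrative */

struct transaction { int t_synchronous_commit; };

/* Decide the block-layer op once, then reuse it for every submission
 * in the commit: data buffers, revoke records and metadata alike. */
static int commit_write_op(const struct transaction *t)
{
        return t->t_synchronous_commit ? OP_WRITE_SYNC_PLUG : OP_WRITE;
}

int main(void)
{
        struct transaction fsync_driven = { .t_synchronous_commit = 1 };
        struct transaction background = { .t_synchronous_commit = 0 };

        printf("fsync-driven commit -> op %d\n", commit_write_op(&fsync_driven));
        printf("background commit   -> op %d\n", commit_write_op(&background));
        return 0;
}
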
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
637 return NULL; 637 return NULL;
638 638
639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
640 if (!bh)
641 return NULL;
640 lock_buffer(bh); 642 lock_buffer(bh);
641 memset(bh->b_data, 0, journal->j_blocksize); 643 memset(bh->b_data, 0, journal->j_blocksize);
642 set_buffer_uptodate(bh); 644 set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
733 if (!journal->j_wbuf) { 735 if (!journal->j_wbuf) {
734 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 736 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
735 __func__); 737 __func__);
736 kfree(journal); 738 goto out_err;
737 journal = NULL;
738 goto out;
739 } 739 }
740 journal->j_dev = bdev; 740 journal->j_dev = bdev;
741 journal->j_fs_dev = fs_dev; 741 journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
743 journal->j_maxlen = len; 743 journal->j_maxlen = len;
744 744
745 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 745 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
746 J_ASSERT(bh != NULL); 746 if (!bh) {
747 printk(KERN_ERR
748 "%s: Cannot get buffer for journal superblock\n",
749 __func__);
750 goto out_err;
751 }
747 journal->j_sb_buffer = bh; 752 journal->j_sb_buffer = bh;
748 journal->j_superblock = (journal_superblock_t *)bh->b_data; 753 journal->j_superblock = (journal_superblock_t *)bh->b_data;
749out: 754
750 return journal; 755 return journal;
756out_err:
757 kfree(journal);
758 return NULL;
751} 759}
752 760
753/** 761/**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
787 if (!journal->j_wbuf) { 795 if (!journal->j_wbuf) {
788 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 796 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
789 __func__); 797 __func__);
790 kfree(journal); 798 goto out_err;
791 return NULL;
792 } 799 }
793 800
794 err = journal_bmap(journal, 0, &blocknr); 801 err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
796 if (err) { 803 if (err) {
797 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 804 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
798 __func__); 805 __func__);
799 kfree(journal); 806 goto out_err;
800 return NULL;
801 } 807 }
802 808
803 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 809 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
804 J_ASSERT(bh != NULL); 810 if (!bh) {
811 printk(KERN_ERR
812 "%s: Cannot get buffer for journal superblock\n",
813 __func__);
814 goto out_err;
815 }
805 journal->j_sb_buffer = bh; 816 journal->j_sb_buffer = bh;
806 journal->j_superblock = (journal_superblock_t *)bh->b_data; 817 journal->j_superblock = (journal_superblock_t *)bh->b_data;
807 818
808 return journal; 819 return journal;
820out_err:
821 kfree(journal);
822 return NULL;
809} 823}
810 824
811/* 825/*
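
Both journal_init_dev() and journal_init_inode() above now funnel every failure through a single out_err label, and the two __getblk() calls that were previously "guaranteed" by J_ASSERT() get real NULL checks. A compilable model of the pattern, with malloc()/calloc() standing in for the kernel allocators and __getblk():

#include <stdlib.h>

struct journal { void *j_wbuf; void *j_sb_buffer; };

/* Single-exit error handling: every failure after the first allocation
 * jumps to one label that releases everything, so no path can forget
 * the kfree() the old code duplicated at each exit. */
static struct journal *journal_init_sketch(void)
{
        struct journal *journal = calloc(1, sizeof(*journal));
        if (!journal)
                return NULL;

        journal->j_wbuf = malloc(64);
        if (!journal->j_wbuf)
                goto out_err;

        journal->j_sb_buffer = malloc(4096);  /* may fail, like __getblk() */
        if (!journal->j_sb_buffer)
                goto out_err;

        return journal;

out_err:
        free(journal->j_wbuf);                /* free(NULL) is a no-op */
        free(journal);
        return NULL;
}

int main(void)
{
        struct journal *j = journal_init_sketch();
        if (j) {
                free(j->j_wbuf);
                free(j->j_sb_buffer);
                free(j);
        }
        return 0;
}
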
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbdc..da6cd9bdaabc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
 74 * Finally, the replay code also uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -67,6 +86,7 @@
67#include <linux/slab.h> 86#include <linux/slab.h>
68#include <linux/list.h> 87#include <linux/list.h>
69#include <linux/init.h> 88#include <linux/init.h>
89#include <linux/bio.h>
70#endif 90#endif
71#include <linux/log2.h> 91#include <linux/log2.h>
72 92
@@ -99,8 +119,8 @@ struct jbd_revoke_table_s
99#ifdef __KERNEL__ 119#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *, 120static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *, 121 struct journal_head **, int *,
102 struct jbd_revoke_record_s *); 122 struct jbd_revoke_record_s *, int);
103static void flush_descriptor(journal_t *, struct journal_head *, int); 123static void flush_descriptor(journal_t *, struct journal_head *, int, int);
104#endif 124#endif
105 125
106/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
@@ -402,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
402 * the second time we would still have a pending revoke to cancel. So, 422 * the second time we would still have a pending revoke to cancel. So,
403 * do not trust the Revoked bit on buffers unless RevokeValid is also 423 * do not trust the Revoked bit on buffers unless RevokeValid is also
404 * set. 424 * set.
405 *
406 * The caller must have the journal locked.
407 */ 425 */
408int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 426int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
409{ 427{
@@ -481,12 +499,9 @@ void journal_switch_revoke_table(journal_t *journal)
481/* 499/*
482 * Write revoke records to the journal for all entries in the current 500 * Write revoke records to the journal for all entries in the current
483 * revoke hash, deleting the entries as we go. 501 * revoke hash, deleting the entries as we go.
484 *
485 * Called with the journal lock held.
486 */ 502 */
487
488void journal_write_revoke_records(journal_t *journal, 503void journal_write_revoke_records(journal_t *journal,
489 transaction_t *transaction) 504 transaction_t *transaction, int write_op)
490{ 505{
491 struct journal_head *descriptor; 506 struct journal_head *descriptor;
492 struct jbd_revoke_record_s *record; 507 struct jbd_revoke_record_s *record;
@@ -510,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal,
510 hash_list->next; 525 hash_list->next;
511 write_one_revoke_record(journal, transaction, 526 write_one_revoke_record(journal, transaction,
512 &descriptor, &offset, 527 &descriptor, &offset,
513 record); 528 record, write_op);
514 count++; 529 count++;
515 list_del(&record->hash); 530 list_del(&record->hash);
516 kmem_cache_free(revoke_record_cache, record); 531 kmem_cache_free(revoke_record_cache, record);
517 } 532 }
518 } 533 }
519 if (descriptor) 534 if (descriptor)
520 flush_descriptor(journal, descriptor, offset); 535 flush_descriptor(journal, descriptor, offset, write_op);
521 jbd_debug(1, "Wrote %d revoke records\n", count); 536 jbd_debug(1, "Wrote %d revoke records\n", count);
522} 537}
523 538
@@ -530,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
530 transaction_t *transaction, 545 transaction_t *transaction,
531 struct journal_head **descriptorp, 546 struct journal_head **descriptorp,
532 int *offsetp, 547 int *offsetp,
533 struct jbd_revoke_record_s *record) 548 struct jbd_revoke_record_s *record,
549 int write_op)
534{ 550{
535 struct journal_head *descriptor; 551 struct journal_head *descriptor;
536 int offset; 552 int offset;
@@ -549,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
549 /* Make sure we have a descriptor with space left for the record */ 565 /* Make sure we have a descriptor with space left for the record */
550 if (descriptor) { 566 if (descriptor) {
551 if (offset == journal->j_blocksize) { 567 if (offset == journal->j_blocksize) {
552 flush_descriptor(journal, descriptor, offset); 568 flush_descriptor(journal, descriptor, offset, write_op);
553 descriptor = NULL; 569 descriptor = NULL;
554 } 570 }
555 } 571 }
@@ -586,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal,
586 602
587static void flush_descriptor(journal_t *journal, 603static void flush_descriptor(journal_t *journal,
588 struct journal_head *descriptor, 604 struct journal_head *descriptor,
589 int offset) 605 int offset, int write_op)
590{ 606{
591 journal_revoke_header_t *header; 607 journal_revoke_header_t *header;
592 struct buffer_head *bh = jh2bh(descriptor); 608 struct buffer_head *bh = jh2bh(descriptor);
@@ -601,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
601 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
602 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
603 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
604 ll_rw_block(SWRITE, 1, &bh); 620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
605} 621}
606#endif 622#endif
607 623
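
The locking-rules comment added above is really an ownership statement: the table pointed to by journal->j_revoke is shared and needs j_revoke_lock, while the table switched out for the committing transaction belongs to kjournald alone. A sketch of that switch, with the journal and table structures reduced to the fields involved (buckets elided, and the real j_revoke_table entries are pointers):

struct revoke_table { int buckets; /* real code: hash list heads */ };

struct journal {
        struct revoke_table *j_revoke;       /* running transaction's table */
        struct revoke_table  j_revoke_table[2];
};

/* journal_switch_revoke_table() sketch: only kjournald calls this, so
 * the table it hands back is private to the commit thread and can be
 * walked and emptied without taking j_revoke_lock. */
static struct revoke_table *switch_revoke_table(struct journal *journal)
{
        struct revoke_table *committing = journal->j_revoke;

        if (committing == &journal->j_revoke_table[0])
                journal->j_revoke = &journal->j_revoke_table[1];
        else
                journal->j_revoke = &journal->j_revoke_table[0];
        return committing;
}

int main(void)
{
        struct journal j = { .j_revoke = &j.j_revoke_table[0] };

        return switch_revoke_table(&j) == &j.j_revoke_table[0] ? 0 : 1;
}
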
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (handle->h_sync)
1444 transaction->t_synchronous_commit = 1;
1443 current->journal_info = NULL; 1445 current->journal_info = NULL;
1444 spin_lock(&journal->j_state_lock); 1446 spin_lock(&journal->j_state_lock);
1445 spin_lock(&transaction->t_handle_lock); 1447 spin_lock(&transaction->t_handle_lock);
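
This is the producer side of the write_op plumbing: journal_stop() marks the transaction as soon as any synchronous handle completes. A two-struct sketch of the propagation, with the types trimmed to the fields involved:

#include <assert.h>

struct handle      { int h_sync; };
struct transaction { int t_synchronous_commit; };

/* One fsync-driven handle is enough: the flag is sticky, so the whole
 * compound transaction commits with the synchronous write ops above. */
static void journal_stop_sketch(struct handle *h, struct transaction *t)
{
        if (h->h_sync)
                t->t_synchronous_commit = 1;
}

int main(void)
{
        struct transaction t = { 0 };
        struct handle plain = { .h_sync = 0 }, syncing = { .h_sync = 1 };

        journal_stop_sketch(&plain, &t);
        journal_stop_sketch(&syncing, &t);
        assert(t.t_synchronous_commit == 1);
        return 0;
}
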
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..0b7d3b8226fd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
138 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
139 barrier_done = 1; 139 barrier_done = 1;
140 } 140 }
141 ret = submit_bh(WRITE_SYNC, bh); 141 ret = submit_bh(WRITE_SYNC_PLUG, bh);
142 if (barrier_done) 142 if (barrier_done)
143 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
144 144
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
159 lock_buffer(bh); 159 lock_buffer(bh);
160 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
161 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
162 ret = submit_bh(WRITE_SYNC, bh); 162 ret = submit_bh(WRITE_SYNC_PLUG, bh);
163 } 163 }
164 *cbh = bh; 164 *cbh = bh;
165 return ret; 165 return ret;
@@ -190,7 +190,7 @@ retry:
190 set_buffer_uptodate(bh); 190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync; 191 bh->b_end_io = journal_end_buffer_io_sync;
192 192
193 ret = submit_bh(WRITE_SYNC, bh); 193 ret = submit_bh(WRITE_SYNC_PLUG, bh);
194 if (ret) { 194 if (ret) {
195 unlock_buffer(bh); 195 unlock_buffer(bh);
196 return ret; 196 return ret;
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
367 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
368 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
369 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
370 int write_op = WRITE;
370 371
371 /* 372 /*
372 * First job: lock down the current transaction and wait for 373 * First job: lock down the current transaction and wait for
@@ -401,6 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
401 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
402 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
403 404
405 /*
406 * Use plugged writes here, since we want to submit several before
407 * we unplug the device. We don't do explicit unplugging in here,
408 * instead we rely on sync_buffer() doing the unplug for us.
409 */
410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG;
404 stats.u.run.rs_wait = commit_transaction->t_max_wait; 412 stats.u.run.rs_wait = commit_transaction->t_max_wait;
405 stats.u.run.rs_locked = jiffies; 413 stats.u.run.rs_locked = jiffies;
406 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -498,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
498 if (err) 506 if (err)
499 jbd2_journal_abort(journal, err); 507 jbd2_journal_abort(journal, err);
500 508
501 jbd2_journal_write_revoke_records(journal, commit_transaction); 509 jbd2_journal_write_revoke_records(journal, commit_transaction,
510 write_op);
502 511
503 jbd_debug(3, "JBD: commit phase 2\n"); 512 jbd_debug(3, "JBD: commit phase 2\n");
504 513
@@ -680,7 +689,7 @@ start_journal_io:
680 clear_buffer_dirty(bh); 689 clear_buffer_dirty(bh);
681 set_buffer_uptodate(bh); 690 set_buffer_uptodate(bh);
682 bh->b_end_io = journal_end_buffer_io_sync; 691 bh->b_end_io = journal_end_buffer_io_sync;
683 submit_bh(WRITE, bh); 692 submit_bh(write_op, bh);
684 } 693 }
685 cond_resched(); 694 cond_resched();
686 stats.u.run.rs_blocks_logged += bufs; 695 stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..a360b06af2e3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
 74 * Finally, the replay code also uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -67,6 +86,7 @@
67#include <linux/slab.h> 86#include <linux/slab.h>
68#include <linux/list.h> 87#include <linux/list.h>
69#include <linux/init.h> 88#include <linux/init.h>
89#include <linux/bio.h>
70#endif 90#endif
71#include <linux/log2.h> 91#include <linux/log2.h>
72 92
@@ -99,8 +119,8 @@ struct jbd2_revoke_table_s
99#ifdef __KERNEL__ 119#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *, 120static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *, 121 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *); 122 struct jbd2_revoke_record_s *, int);
103static void flush_descriptor(journal_t *, struct journal_head *, int); 123static void flush_descriptor(journal_t *, struct journal_head *, int, int);
104#endif 124#endif
105 125
106/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
@@ -401,8 +421,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
401 * the second time we would still have a pending revoke to cancel. So, 421 * the second time we would still have a pending revoke to cancel. So,
402 * do not trust the Revoked bit on buffers unless RevokeValid is also 422 * do not trust the Revoked bit on buffers unless RevokeValid is also
403 * set. 423 * set.
404 *
405 * The caller must have the journal locked.
406 */ 424 */
407int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 425int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
408{ 426{
@@ -480,12 +498,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
480/* 498/*
481 * Write revoke records to the journal for all entries in the current 499 * Write revoke records to the journal for all entries in the current
482 * revoke hash, deleting the entries as we go. 500 * revoke hash, deleting the entries as we go.
483 *
484 * Called with the journal lock held.
485 */ 501 */
486
487void jbd2_journal_write_revoke_records(journal_t *journal, 502void jbd2_journal_write_revoke_records(journal_t *journal,
488 transaction_t *transaction) 503 transaction_t *transaction,
504 int write_op)
489{ 505{
490 struct journal_head *descriptor; 506 struct journal_head *descriptor;
491 struct jbd2_revoke_record_s *record; 507 struct jbd2_revoke_record_s *record;
@@ -509,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
509 hash_list->next; 525 hash_list->next;
510 write_one_revoke_record(journal, transaction, 526 write_one_revoke_record(journal, transaction,
511 &descriptor, &offset, 527 &descriptor, &offset,
512 record); 528 record, write_op);
513 count++; 529 count++;
514 list_del(&record->hash); 530 list_del(&record->hash);
515 kmem_cache_free(jbd2_revoke_record_cache, record); 531 kmem_cache_free(jbd2_revoke_record_cache, record);
516 } 532 }
517 } 533 }
518 if (descriptor) 534 if (descriptor)
519 flush_descriptor(journal, descriptor, offset); 535 flush_descriptor(journal, descriptor, offset, write_op);
520 jbd_debug(1, "Wrote %d revoke records\n", count); 536 jbd_debug(1, "Wrote %d revoke records\n", count);
521} 537}
522 538
@@ -529,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
529 transaction_t *transaction, 545 transaction_t *transaction,
530 struct journal_head **descriptorp, 546 struct journal_head **descriptorp,
531 int *offsetp, 547 int *offsetp,
532 struct jbd2_revoke_record_s *record) 548 struct jbd2_revoke_record_s *record,
549 int write_op)
533{ 550{
534 struct journal_head *descriptor; 551 struct journal_head *descriptor;
535 int offset; 552 int offset;
@@ -548,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
548 /* Make sure we have a descriptor with space left for the record */ 565 /* Make sure we have a descriptor with space left for the record */
549 if (descriptor) { 566 if (descriptor) {
550 if (offset == journal->j_blocksize) { 567 if (offset == journal->j_blocksize) {
551 flush_descriptor(journal, descriptor, offset); 568 flush_descriptor(journal, descriptor, offset, write_op);
552 descriptor = NULL; 569 descriptor = NULL;
553 } 570 }
554 } 571 }
@@ -593,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal,
593 610
594static void flush_descriptor(journal_t *journal, 611static void flush_descriptor(journal_t *journal,
595 struct journal_head *descriptor, 612 struct journal_head *descriptor,
596 int offset) 613 int offset, int write_op)
597{ 614{
598 jbd2_journal_revoke_header_t *header; 615 jbd2_journal_revoke_header_t *header;
599 struct buffer_head *bh = jh2bh(descriptor); 616 struct buffer_head *bh = jh2bh(descriptor);
@@ -608,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
608 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
609 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
610 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
611 ll_rw_block(SWRITE, 1, &bh); 628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
612} 629}
613#endif 630#endif
614 631
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598e..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 if (handle->h_sync)
1319 transaction->t_synchronous_commit = 1;
1318 current->journal_info = NULL; 1320 current->journal_info = NULL;
1319 spin_lock(&journal->j_state_lock); 1321 spin_lock(&journal->j_state_lock);
1320 spin_lock(&transaction->t_handle_lock); 1322 spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
38 size_t s; 38 size_t s;
39 39
40 size -= sizeof(struct jffs2_acl_header); 40 size -= sizeof(struct jffs2_acl_header);
41 s = size - 4 * sizeof(struct jffs2_acl_entry_short); 41 if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
42 if (s < 0) {
43 if (size % sizeof(struct jffs2_acl_entry_short)) 42 if (size % sizeof(struct jffs2_acl_entry_short))
44 return -1; 43 return -1;
45 return size / sizeof(struct jffs2_acl_entry_short); 44 return size / sizeof(struct jffs2_acl_entry_short);
46 } else { 45 } else {
46 s = size - 4 * sizeof(struct jffs2_acl_entry_short);
47 if (s % sizeof(struct jffs2_acl_entry)) 47 if (s % sizeof(struct jffs2_acl_entry))
48 return -1; 48 return -1;
49 return s / sizeof(struct jffs2_acl_entry) + 4; 49 return s / sizeof(struct jffs2_acl_entry) + 4;
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
336 return PTR_ERR(acl); 336 return PTR_ERR(acl);
337 337
338 if (!acl) { 338 if (!acl) {
339 *i_mode &= ~current->fs->umask; 339 *i_mode &= ~current_umask();
340 } else { 340 } else {
341 if (S_ISDIR(*i_mode)) 341 if (S_ISDIR(*i_mode))
342 jffs2_iset_acl(inode, &f->i_acl_default, acl); 342 jffs2_iset_acl(inode, &f->i_acl_default, acl);
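
The jffs2_acl_count() hunk fixes an unsigned-underflow bug: s is a size_t, so the old "s = size - 4 * sizeof(...); if (s < 0)" test could never fire; the subtraction wraps to a huge value instead. The reordered code compares before subtracting. A standalone model with made-up entry sizes:

#include <stdio.h>
#include <stddef.h>

/* Compare first, subtract second: the only safe order for unsigned
 * arithmetic. short_sz/long_sz stand in for the two jffs2 entry sizes. */
static int acl_entry_count(size_t size, size_t short_sz, size_t long_sz)
{
        if (size < 4 * short_sz) {
                if (size % short_sz)
                        return -1;
                return (int)(size / short_sz);
        }
        size -= 4 * short_sz;
        if (size % long_sz)
                return -1;
        return (int)(size / long_sz) + 4;
}

int main(void)
{
        printf("%d\n", acl_entry_count(8, 4, 8));   /* 2 short entries */
        printf("%d\n", acl_entry_count(40, 4, 8));  /* 4 short + 3 long = 7 */
        return 0;
}
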
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index c32b4a1ad6cf..a0244740b75a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -480,13 +480,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
480 return; 480 return;
481 481
482filebad: 482filebad:
483 mutex_lock(&c->erase_free_sem);
484 spin_lock(&c->erase_completion_lock);
485 /* Stick it on a list (any list) so erase_failed can take it
486 right off again. Silly, but shouldn't happen often. */
487 list_move(&jeb->list, &c->erasing_list);
488 spin_unlock(&c->erase_completion_lock);
489 mutex_unlock(&c->erase_free_sem);
490 jffs2_erase_failed(c, jeb, bad_offset); 483 jffs2_erase_failed(c, jeb, bad_offset);
491 return; 484 return;
492 485
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) 284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
285{ 285{
286 struct jffs2_xattr_datum *xd; 286 struct jffs2_xattr_datum *xd;
287 xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); 287 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
288 dbg_memalloc("%p\n", xd); 288 dbg_memalloc("%p\n", xd);
289 289
290 memset(xd, 0, sizeof(struct jffs2_xattr_datum));
291 xd->class = RAWNODE_CLASS_XATTR_DATUM; 290 xd->class = RAWNODE_CLASS_XATTR_DATUM;
292 xd->node = (void *)xd; 291 xd->node = (void *)xd;
293 INIT_LIST_HEAD(&xd->xindex); 292 INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
303struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) 302struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
304{ 303{
305 struct jffs2_xattr_ref *ref; 304 struct jffs2_xattr_ref *ref;
306 ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); 305 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
307 dbg_memalloc("%p\n", ref); 306 dbg_memalloc("%p\n", ref);
308 307
309 memset(ref, 0, sizeof(struct jffs2_xattr_ref));
310 ref->class = RAWNODE_CLASS_XATTR_REF; 308 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 309 ref->node = (void *)ref;
312 return ref; 310 return ref;
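
Both allocators above collapse a kmem_cache_alloc() + memset() pair into kmem_cache_zalloc(), which hands back pre-zeroed memory in one call. The userspace analogue is calloc():

#include <stdlib.h>

struct xattr_datum { int class; void *node; };

/* calloc() plays the role of kmem_cache_zalloc(): the returned object
 * is already zero-filled, so no separate memset() pass is needed. */
static struct xattr_datum *alloc_xattr_datum(void)
{
        struct xattr_datum *xd = calloc(1, sizeof(*xd));

        if (!xd)
                return NULL;
        xd->class = 1;   /* illustrative; real code uses RAWNODE_CLASS_XATTR_DATUM */
        xd->node = xd;
        return xd;
}

int main(void)
{
        free(alloc_xattr_datum());
        return 0;
}
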
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e82..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
182cleanup: 182cleanup:
183 posix_acl_release(acl); 183 posix_acl_release(acl);
184 } else 184 } else
185 inode->i_mode &= ~current->fs->umask; 185 inode->i_mode &= ~current_umask();
186 186
187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
188 inode->i_mode; 188 inode->i_mode;
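
This hunk, like the jffs2 one above and the namei ones below, swaps the open-coded current->fs->umask dereference for the current_umask() accessor; the masking arithmetic itself is unchanged. For reference, the operation in isolation:

#include <stdio.h>

int main(void)
{
        unsigned int mode = 0666;   /* requested permissions */
        unsigned int umask = 022;   /* what current_umask() would return */

        /* Every bit set in the mask is cleared from the mode:
         * 0666 & ~0022 == 0644, stripping group/other write. */
        printf("%04o\n", mode & ~umask);
        return 0;
}
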
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..80046ddf5063 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -246,8 +246,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
246 return 0; 246 return 0;
247 247
248Enomem: 248Enomem:
249 up_write(&s->s_umount); 249 deactivate_locked_super(s);
250 deactivate_super(s);
251 return -ENOMEM; 250 return -ENOMEM;
252} 251}
253 252
@@ -575,6 +574,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
575 * possibly a read which collects the result - which is stored in a 574 * possibly a read which collects the result - which is stored in a
576 * file-local buffer. 575 * file-local buffer.
577 */ 576 */
577
578void simple_transaction_set(struct file *file, size_t n)
579{
580 struct simple_transaction_argresp *ar = file->private_data;
581
582 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
583
584 /*
585 * The barrier ensures that ar->size will really remain zero until
586 * ar->data is ready for reading.
587 */
588 smp_mb();
589 ar->size = n;
590}
591
578char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 592char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
579{ 593{
580 struct simple_transaction_argresp *ar; 594 struct simple_transaction_argresp *ar;
@@ -820,6 +834,7 @@ EXPORT_SYMBOL(simple_sync_file);
820EXPORT_SYMBOL(simple_unlink); 834EXPORT_SYMBOL(simple_unlink);
821EXPORT_SYMBOL(simple_read_from_buffer); 835EXPORT_SYMBOL(simple_read_from_buffer);
822EXPORT_SYMBOL(memory_read_from_buffer); 836EXPORT_SYMBOL(memory_read_from_buffer);
837EXPORT_SYMBOL(simple_transaction_set);
823EXPORT_SYMBOL(simple_transaction_get); 838EXPORT_SYMBOL(simple_transaction_get);
824EXPORT_SYMBOL(simple_transaction_read); 839EXPORT_SYMBOL(simple_transaction_read);
825EXPORT_SYMBOL(simple_transaction_release); 840EXPORT_SYMBOL(simple_transaction_release);
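
simple_transaction_set() is a publish pattern: the response bytes are written into the shared buffer first, and only then does the store to ar->size make them visible, with the barrier forbidding reordering of the two. A C11-atomics model of the same ordering (buffer size arbitrary):

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

struct argresp {
        _Atomic size_t size;
        char data[64];
};

/* Fill the payload, then publish its length with release semantics --
 * the role smp_mb() plays in the kernel version. A reader that loads
 * size with acquire semantics and sees n != 0 also sees the data. */
static void transaction_set(struct argresp *ar, const char *msg, size_t n)
{
        memcpy(ar->data, msg, n);
        atomic_store_explicit(&ar->size, n, memory_order_release);
}

int main(void)
{
        struct argresp ar = { 0 };

        transaction_set(&ar, "ok", 2);
        printf("%zu\n", atomic_load_explicit(&ar.size, memory_order_acquire));
        return 0;
}
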
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c1..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
161 * to come from AF_INET6 remotes. The address of AF_INET remotes are
162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
191/* 142/*
192 * The server lockd has called us back to tell us the lock was granted 143 * The server lockd has called us back to tell us the lock was granted
193 */ 144 */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
215 */ 166 */
216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
217 continue; 168 continue;
218 if (!nlmclnt_cmp_addr(block->b_host, addr)) 169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
219 continue; 170 continue;
220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
221 continue; 172 continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18 18
19#include <asm/unaligned.h>
20
19#define NLMDBG_FACILITY NLMDBG_MONITOR 21#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024 22#define NSM_PROGRAM 100024
21#define NSM_VERSION 1 23#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
274{ 276{
275 u64 *p = (u64 *)&nsm->sm_priv.data; 277 u64 *p = (u64 *)&nsm->sm_priv.data;
276 struct timespec ts; 278 struct timespec ts;
279 s64 ns;
277 280
278 ktime_get_ts(&ts); 281 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts); 282 ns = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm; 283 put_unaligned(ns, p);
284 put_unaligned((unsigned long)nsm, p + 1);
281} 285}
282 286
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, 287static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
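
The mon.c hunk exists because sm_priv.data is a plain byte array with no alignment guarantee: storing a u64 through a cast pointer is undefined behaviour on strict-alignment architectures, and put_unaligned() emits a byte-safe store instead. The portable userspace equivalent is memcpy():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* memcpy() never assumes the destination is 8-byte aligned, which is
 * exactly the guarantee put_unaligned() provides in the kernel. */
static void put_unaligned_u64(uint64_t v, unsigned char *p)
{
        memcpy(p, &v, sizeof(v));
}

int main(void)
{
        unsigned char priv[16];    /* models nsm->sm_priv.data */
        uint64_t ns = 123456789;   /* the timestamp word */

        put_unaligned_u64(ns, priv);
        put_unaligned_u64((uintptr_t)priv, priv + 8);   /* the cookie word */
        printf("%02x %02x\n", priv[0], priv[8]);
        return 0;
}
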
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..1a54ae14a192 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
53unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
54 54
55/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
67 * These can be set at insmod time (useful for NFS as root filesystem), 56 * These can be set at insmod time (useful for NFS as root filesystem),
68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 57 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
69 */ 58 */
@@ -115,6 +104,16 @@ static void set_grace_period(void)
115 schedule_delayed_work(&grace_period_end, grace_period); 104 schedule_delayed_work(&grace_period_end, grace_period);
116} 105}
117 106
107static void restart_grace(void)
108{
109 if (nlmsvc_ops) {
110 cancel_delayed_work_sync(&grace_period_end);
111 locks_end_grace(&lockd_manager);
112 nlmsvc_invalidate_all();
113 set_grace_period();
114 }
115}
116
118/* 117/*
119 * This is the lockd kernel thread 118 * This is the lockd kernel thread
120 */ 119 */
@@ -160,10 +159,7 @@ lockd(void *vrqstp)
160 159
161 if (signalled()) { 160 if (signalled()) {
162 flush_signals(current); 161 flush_signals(current);
163 if (nlmsvc_ops) { 162 restart_grace();
164 nlmsvc_invalidate_all();
165 set_grace_period();
166 }
167 continue; 163 continue;
168 } 164 }
169 165
@@ -204,19 +200,30 @@ lockd(void *vrqstp)
204 return 0; 200 return 0;
205} 201}
206 202
207static int create_lockd_listener(struct svc_serv *serv, char *name, 203static int create_lockd_listener(struct svc_serv *serv, const char *name,
208 unsigned short port) 204 const int family, const unsigned short port)
209{ 205{
210 struct svc_xprt *xprt; 206 struct svc_xprt *xprt;
211 207
212 xprt = svc_find_xprt(serv, name, 0, 0); 208 xprt = svc_find_xprt(serv, name, family, 0);
213 if (xprt == NULL) 209 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); 210 return svc_create_xprt(serv, name, family, port,
215 211 SVC_SOCK_DEFAULTS);
216 svc_xprt_put(xprt); 212 svc_xprt_put(xprt);
217 return 0; 213 return 0;
218} 214}
219 215
216static int create_lockd_family(struct svc_serv *serv, const int family)
217{
218 int err;
219
220 err = create_lockd_listener(serv, "udp", family, nlm_udpport);
221 if (err < 0)
222 return err;
223
224 return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
225}
226
220/* 227/*
221 * Ensure there are active UDP and TCP listeners for lockd. 228 * Ensure there are active UDP and TCP listeners for lockd.
222 * 229 *
@@ -232,13 +239,15 @@ static int make_socks(struct svc_serv *serv)
232 static int warned; 239 static int warned;
233 int err; 240 int err;
234 241
235 err = create_lockd_listener(serv, "udp", nlm_udpport); 242 err = create_lockd_family(serv, PF_INET);
236 if (err < 0) 243 if (err < 0)
237 goto out_err; 244 goto out_err;
238 245
239 err = create_lockd_listener(serv, "tcp", nlm_tcpport); 246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240 if (err < 0) 247 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT)
241 goto out_err; 249 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
242 251
243 warned = 0; 252 warned = 0;
244 return 0; 253 return 0;
@@ -274,7 +283,7 @@ int lockd_up(void)
274 "lockd_up: no pid, %d users??\n", nlmsvc_users); 283 "lockd_up: no pid, %d users??\n", nlmsvc_users);
275 284
276 error = -ENOMEM; 285 error = -ENOMEM;
277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); 286 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
278 if (!serv) { 287 if (!serv) {
279 printk(KERN_WARNING "lockd_up: create service failed\n"); 288 printk(KERN_WARNING "lockd_up: create service failed\n");
280 goto out; 289 goto out;
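
make_socks() now builds listeners per address family instead of baking the family in at compile time: PF_INET is mandatory, PF_INET6 is attempted and quietly skipped when the kernel lacks IPv6. A self-contained sketch of that policy, with a hypothetical create_listener() standing in for svc_create_xprt() (here it always "fails" for IPv6 to exercise the tolerant path):

#include <errno.h>
#include <stdio.h>

#define SK_PF_INET   2    /* local stand-ins so the sketch compiles alone */
#define SK_PF_INET6 10

static int create_listener(const char *proto, int family, unsigned short port)
{
        (void)proto; (void)port;
        return family == SK_PF_INET6 ? -EAFNOSUPPORT : 0;
}

static int create_family(int family)
{
        int err = create_listener("udp", family, 0);

        if (err < 0)
                return err;
        return create_listener("tcp", family, 0);
}

int main(void)
{
        int err = create_family(SK_PF_INET);

        if (err < 0)
                return 1;
        /* IPv6 is opportunistic: EAFNOSUPPORT is tolerated, while any
         * other error is still a fatal setup failure. */
        err = create_family(SK_PF_INET6);
        if (err < 0 && err != -EAFNOSUPPORT)
                return 1;
        puts("listeners ready");
        return 0;
}
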
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
426 ret = nlm_granted; 426 ret = nlm_granted;
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 /*
430 * If this is a blocking request for an
431 * already pending lock request then we need
432 * to put it back on lockd's block list
433 */
434 if (wait)
435 break;
429 ret = nlm_lck_denied; 436 ret = nlm_lck_denied;
430 break; 437 goto out;
431 case FILE_LOCK_DEFERRED: 438 case FILE_LOCK_DEFERRED:
432 if (wait) 439 if (wait)
433 break; 440 break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 450 goto out;
444 } 451 }
445 452
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
450 ret = nlm_lck_blocked; 453 ret = nlm_lck_blocked;
451 454
452 /* Append to list of blocked */ 455 /* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128b..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
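
minix_statfs() now also fills in f_fsid, derived from the backing block device: the 64-bit huge_encode_dev() result is simply split across the fsid's two 32-bit words. The packing in isolation (the id value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t id = 0x0000000800000003ULL;   /* made-up encoded dev number */
        uint32_t val[2];

        val[0] = (uint32_t)id;           /* low 32 bits  */
        val[1] = (uint32_t)(id >> 32);   /* high 32 bits */
        printf("f_fsid = { 0x%08x, 0x%08x }\n", val[0], val[1]);
        return 0;
}
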
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..967c3db92724 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1129,8 +1130,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1129 * @nd: pointer to nameidata 1130 * @nd: pointer to nameidata
1130 * @open_flags: open intent flags 1131 * @open_flags: open intent flags
1131 */ 1132 */
1132int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, 1133static int path_lookup_open(int dfd, const char *name,
1133 struct nameidata *nd, int open_flags) 1134 unsigned int lookup_flags, struct nameidata *nd, int open_flags)
1134{ 1135{
1135 struct file *filp = get_empty_filp(); 1136 struct file *filp = get_empty_filp();
1136 int err; 1137 int err;
@@ -1247,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1247 int err; 1248 int err;
1248 struct qstr this; 1249 struct qstr this;
1249 1250
1251 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1252
1250 err = __lookup_one_len(name, &this, base, len); 1253 err = __lookup_one_len(name, &this, base, len);
1251 if (err) 1254 if (err)
1252 return ERR_PTR(err); 1255 return ERR_PTR(err);
@@ -1578,7 +1581,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1578 struct dentry *dir = nd->path.dentry; 1581 struct dentry *dir = nd->path.dentry;
1579 1582
1580 if (!IS_POSIXACL(dir->d_inode)) 1583 if (!IS_POSIXACL(dir->d_inode))
1581 mode &= ~current->fs->umask; 1584 mode &= ~current_umask();
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1585 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error) 1586 if (error)
1584 goto out_unlock; 1587 goto out_unlock;
@@ -1634,18 +1637,19 @@ static int open_will_write_to_fs(int flag, struct inode *inode)
1634 * open_to_namei_flags() for more details. 1637 * open_to_namei_flags() for more details.
1635 */ 1638 */
1636struct file *do_filp_open(int dfd, const char *pathname, 1639struct file *do_filp_open(int dfd, const char *pathname,
1637 int open_flag, int mode) 1640 int open_flag, int mode, int acc_mode)
1638{ 1641{
1639 struct file *filp; 1642 struct file *filp;
1640 struct nameidata nd; 1643 struct nameidata nd;
1641 int acc_mode, error; 1644 int error;
1642 struct path path; 1645 struct path path;
1643 struct dentry *dir; 1646 struct dentry *dir;
1644 int count = 0; 1647 int count = 0;
1645 int will_write; 1648 int will_write;
1646 int flag = open_to_namei_flags(open_flag); 1649 int flag = open_to_namei_flags(open_flag);
1647 1650
1648 acc_mode = MAY_OPEN | ACC_MODE(flag); 1651 if (!acc_mode)
1652 acc_mode = MAY_OPEN | ACC_MODE(flag);
1649 1653
1650 /* O_TRUNC implies we need access checks for write permissions */ 1654 /* O_TRUNC implies we need access checks for write permissions */
1651 if (flag & O_TRUNC) 1655 if (flag & O_TRUNC)
@@ -1866,7 +1870,7 @@ do_link:
1866 */ 1870 */
1867struct file *filp_open(const char *filename, int flags, int mode) 1871struct file *filp_open(const char *filename, int flags, int mode)
1868{ 1872{
1869 return do_filp_open(AT_FDCWD, filename, flags, mode); 1873 return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
1870} 1874}
1871EXPORT_SYMBOL(filp_open); 1875EXPORT_SYMBOL(filp_open);
1872 1876
@@ -1989,7 +1993,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1989 goto out_unlock; 1993 goto out_unlock;
1990 } 1994 }
1991 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1995 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1992 mode &= ~current->fs->umask; 1996 mode &= ~current_umask();
1993 error = may_mknod(mode); 1997 error = may_mknod(mode);
1994 if (error) 1998 if (error)
1995 goto out_dput; 1999 goto out_dput;
@@ -2067,7 +2071,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2067 goto out_unlock; 2071 goto out_unlock;
2068 2072
2069 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2073 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2070 mode &= ~current->fs->umask; 2074 mode &= ~current_umask();
2071 error = mnt_want_write(nd.path.mnt); 2075 error = mnt_want_write(nd.path.mnt);
2072 if (error) 2076 if (error)
2073 goto out_dput; 2077 goto out_dput;
@@ -2897,10 +2901,3 @@ EXPORT_SYMBOL(vfs_symlink);
2897EXPORT_SYMBOL(vfs_unlink); 2901EXPORT_SYMBOL(vfs_unlink);
2898EXPORT_SYMBOL(dentry_unhash); 2902EXPORT_SYMBOL(dentry_unhash);
2899EXPORT_SYMBOL(generic_readlink); 2903EXPORT_SYMBOL(generic_readlink);
2900
2901/* to be mentioned only in INIT_TASK */
2902struct fs_struct init_fs = {
2903 .count = ATOMIC_INIT(1),
2904 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2905 .umask = 0022,
2906};
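
do_filp_open() gains an acc_mode argument where 0 means "derive the access mode from the open flags", so existing callers such as filp_open() pass 0 and behave as before, while new callers can impose a mode of their own. The idiom in isolation (ACC_MODE copied from the file; the MAY_OPEN value is an assumption for the sketch, and flag is the namei-adjusted value where O_RDONLY arrives as 1):

#include <stdio.h>

#define O_ACCMODE 0003
#define MAY_OPEN  0x20    /* assumed value for this sketch */
#define ACC_MODE(x) ("\000\004\002\006"[(x) & O_ACCMODE])

/* Zero acts as "no caller preference", exactly as in the reworked
 * do_filp_open() prologue. */
static int effective_acc_mode(int flag, int acc_mode)
{
        if (!acc_mode)
                acc_mode = MAY_OPEN | ACC_MODE(flag);
        return acc_mode;
}

int main(void)
{
        printf("derived (read-only)  -> %#x\n", effective_acc_mode(1, 0));
        printf("caller-supplied mode -> %#x\n", effective_acc_mode(1, 0x4));
        return 0;
}
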
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e96027..134d494158d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -694,12 +695,16 @@ static inline void mangle(struct seq_file *m, const char *s)
694 */ 695 */
695int generic_show_options(struct seq_file *m, struct vfsmount *mnt) 696int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
696{ 697{
697 const char *options = mnt->mnt_sb->s_options; 698 const char *options;
699
700 rcu_read_lock();
701 options = rcu_dereference(mnt->mnt_sb->s_options);
698 702
699 if (options != NULL && options[0]) { 703 if (options != NULL && options[0]) {
700 seq_putc(m, ','); 704 seq_putc(m, ',');
701 mangle(m, options); 705 mangle(m, options);
702 } 706 }
707 rcu_read_unlock();
703 708
704 return 0; 709 return 0;
705} 710}
@@ -720,11 +725,22 @@ EXPORT_SYMBOL(generic_show_options);
720 */ 725 */
721void save_mount_options(struct super_block *sb, char *options) 726void save_mount_options(struct super_block *sb, char *options)
722{ 727{
723 kfree(sb->s_options); 728 BUG_ON(sb->s_options);
724 sb->s_options = kstrdup(options, GFP_KERNEL); 729 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
725} 730}
726EXPORT_SYMBOL(save_mount_options); 731EXPORT_SYMBOL(save_mount_options);
727 732
733void replace_mount_options(struct super_block *sb, char *options)
734{
735 char *old = sb->s_options;
736 rcu_assign_pointer(sb->s_options, options);
737 if (old) {
738 synchronize_rcu();
739 kfree(old);
740 }
741}
742EXPORT_SYMBOL(replace_mount_options);
743
728#ifdef CONFIG_PROC_FS 744#ifdef CONFIG_PROC_FS
729/* iterator */ 745/* iterator */
730static void *m_start(struct seq_file *m, loff_t *pos) 746static void *m_start(struct seq_file *m, loff_t *pos)
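
save_mount_options() and the new replace_mount_options() pair with the RCU read side in generic_show_options() above: readers dereference sb->s_options locklessly, so an update must publish the new string first and free the old one only after a grace period. The shape of the update, modeled with C11 atomics and a stubbed grace period:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static _Atomic(char *) s_options;   /* read locklessly, like sb->s_options */

/* Stub standing in for synchronize_rcu(): in the kernel this blocks
 * until every reader that might still hold the old pointer is done. */
static void grace_period(void)
{
}

static void replace_options(char *new_opts)
{
        char *old = atomic_exchange_explicit(&s_options, new_opts,
                                             memory_order_release);

        if (old) {
                grace_period();   /* only now is freeing the old copy safe */
                free(old);
        }
}

int main(void)
{
        replace_options(strdup("rw,relatime"));
        replace_options(strdup("ro"));
        printf("%s\n", atomic_load_explicit(&s_options, memory_order_acquire));
        free(atomic_load_explicit(&s_options, memory_order_acquire));
        return 0;
}
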
@@ -1072,9 +1088,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1072 */ 1088 */
1073 1089
1074 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1090 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1075 lock_kernel();
1076 sb->s_op->umount_begin(sb); 1091 sb->s_op->umount_begin(sb);
1077 unlock_kernel();
1078 } 1092 }
1079 1093
1080 /* 1094 /*
@@ -1376,7 +1390,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1376 if (parent_path) { 1390 if (parent_path) {
1377 detach_mnt(source_mnt, parent_path); 1391 detach_mnt(source_mnt, parent_path);
1378 attach_mnt(source_mnt, path); 1392 attach_mnt(source_mnt, path);
1379 touch_mnt_namespace(current->nsproxy->mnt_ns); 1393 touch_mnt_namespace(parent_path->mnt->mnt_ns);
1380 } else { 1394 } else {
1381 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 1395 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
1382 commit_tree(source_mnt); 1396 commit_tree(source_mnt);
@@ -1919,8 +1933,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1919 if (data_page) 1933 if (data_page)
1920 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1934 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1921 1935
1922 /* Default to relatime */ 1936 /* Default to relatime unless overridden */
1923 mnt_flags |= MNT_RELATIME; 1937 if (!(flags & MS_NOATIME))
1938 mnt_flags |= MNT_RELATIME;
1924 1939
1925 /* Separate the per-mountpoint flags */ 1940 /* Separate the per-mountpoint flags */
1926 if (flags & MS_NOSUID) 1941 if (flags & MS_NOSUID)
@@ -2093,66 +2108,6 @@ out1:
2093} 2108}
2094 2109
2095/* 2110/*
2096 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2097 * It can block. Requires the big lock held.
2098 */
2099void set_fs_root(struct fs_struct *fs, struct path *path)
2100{
2101 struct path old_root;
2102
2103 write_lock(&fs->lock);
2104 old_root = fs->root;
2105 fs->root = *path;
2106 path_get(path);
2107 write_unlock(&fs->lock);
2108 if (old_root.dentry)
2109 path_put(&old_root);
2110}
2111
2112/*
2113 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2114 * It can block. Requires the big lock held.
2115 */
2116void set_fs_pwd(struct fs_struct *fs, struct path *path)
2117{
2118 struct path old_pwd;
2119
2120 write_lock(&fs->lock);
2121 old_pwd = fs->pwd;
2122 fs->pwd = *path;
2123 path_get(path);
2124 write_unlock(&fs->lock);
2125
2126 if (old_pwd.dentry)
2127 path_put(&old_pwd);
2128}
2129
2130static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2131{
2132 struct task_struct *g, *p;
2133 struct fs_struct *fs;
2134
2135 read_lock(&tasklist_lock);
2136 do_each_thread(g, p) {
2137 task_lock(p);
2138 fs = p->fs;
2139 if (fs) {
2140 atomic_inc(&fs->count);
2141 task_unlock(p);
2142 if (fs->root.dentry == old_root->dentry
2143 && fs->root.mnt == old_root->mnt)
2144 set_fs_root(fs, new_root);
2145 if (fs->pwd.dentry == old_root->dentry
2146 && fs->pwd.mnt == old_root->mnt)
2147 set_fs_pwd(fs, new_root);
2148 put_fs_struct(fs);
2149 } else
2150 task_unlock(p);
2151 } while_each_thread(g, p);
2152 read_unlock(&tasklist_lock);
2153}
2154
2155/*
2156 * pivot_root Semantics: 2111 * pivot_root Semantics:
2157 * Moves the root file system of the current process to the directory put_old, 2112 * Moves the root file system of the current process to the directory put_old,
2158 * makes new_root as the new root file system of the current process, and sets 2113 * makes new_root as the new root file system of the current process, and sets
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f50a9c..fa038df63ac8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) 660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
661 return -ENOMEM; 661 return -ENOMEM;
662 if (user.object_name_len) { 662 if (user.object_name_len) {
663 newname = kmalloc(user.object_name_len, GFP_USER); 663 newname = memdup_user(user.object_name,
664 if (!newname) 664 user.object_name_len);
665 return -ENOMEM; 665 if (IS_ERR(newname))
666 if (copy_from_user(newname, user.object_name, user.object_name_len)) { 666 return PTR_ERR(newname);
667 kfree(newname);
668 return -EFAULT;
669 }
670 } else { 667 } else {
671 newname = NULL; 668 newname = NULL;
672 } 669 }
@@ -760,13 +757,9 @@ outrel:
760 if (user.len > NCP_PRIVATE_DATA_MAX_LEN) 757 if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
761 return -ENOMEM; 758 return -ENOMEM;
762 if (user.len) { 759 if (user.len) {
763 new = kmalloc(user.len, GFP_USER); 760 new = memdup_user(user.data, user.len);
764 if (!new) 761 if (IS_ERR(new))
765 return -ENOMEM; 762 return PTR_ERR(new);
766 if (copy_from_user(new, user.data, user.len)) {
767 kfree(new);
768 return -EFAULT;
769 }
770 } else { 763 } else {
771 new = NULL; 764 new = NULL;
772 } 765 }
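
Both ncpfs hunks replace the kmalloc() + copy_from_user() + hand-rolled error unwind with memdup_user(), which allocates, copies and reports failure through an ERR_PTR in one call. A userspace model, with memcpy() standing in for copy_from_user() and minimal ERR_PTR helpers so it compiles alone:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ERR_PTR(e) ((void *)(intptr_t)(e))
#define IS_ERR(p)  ((uintptr_t)(p) >= (uintptr_t)-4095)
#define PTR_ERR(p) ((long)(intptr_t)(p))

/* One call does allocate + copy + error reporting, so callers lose the
 * two separate failure branches the old code needed. */
static void *memdup_user_sketch(const void *uptr, size_t len)
{
        void *p = malloc(len);

        if (!p)
                return ERR_PTR(-ENOMEM);
        memcpy(p, uptr, len);   /* stands in for copy_from_user() */
        return p;
}

int main(void)
{
        const char name[] = "object";
        char *copy = memdup_user_sketch(name, sizeof(name));

        if (IS_ERR(copy))
                return (int)-PTR_ERR(copy);
        printf("%s\n", copy);
        free(copy);
        return 0;
}
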
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
94 the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
38 38
39unsigned int nfs_callback_set_tcpport; 39unsigned int nfs_callback_set_tcpport;
40unsigned short nfs_callback_tcpport; 40unsigned short nfs_callback_tcpport;
41unsigned short nfs_callback_tcpport6;
41static const int nfs_set_port_min = 0; 42static const int nfs_set_port_min = 0;
42static const int nfs_set_port_max = 65535; 43static const int nfs_set_port_max = 65535;
43 44
44/*
45 * If the kernel has IPv6 support available, always listen for
46 * both AF_INET and AF_INET6 requests.
47 */
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49static const sa_family_t nfs_callback_family = AF_INET6;
50#else
51static const sa_family_t nfs_callback_family = AF_INET;
52#endif
53
54static int param_set_port(const char *val, struct kernel_param *kp) 45static int param_set_port(const char *val, struct kernel_param *kp)
55{ 46{
56 char *endp; 47 char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
116 mutex_lock(&nfs_callback_mutex); 107 mutex_lock(&nfs_callback_mutex);
117 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
118 goto out; 109 goto out;
119 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, 110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
120 nfs_callback_family, NULL);
121 ret = -ENOMEM; 111 ret = -ENOMEM;
122 if (!serv) 112 if (!serv)
123 goto out_err; 113 goto out_err;
124 114
125 ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, 115 ret = svc_create_xprt(serv, "tcp", PF_INET,
126 SVC_SOCK_ANONYMOUS); 116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
127 if (ret <= 0) 117 if (ret <= 0)
128 goto out_err; 118 goto out_err;
129 nfs_callback_tcpport = ret; 119 nfs_callback_tcpport = ret;
130 dprintk("NFS: Callback listener port = %u (af %u)\n", 120 dprintk("NFS: Callback listener port = %u (af %u)\n",
131 nfs_callback_tcpport, nfs_callback_family); 121 nfs_callback_tcpport, PF_INET);
122
123#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
124 ret = svc_create_xprt(serv, "tcp", PF_INET6,
125 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
126 if (ret > 0) {
127 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT)
131 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
132 133
133 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
134 if (IS_ERR(nfs_callback_info.rqst)) { 135 if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
72 72
73extern unsigned int nfs_callback_set_tcpport; 73extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 74extern unsigned short nfs_callback_tcpport;
75extern unsigned short nfs_callback_tcpport6;
75 76
76#endif /* __LINUX_FS_NFS_CALLBACK_H */ 77#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e7..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -224,38 +229,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
-	switch (sa->sa_family) {
-		default:
-			return NULL;
-		case AF_INET6:
-			return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-			break;
-		case AF_INET:
-			ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-					addr_mapped);
-			return addr_mapped;
-	}
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				     const struct sockaddr *sa2)
-{
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
-	return 0;
-}
-
 /*
  * Test if two ip6 socket addresses refer to the same socket by
  * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +240,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET6.
  */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-	if (!ipv6_addr_equal(&saddr1->sin6_addr,
-				&saddr1->sin6_addr))
-		return 0;
-	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-			saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    sin1->sin6_scope_id != sin2->sin6_scope_id)
 		return 0;
-	return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-				      const struct sockaddr_in *sa2)
-{
-	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
 
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				     const struct sockaddr *sa2)
-{
-	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
-		return 0;
-	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-			(const struct sockaddr_in *)sa2);
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
 }
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
-				const struct sockaddr * sa2)
+#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -311,20 +267,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
 
-	if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
 		return 0;
-	return saddr1->sin_port == saddr2->sin_port;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
 }
 
 /*
  * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
  */
 static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
 			    const struct sockaddr *sa2)
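The rework above retires the v4-mapped comparison helpers: nfs_sockaddr_match_ipaddr() now rejects mismatched address families outright and dispatches to a per-family helper, with the port checked only by the nfs_sockaddr_cmp*() wrappers. A hedged illustration of the resulting semantics (addresses invented; the helpers are static to fs/nfs/client.c, so this could only live there):

	struct sockaddr_in a = { .sin_family = AF_INET };
	struct sockaddr_in6 b = { .sin6_family = AF_INET6 };

	a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1 */
	b.sin6_addr = in6addr_loopback;			/* ::1 */

	/* Different families now short-circuit to 0, whereas the old code
	 * would have matched ::ffff:127.0.0.1 against 127.0.0.1. */
	WARN_ON(nfs_sockaddr_match_ipaddr((struct sockaddr *)&a,
					  (struct sockaddr *)&b));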
@@ -772,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
+	server->options = data->options;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1160,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->options = data->options;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -1571,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 
 	/* display header on line 1 */
 	if (v == &nfs_volume_list) {
-		seq_puts(m, "NV SERVER   PORT DEV     FSID\n");
+		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
 		return 0;
 	}
 	/* display one transport per line on subsequent lines */
@@ -1585,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
-	seq_printf(m, "v%u %s %s %-7s %-17s\n",
+	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
 		   dev,
-		   fsid);
+		   fsid,
+		   nfs_server_fscache_state(server));
 
 	return 0;
 }
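With the new FSC column, nfs_volume_list_show() reports the per-server cache state from nfs_server_fscache_state() (defined in fs/nfs/fscache.h below). Illustrative /proc/fs/nfsfs/volumes output with the column in place (all values invented):

	NV SERVER   PORT DEV     FSID              FSC
	v3 c0a80001  801 0:18    1:0               no 
	v4 c0a80002  801 0:20    3:0               yes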
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db3..89f98e9a024b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		} else if (atomic_read(&new_dentry->d_count) > 1)
 			/* dentry still busy? */
 			goto out;
-	} else
-		nfs_drop_nlink(new_inode);
+	}
 
 go_ahead:
 	/*
@@ -1638,10 +1637,8 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode != NULL) {
+	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
-		d_delete(new_dentry);
-	}
 
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
@@ -1944,7 +1943,8 @@ int nfs_permission(struct inode *inode, int mask)
 	case S_IFREG:
 		/* NFSv4 has atomic_open... */
 		if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
-				&& (mask & MAY_OPEN))
+				&& (mask & MAY_OPEN)
+				&& !(mask & MAY_EXEC))
 			goto out;
 		break;
 	case S_IFDIR:
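Across the three nfs_rename() hunks, d_delete() on the target is gone and nfs_drop_nlink() moves from the pre-rename path into the !error branch, so the target's link count is only dropped once the server has actually replaced it. The resulting tail of nfs_rename(), paraphrased from the hunks above:

	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
					   new_dir, &new_dentry->d_name);
	...
	if (!error) {
		if (new_inode != NULL)
			nfs_drop_nlink(new_inode);	/* target really unlinked */
		d_move(old_dentry, new_dentry);
	}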
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d2..ec7e27d00bc6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
@@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
-#ifdef CONFIG_MMU
 	.mmap		= nfs_file_mmap,
-#else
-	.mmap		= generic_file_mmap,
-#endif
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
 	.release	= nfs_file_release,
@@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
-	/* Ensure that dirty pages are flushed out with the right creds */
-	if (filp->f_mode & FMODE_WRITE)
-		nfs_wb_all(dentry->d_inode);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
 }
@@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
-	int		status;
 
 	dprintk("NFS: flush(%s/%s)\n",
 			dentry->d_parent->d_name.name,
@@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
-	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_do_fsync(ctx, inode);
-	if (!status)
-		nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	return status;
+	/* Flush writes to the server and return any errors */
+	return nfs_do_fsync(ctx, inode);
 }
 
 static ssize_t
@@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dprintk("NFS: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_mapping(inode, file->f_mapping);
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *	 so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
 	if (!status) {
 		vma->vm_ops = &nfs_file_vm_ops;
-		vma->vm_flags |= VM_CAN_NONLINEAR;
-		file_accessed(file);
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
 	}
 	return status;
 }
@@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			file->f_path.dentry->d_name.name,
 			mapping->host->i_ino, len, (long long) pos);
 
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
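nfs_write_begin() now blocks behind the NFS_INO_FLUSHING bit using nfs_wait_bit_killable(), which the fs/nfs/inode.c hunk further down introduces; the wait aborts with -ERESTARTSYS on a fatal signal. The same killable bit-wait pattern in isolation (MY_BIT and some_flags are hypothetical, not from this patch):

	int ret;

	ret = wait_on_bit(&some_flags, MY_BIT,
			  nfs_wait_bit_killable, TASK_KILLABLE);
	if (ret)
		return ret;	/* -ERESTARTSYS: a fatal signal arrived */
	/* bit is clear (or was never set); safe to proceed */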
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ *   page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
 	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page->mapping->host, page);
+
+	nfs_fscache_invalidate_page(page, page->mapping->host);
 }
 
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
 	/* If PagePrivate() is set, then the page is not freeable */
-	return 0;
+	if (PagePrivate(page))
+		return 0;
+	return nfs_fscache_release_page(page, gfp);
 }
 
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
 static int nfs_launder_page(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
 
 	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
 		inode->i_ino, (long long)page_offset(page));
 
+	nfs_fscache_wait_on_page_write(nfsi, page);
 	return nfs_wb_page(inode, page);
 }
 
@@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 };
 
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
@@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
+	/* make sure the cache has finished storing the page */
+	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
 	lock_page(page);
 	mapping = page->mapping;
 	if (mapping != dentry->d_inode->i_mapping)
@@ -479,11 +516,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out_unlock;
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
-	if (ret == 0)
-		ret = pagelen;
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	return ret;
+	return VM_FAULT_SIGBUS;
 }
 
 static struct vm_operations_struct nfs_file_vm_ops = {
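The ->page_mkwrite() change above adopts the vm_fault-based calling convention: the page arrives in vmf->page, and success is reported as VM_FAULT_LOCKED with the page still locked, rather than as a byte count. A stripped-down sketch of that contract (prepare_page_for_write() is hypothetical; this is not the NFS implementation):

	static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != vma->vm_file->f_mapping)
			goto out_unlock;
		if (prepare_page_for_write(page) == 0)
			return VM_FAULT_LOCKED;	/* success: page stays locked */
	out_unlock:
		unlock_page(page);
		return VM_FAULT_SIGBUS;
	}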
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
28 * index can than have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77	memset(key, 0, len);
 78
 79	key->nfsversion = clp->rpc_ops->version;
 80	key->family = clp->cl_addr.ss_family;
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
199 * - Should return the amount of amount stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
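Taken together, the cookie definitions above give NFS a three-level index: the netfs's primary index holds server objects, servers hold superblocks, and superblocks hold per-inode data objects. A compressed sketch of how they are chained via fscache_acquire_cookie(), error handling omitted (clp, nfss and nfsi stand for the usual nfs_client, nfs_server and nfs_inode pointers; the real call sites are in fs/nfs/fscache.c below):

	struct fscache_cookie *server, *super, *object;

	server = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
					&nfs_fscache_server_index_def, clp);
	super  = fscache_acquire_cookie(server,
					&nfs_fscache_super_index_def, nfss);
	object = fscache_acquire_cookie(super,
					&nfs_fscache_inode_object_def, nfsi);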
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try and get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off the cache with regard to a per-inode cookie if opened for writing,
233 * invalidating all the pages in the page cache relating to the associated
234 * inode to clear the per-page caching.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by parallel nfs_open() functions.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322 nfss->nfs_client->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397 /* if the read completes with an error, we just unlock the page and let
398 * the VM reissue the readpage */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
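nfs_fscache_get_super_cookie() above serialises superblock keys through an rb-tree so that two superblocks with identical parameters cannot both bind to the cache unless a uniquifier tells them apart. The ordering it walks the tree with, condensed into a single hedged comparator (nfs_fscache_key_cmp is not a function in this patch):

	static int nfs_fscache_key_cmp(const struct nfs_fscache_key *a,
				       const struct nfs_fscache_key *b)
	{
		int diff;

		if (a->nfs_client != b->nfs_client)
			return a->nfs_client < b->nfs_client ? -1 : 1;
		/* the fixed block includes uniq_len, so equal blocks imply
		 * equal-length uniquifiers */
		diff = memcmp(&a->key, &b->key, sizeof(a->key));
		if (diff)
			return diff;
		return memcmp(a->key.uniquifier, b->key.uniquifier,
			      a->key.uniq_len);
	}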
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
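The #else half of this header keeps every call site building when CONFIG_NFS_FSCACHE is off: hooks collapse to no-ops, the read paths report -ENOBUFS so callers fall back to the server, and release defaults to "may free". The same stub pattern in miniature (names hypothetical):

	#ifdef CONFIG_FOO_CACHE
	extern int foo_cache_read(struct inode *inode, struct page *page);
	#else
	static inline int foo_cache_read(struct inode *inode, struct page *page)
	{
		return -ENOBUFS;	/* caller falls back to the server path */
	}
	#endif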
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " getroot encountered non-directory\n");
 		return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " lookupfh encountered non-directory\n");
 		return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 }
 
 /**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+/**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
  *
@@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
+	nfs_fscache_release_inode_cookie(inode);
 }
 
 /**
@@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 	struct inode *inode = ERR_PTR(-ENOENT);
 	unsigned long hash;
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
-
-	if (!fattr->nlink) {
-		printk("NFS: Buggy server - nlink == 0!\n");
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
-	}
 
 	hash = nfs_fattr_to_ino_t(fattr);
 
@@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				&& fattr->size <= NFS_LIMIT_READDIRPLUS)
 			set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 		/* Deal with crossing mountpoints */
-		if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+		if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+				&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 			if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 				inode->i_op = &nfs_referral_inode_operations;
 			else
@@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
304 else 316 else
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 317 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 318
319 memset(&inode->i_atime, 0, sizeof(inode->i_atime));
320 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
321 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
322 nfsi->change_attr = 0;
323 inode->i_size = 0;
324 inode->i_nlink = 0;
325 inode->i_uid = -2;
326 inode->i_gid = -2;
327 inode->i_blocks = 0;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329
307 nfsi->read_cache_jiffies = fattr->time_start; 330 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->attr_gencount = fattr->gencount; 331 nfsi->attr_gencount = fattr->gencount;
309 inode->i_atime = fattr->atime; 332 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
310 inode->i_mtime = fattr->mtime; 333 inode->i_atime = fattr->atime;
311 inode->i_ctime = fattr->ctime; 334 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
312 if (fattr->valid & NFS_ATTR_FATTR_V4) 335 inode->i_mtime = fattr->mtime;
336 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
337 inode->i_ctime = fattr->ctime;
338 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
313 nfsi->change_attr = fattr->change_attr; 339 nfsi->change_attr = fattr->change_attr;
314 inode->i_size = nfs_size_to_loff_t(fattr->size); 340 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
315 inode->i_nlink = fattr->nlink; 341 inode->i_size = nfs_size_to_loff_t(fattr->size);
316 inode->i_uid = fattr->uid; 342 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
317 inode->i_gid = fattr->gid; 343 inode->i_nlink = fattr->nlink;
318 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 344 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
345 inode->i_uid = fattr->uid;
346 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
347 inode->i_gid = fattr->gid;
348 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
349 inode->i_blocks = fattr->du.nfs2.blocks;
350 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
319 /* 351 /*
320 * report the blocks in 512byte units 352 * report the blocks in 512byte units
321 */ 353 */
322 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 354 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
323 } else {
324 inode->i_blocks = fattr->du.nfs2.blocks;
325 } 355 }
326 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 356 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
327 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
330 359
360 nfs_fscache_init_inode_cookie(inode);
361
331 unlock_new_inode(inode); 362 unlock_new_inode(inode);
332 } else 363 } else
333 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
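The nfs_fhget() rework above replaces the all-or-nothing NFS_ATTR_FATTR test with per-field NFS_ATTR_FATTR_* validity bits: the inode is first reset to conservative defaults, then each field is copied only when the server actually supplied it. A minimal user-space sketch of that pattern; the flag values and struct layouts are illustrative, not the kernel's:

#include <stdint.h>
#include <string.h>

#define ATTR_SIZE  (1u << 0)
#define ATTR_NLINK (1u << 1)
#define ATTR_OWNER (1u << 2)

struct fake_fattr { uint32_t valid; uint64_t size; uint32_t nlink, uid; };
struct fake_inode { uint64_t size; uint32_t nlink, uid; };

static void apply_fattr(struct fake_inode *ino, const struct fake_fattr *f)
{
	/* Reset to conservative defaults first, as the patched nfs_fhget() does... */
	memset(ino, 0, sizeof(*ino));
	ino->uid = (uint32_t)-2;	/* "nobody", matching the -2 in the hunk */

	/* ...then apply only the fields whose validity bit the server set. */
	if (f->valid & ATTR_SIZE)
		ino->size = f->size;
	if (f->valid & ATTR_NLINK)
		ino->nlink = f->nlink;
	if (f->valid & ATTR_OWNER)
		ino->uid = f->uid;
}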
@@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
514 return err; 545 return err;
515} 546}
516 547
548/**
549 * nfs_close_context - Common close_context() routine NFSv2/v3
550 * @ctx: pointer to context
551 * @is_sync: is this a synchronous close
552 *
553 * always ensure that the attributes are up to date if we're mounted
554 * with close-to-open semantics
555 */
556void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
557{
558 struct inode *inode;
559 struct nfs_server *server;
560
561 if (!(ctx->mode & FMODE_WRITE))
562 return;
563 if (!is_sync)
564 return;
565 inode = ctx->path.dentry->d_inode;
566 if (!list_empty(&NFS_I(inode)->open_files))
567 return;
568 server = NFS_SERVER(inode);
569 if (server->flags & NFS_MOUNT_NOCTO)
570 return;
571 nfs_revalidate_inode(server, inode);
572}
573
517static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 574static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
518{ 575{
519 struct nfs_open_context *ctx; 576 struct nfs_open_context *ctx;
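nfs_close_context(), added above, implements close-to-open cache consistency for v2/v3: the last synchronous close of a file that was open for write triggers a revalidation (a GETATTR) so the next opener sees fresh attributes. A self-contained sketch of the same early-return chain; the types and flag values are invented for illustration:

#include <stdbool.h>

#define FMODE_WRITE_BIT	0x2
#define MOUNT_NOCTO_BIT	0x1

struct open_ctx { unsigned mode; int open_files; unsigned mount_flags; };

static bool needs_cto_revalidate(const struct open_ctx *c, int is_sync)
{
	if (!(c->mode & FMODE_WRITE_BIT))	/* readers cannot have dirtied it */
		return false;
	if (!is_sync)				/* async close: skip the round trip */
		return false;
	if (c->open_files != 0)			/* not the last close of the inode */
		return false;
	if (c->mount_flags & MOUNT_NOCTO_BIT)	/* "nocto" mount opts out */
		return false;
	return true;				/* revalidate attributes now */
}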
@@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
540 return ctx; 597 return ctx;
541} 598}
542 599
543static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) 600static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
544{ 601{
545 struct inode *inode; 602 struct inode *inode = ctx->path.dentry->d_inode;
546
547 if (ctx == NULL)
548 return;
549 603
550 inode = ctx->path.dentry->d_inode;
551 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 604 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
552 return; 605 return;
553 list_del(&ctx->list); 606 list_del(&ctx->list);
554 spin_unlock(&inode->i_lock); 607 spin_unlock(&inode->i_lock);
555 if (ctx->state != NULL) { 608 NFS_PROTO(inode)->close_context(ctx, is_sync);
556 if (wait)
557 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
558 else
559 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
560 }
561 if (ctx->cred != NULL) 609 if (ctx->cred != NULL)
562 put_rpccred(ctx->cred); 610 put_rpccred(ctx->cred);
563 path_put(&ctx->path); 611 path_put(&ctx->path);
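The __put_nfs_open_context() hunk swaps an inline NFSv4-only branch for a close_context hook on the per-version rpc_ops table, so the generic code can call one hook without version checks. A trimmed illustration of that function-pointer dispatch; the real nfs_rpc_ops carries many more hooks than shown here:

struct open_context;	/* opaque to the generic layer */

struct rpc_ops_sketch {
	void (*close_context)(struct open_context *ctx, int is_sync);
};

static void v3_close(struct open_context *ctx, int is_sync)
{
	(void)ctx; (void)is_sync;	/* would do the CTO revalidation */
}

static void v4_close(struct open_context *ctx, int is_sync)
{
	(void)ctx; (void)is_sync;	/* would close the NFSv4 state */
}

static const struct rpc_ops_sketch v3_ops = { .close_context = v3_close };
static const struct rpc_ops_sketch v4_ops = { .close_context = v4_close };
/* generic code: ops->close_context(ctx, is_sync), no version test needed */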
@@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
642 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
643 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
644 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
645 return 0; 694 return 0;
646} 695}
647 696
@@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
670 if (NFS_STALE(inode)) 719 if (NFS_STALE(inode))
671 goto out; 720 goto out;
672 721
673 if (NFS_STALE(inode))
674 goto out;
675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 722 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 723 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
678 if (status != 0) { 724 if (status != 0) {
@@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
745 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
746 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
747 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
748 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
749 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
750 return 0; 797 return 0;
@@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
815{ 862{
816 struct nfs_inode *nfsi = NFS_I(inode); 863 struct nfs_inode *nfsi = NFS_I(inode);
817 864
818 if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && 865 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
819 nfsi->change_attr == fattr->pre_change_attr) { 866 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
867 && nfsi->change_attr == fattr->pre_change_attr) {
820 nfsi->change_attr = fattr->change_attr; 868 nfsi->change_attr = fattr->change_attr;
821 if (S_ISDIR(inode->i_mode)) 869 if (S_ISDIR(inode->i_mode))
822 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 870 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
823 } 871 }
824 /* If we have atomic WCC data, we may update some attributes */ 872 /* If we have atomic WCC data, we may update some attributes */
825 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 873 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
826 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 874 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
875 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
827 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 876 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
828 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 877
878 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
879 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
880 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
829 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 881 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
830 if (S_ISDIR(inode->i_mode)) 882 if (S_ISDIR(inode->i_mode))
831 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 883 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
832 }
833 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
834 nfsi->npages == 0)
835 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
836 } 884 }
885 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
886 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
887 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
888 && nfsi->npages == 0)
889 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837} 890}
838 891
839/** 892/**
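The strengthened nfs_wcc_update_inode() above only trusts a post-op attribute when its pre-op twin is both present (its validity bit is set) and equal to the cached value, i.e. no third party raced the operation. A toy model of that check, with invented flag values:

#include <stdint.h>
#include <stdbool.h>

#define F_PRESIZE (1u << 0)
#define F_SIZE    (1u << 1)

struct wcc { uint32_t valid; uint64_t pre_size, size; };

static bool wcc_size_update(uint64_t *cached, const struct wcc *w)
{
	if ((w->valid & F_PRESIZE) && (w->valid & F_SIZE)
	    && *cached == w->pre_size) {
		*cached = w->size;	/* no writer raced us: safe to apply */
		return true;
	}
	return false;			/* partial or stale data: leave the cache */
}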
@@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
853 906
854 907
855 /* Has the inode gone and changed behind our back? */ 908 /* Has the inode gone and changed behind our back? */
856 if (nfsi->fileid != fattr->fileid 909 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
857 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 910 return -EIO;
911 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
858 return -EIO; 912 return -EIO;
859 }
860 913
861 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 914 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
862 nfsi->change_attr != fattr->change_attr) 915 nfsi->change_attr != fattr->change_attr)
863 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 916 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
864 917
865 /* Verify a few of the more important attributes */ 918 /* Verify a few of the more important attributes */
866 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 919 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
867 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 920 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
868 921
869 cur_size = i_size_read(inode); 922 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
870 new_isize = nfs_size_to_loff_t(fattr->size); 923 cur_size = i_size_read(inode);
871 if (cur_size != new_isize && nfsi->npages == 0) 924 new_isize = nfs_size_to_loff_t(fattr->size);
872 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 925 if (cur_size != new_isize && nfsi->npages == 0)
926 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
927 }
873 928
874 /* Have any file permissions changed? */ 929 /* Have any file permissions changed? */
875 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 930 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
876 || inode->i_uid != fattr->uid 931 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
877 || inode->i_gid != fattr->gid) 932 if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
933 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
934 if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
878 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 935 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
879 936
880 /* Has the link count changed? */ 937 /* Has the link count changed? */
881 if (inode->i_nlink != fattr->nlink) 938 if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
882 invalid |= NFS_INO_INVALID_ATTR; 939 invalid |= NFS_INO_INVALID_ATTR;
883 940
884 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 941 if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
885 invalid |= NFS_INO_INVALID_ATIME; 942 invalid |= NFS_INO_INVALID_ATIME;
886 943
887 if (invalid != 0) 944 if (invalid != 0)
@@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
893 950
894static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 951static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
895{ 952{
953 if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
954 return 0;
896 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; 955 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
897} 956}
898 957
899static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 958static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
900{ 959{
960 if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
961 return 0;
901 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); 962 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
902} 963}
903 964
@@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
975 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
976 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
977 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
978 return status; 1040 return status;
979} 1041}
980 1042
@@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1033 /* Don't do a WCC update if these attributes are already stale */ 1095 /* Don't do a WCC update if these attributes are already stale */
1034 if ((fattr->valid & NFS_ATTR_FATTR) == 0 || 1096 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1035 !nfs_inode_attrs_need_update(inode, fattr)) { 1097 !nfs_inode_attrs_need_update(inode, fattr)) {
1036 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); 1098 fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
1099 | NFS_ATTR_FATTR_PRESIZE
1100 | NFS_ATTR_FATTR_PREMTIME
1101 | NFS_ATTR_FATTR_PRECTIME);
1037 goto out_noforce; 1102 goto out_noforce;
1038 } 1103 }
1039 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1104 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
1040 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1105 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
1041 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1106 fattr->pre_change_attr = NFS_I(inode)->change_attr;
1042 fattr->valid |= NFS_ATTR_WCC_V4; 1107 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
1043 } 1108 }
1044 if ((fattr->valid & NFS_ATTR_FATTR) != 0 && 1109 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
1045 (fattr->valid & NFS_ATTR_WCC) == 0) { 1110 (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
1046 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1111 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
1112 fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
1113 }
1114 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
1115 (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
1047 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1116 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
1117 fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
1118 }
1119 if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
1120 (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
1048 fattr->pre_size = i_size_read(inode); 1121 fattr->pre_size = i_size_read(inode);
1049 fattr->valid |= NFS_ATTR_WCC; 1122 fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
1050 } 1123 }
1051out_noforce: 1124out_noforce:
1052 status = nfs_post_op_update_inode_locked(inode, fattr); 1125 status = nfs_post_op_update_inode_locked(inode, fattr);
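nfs_post_op_update_inode_force_wcc() now synthesizes each missing pre-op attribute individually from the cached inode, rather than keying off the coarse NFS_ATTR_WCC/NFS_ATTR_WCC_V4 bits. A companion sketch for the size case, with illustrative flags:

#include <stdint.h>

#define F_SIZE    (1u << 0)
#define F_PRESIZE (1u << 1)

struct wcc_fattr { uint32_t valid; uint64_t size, pre_size; };

static void force_pre_size(struct wcc_fattr *f, uint64_t cached_size)
{
	if ((f->valid & F_SIZE) && !(f->valid & F_PRESIZE)) {
		f->pre_size = cached_size;	/* trust our cache as the "before" */
		f->valid |= F_PRESIZE;		/* so the WCC comparison can run */
	}
}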
@@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1078 __func__, inode->i_sb->s_id, inode->i_ino, 1151 __func__, inode->i_sb->s_id, inode->i_ino,
1079 atomic_read(&inode->i_count), fattr->valid); 1152 atomic_read(&inode->i_count), fattr->valid);
1080 1153
1081 if (nfsi->fileid != fattr->fileid) 1154 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
1082 goto out_fileid; 1155 goto out_fileid;
1083 1156
1084 /* 1157 /*
1085 * Make sure the inode's type hasn't changed. 1158 * Make sure the inode's type hasn't changed.
1086 */ 1159 */
1087 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1160 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1088 goto out_changed; 1161 goto out_changed;
1089 1162
1090 server = NFS_SERVER(inode); 1163 server = NFS_SERVER(inode);
1091 /* Update the fsid? */ 1164 /* Update the fsid? */
1092 if (S_ISDIR(inode->i_mode) && 1165 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1093 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1166 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1094 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1167 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
1095 server->fsid = fattr->fsid; 1168 server->fsid = fattr->fsid;
@@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1099 */ 1172 */
1100 nfsi->read_cache_jiffies = fattr->time_start; 1173 nfsi->read_cache_jiffies = fattr->time_start;
1101 1174
1102 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME 1175 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
1103 | NFS_INO_REVAL_PAGECACHE); 1176 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1177 | NFS_INO_INVALID_ATIME
1178 | NFS_INO_REVAL_PAGECACHE);
1104 1179
1105 /* Do atomic weak cache consistency updates */ 1180 /* Do atomic weak cache consistency updates */
1106 nfs_wcc_update_inode(inode, fattr); 1181 nfs_wcc_update_inode(inode, fattr);
1107 1182
1108 /* More cache consistency checks */ 1183 /* More cache consistency checks */
1109 if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { 1184 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1185 if (nfsi->change_attr != fattr->change_attr) {
1186 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1187 inode->i_sb->s_id, inode->i_ino);
1188 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1189 if (S_ISDIR(inode->i_mode))
1190 nfs_force_lookup_revalidate(inode);
1191 nfsi->change_attr = fattr->change_attr;
1192 }
1193 }
1194
1195 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1110 /* NFSv2/v3: Check if the mtime agrees */ 1196 /* NFSv2/v3: Check if the mtime agrees */
1111 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 1197 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1112 dprintk("NFS: mtime change on server for file %s/%ld\n", 1198 dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1114 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1200 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1115 if (S_ISDIR(inode->i_mode)) 1201 if (S_ISDIR(inode->i_mode))
1116 nfs_force_lookup_revalidate(inode); 1202 nfs_force_lookup_revalidate(inode);
1203 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1117 } 1204 }
1205 }
1206 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1118 /* If ctime has changed we should definitely clear access+acl caches */ 1207 /* If ctime has changed we should definitely clear access+acl caches */
1119 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1208 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
1120 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1209 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1121 } else if (nfsi->change_attr != fattr->change_attr) { 1210 /* and probably clear data for a directory too as utimes can cause
1122 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1211 * havoc with our cache.
1123 inode->i_sb->s_id, inode->i_ino); 1212 */
1124 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1213 if (S_ISDIR(inode->i_mode)) {
1125 if (S_ISDIR(inode->i_mode)) 1214 invalid |= NFS_INO_INVALID_DATA;
1126 nfs_force_lookup_revalidate(inode); 1215 nfs_force_lookup_revalidate(inode);
1216 }
1217 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1218 }
1127 } 1219 }
1128 1220
1129 /* Check if our cached file size is stale */ 1221 /* Check if our cached file size is stale */
1130 new_isize = nfs_size_to_loff_t(fattr->size); 1222 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
1131 cur_isize = i_size_read(inode); 1223 new_isize = nfs_size_to_loff_t(fattr->size);
1132 if (new_isize != cur_isize) { 1224 cur_isize = i_size_read(inode);
1133 /* Do we perhaps have any outstanding writes, or has 1225 if (new_isize != cur_isize) {
1134 * the file grown beyond our last write? */ 1226 /* Do we perhaps have any outstanding writes, or has
1135 if (nfsi->npages == 0 || new_isize > cur_isize) { 1227 * the file grown beyond our last write? */
1136 i_size_write(inode, new_isize); 1228 if (nfsi->npages == 0 || new_isize > cur_isize) {
1137 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1229 i_size_write(inode, new_isize);
1230 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1231 }
1232 dprintk("NFS: isize change on server for file %s/%ld\n",
1233 inode->i_sb->s_id, inode->i_ino);
1138 } 1234 }
1139 dprintk("NFS: isize change on server for file %s/%ld\n",
1140 inode->i_sb->s_id, inode->i_ino);
1141 } 1235 }
1142 1236
1143 1237
1144 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1238 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1145 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1239 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1146 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1147 nfsi->change_attr = fattr->change_attr;
1148
1149 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
1150 inode->i_uid != fattr->uid ||
1151 inode->i_gid != fattr->gid)
1152 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1153 1240
1154 if (inode->i_nlink != fattr->nlink) 1241 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1155 invalid |= NFS_INO_INVALID_ATTR; 1242 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1243 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1244 inode->i_mode = fattr->mode;
1245 }
1246 }
1247 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1248 if (inode->i_uid != fattr->uid) {
1249 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1250 inode->i_uid = fattr->uid;
1251 }
1252 }
1253 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1254 if (inode->i_gid != fattr->gid) {
1255 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1256 inode->i_gid = fattr->gid;
1257 }
1258 }
1156 1259
1157 inode->i_mode = fattr->mode; 1260 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1158 inode->i_nlink = fattr->nlink; 1261 if (inode->i_nlink != fattr->nlink) {
1159 inode->i_uid = fattr->uid; 1262 invalid |= NFS_INO_INVALID_ATTR;
1160 inode->i_gid = fattr->gid; 1263 if (S_ISDIR(inode->i_mode))
1264 invalid |= NFS_INO_INVALID_DATA;
1265 inode->i_nlink = fattr->nlink;
1266 }
1267 }
1161 1268
1162 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 1269 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1163 /* 1270 /*
1164 * report the blocks in 512byte units 1271 * report the blocks in 512byte units
1165 */ 1272 */
1166 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 1273 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
1167 } else {
1168 inode->i_blocks = fattr->du.nfs2.blocks;
1169 } 1274 }
1275 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
1276 inode->i_blocks = fattr->du.nfs2.blocks;
1170 1277
1171 /* Update attrtimeo value if we're out of the unstable period */ 1278 /* Update attrtimeo value if we're out of the unstable period */
1172 if (invalid & NFS_INO_INVALID_ATTR) { 1279 if (invalid & NFS_INO_INVALID_ATTR) {
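Throughout the rewritten nfs_update_inode() above, each per-attribute comparison merely ORs flags into a single 'invalid' mask, which is acted on once after all the checks. A tiny illustration of that accumulate-then-apply shape, with invented flag names:

#include <stdint.h>

#define INV_ATTR (1u << 0)
#define INV_DATA (1u << 1)

static uint32_t check_size(uint64_t cached, uint64_t server, int dirty_pages)
{
	uint32_t invalid = 0;

	if (cached != server && !dirty_pages)
		invalid |= INV_ATTR | INV_DATA;	/* record it, act later */
	return invalid;				/* caller ORs all such results */
}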
@@ -1274,7 +1381,6 @@ static void init_once(void *foo)
1274 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1381 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1275 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1382 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1276 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1383 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1277 nfsi->ncommit = 0;
1278 nfsi->npages = 0; 1384 nfsi->npages = 0;
1279 atomic_set(&nfsi->silly_count, 1); 1385 atomic_set(&nfsi->silly_count, 1);
1280 INIT_HLIST_HEAD(&nfsi->silly_list); 1386 INIT_HLIST_HEAD(&nfsi->silly_list);
@@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void)
1337{ 1443{
1338 int err; 1444 int err;
1339 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1340 err = nfsiod_start(); 1450 err = nfsiod_start();
1341 if (err) 1451 if (err)
1342 goto out6; 1452 goto out6;
@@ -1389,6 +1499,8 @@ out4:
1389out5: 1499out5:
1390 nfsiod_stop(); 1500 nfsiod_stop();
1391out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1392 return err; 1504 return err;
1393} 1505}
1394 1506
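The init_nfs_fs()/exit_nfs_fs() hunks register fscache first and unregister it last, extending the existing goto-unwind ladder: each failure label undoes exactly the stages that succeeded, in reverse order. A minimal illustration of the idiom, with made-up stage names:

#include <stdio.h>

static int stage_a(void) { return 0; }
static int stage_b(void) { return -1; }		/* pretend this one fails */
static void undo_a(void) { puts("undo a"); }

static int init_sketch(void)
{
	int err;

	err = stage_a();
	if (err)
		goto out_none;
	err = stage_b();
	if (err)
		goto out_a;	/* unwind strictly in reverse order */
	return 0;
out_a:
	undo_a();
out_none:
	return err;
}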
@@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1399 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1400 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1401 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1402#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1403 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1404#endif 1517#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
@@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
152extern struct rpc_procinfo nfs4_procedures[]; 156extern struct rpc_procinfo nfs4_procedures[];
153#endif 157#endif
154 158
159/* proc.c */
160void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
161
155/* dir.c */ 162/* dir.c */
156extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 163extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
157 164
@@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *);
165extern void nfs4_clear_inode(struct inode *); 172extern void nfs4_clear_inode(struct inode *);
166#endif 173#endif
167void nfs_zap_acl_cache(struct inode *inode); 174void nfs_zap_acl_cache(struct inode *inode);
175extern int nfs_wait_bit_killable(void *word);
168 176
169/* super.c */ 177/* super.c */
170void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); 178void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
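nfs_add_fscache_stats() above follows the existing iostat pattern: pin the task to a CPU with get_cpu(), bump that CPU's private counter, then release with put_cpu_no_resched(), so hot counters never bounce a shared cache line between CPUs. A user-space analogue using C11 thread-local storage; illustrative only, not the kernel's per-CPU API:

static _Thread_local unsigned long fscache_hits;	/* one slot per thread */

static inline void add_fscache_stat(unsigned long addend)
{
	fscache_hits += addend;	/* private storage: no atomics, no contention */
}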
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
120static __be32 * 120static __be32 *
121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev, type;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 type = ntohl(*p++);
125 fattr->mode = ntohl(*p++); 125 fattr->mode = ntohl(*p++);
126 fattr->nlink = ntohl(*p++); 126 fattr->nlink = ntohl(*p++);
127 fattr->uid = ntohl(*p++); 127 fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
138 p = xdr_decode_time(p, &fattr->ctime); 138 p = xdr_decode_time(p, &fattr->ctime);
139 fattr->valid |= NFS_ATTR_FATTR; 139 fattr->valid |= NFS_ATTR_FATTR_V2;
140 fattr->rdev = new_decode_dev(rdev); 140 fattr->rdev = new_decode_dev(rdev);
141 if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { 141 if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
142 fattr->type = NFFIFO;
143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 142 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
144 fattr->rdev = 0; 143 fattr->rdev = 0;
145 } 144 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
328 data->arg.create.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
329 } 329 }
330 330
331 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current_umask();
332 332
333 for (;;) { 333 for (;;) {
334 status = nfs3_do_create(dir, dentry, data); 334 status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
528 528
529 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
530 530
531 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current_umask();
532 532
533 data = nfs3_alloc_createdata(); 533 data = nfs3_alloc_createdata();
534 if (data == NULL) 534 if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
640 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
641 641
642 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current_umask();
643 643
644 data = nfs3_alloc_createdata(); 644 data = nfs3_alloc_createdata();
645 if (data == NULL) 645 if (data == NULL)
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
834 .commit_done = nfs3_commit_done, 834 .commit_done = nfs3_commit_done,
835 .lock = nfs3_proc_lock, 835 .lock = nfs3_proc_lock,
836 .clear_acl_cache = nfs3_forget_cached_acls, 836 .clear_acl_cache = nfs3_forget_cached_acls,
837 .close_context = nfs_close_context,
837}; 838};
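The three nfs3proc.c hunks above replace direct current->fs->umask dereferences with the current_umask() helper; hiding the field behind an accessor keeps every caller stable if the backing structure later changes. Rough shape of such a wrapper, where the stand-in global replaces the kernel's 'current' and is assumed to be set up elsewhere:

struct fs_struct_sketch { int umask; };
static struct fs_struct_sketch *current_fs;	/* assumed initialized at setup */

static inline int current_umask_sketch(void)
{
	return current_fs->umask;	/* one place to change if fs moves */
}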
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde46..35869a4921f1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
91/* 91/*
92 * Map file type to S_IFMT bits 92 * Map file type to S_IFMT bits
93 */ 93 */
94static struct { 94static const umode_t nfs_type2fmt[] = {
95 unsigned int mode; 95 [NF3BAD] = 0,
96 unsigned int nfs2type; 96 [NF3REG] = S_IFREG,
97} nfs_type2fmt[] = { 97 [NF3DIR] = S_IFDIR,
98 { 0, NFNON }, 98 [NF3BLK] = S_IFBLK,
99 { S_IFREG, NFREG }, 99 [NF3CHR] = S_IFCHR,
100 { S_IFDIR, NFDIR }, 100 [NF3LNK] = S_IFLNK,
101 { S_IFBLK, NFBLK }, 101 [NF3SOCK] = S_IFSOCK,
102 { S_IFCHR, NFCHR }, 102 [NF3FIFO] = S_IFIFO,
103 { S_IFLNK, NFLNK },
104 { S_IFSOCK, NFSOCK },
105 { S_IFIFO, NFFIFO },
106 { 0, NFBAD }
107}; 103};
108 104
109/* 105/*
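The nfs_type2fmt rewrite above drops the parallel {mode, nfs2type} struct table for a plain lookup array built with C99 designated initializers, keeping each slot visibly paired with its value and letting the wire type index the array directly. An illustrative re-creation; the enum ordering mirrors the NFSv3 wire values but should be treated as a sketch:

#include <sys/stat.h>

enum ftype3 { NF3NON, NF3REG, NF3DIR, NF3BLK, NF3CHR, NF3LNK,
	      NF3SOCK, NF3FIFO, NF3BAD };

static const unsigned int type2fmt[] = {
	[NF3BAD]  = 0,			/* unknown types get no S_IFMT bits */
	[NF3REG]  = S_IFREG,
	[NF3DIR]  = S_IFDIR,
	[NF3BLK]  = S_IFBLK,
	[NF3CHR]  = S_IFCHR,
	[NF3LNK]  = S_IFLNK,
	[NF3SOCK] = S_IFSOCK,
	[NF3FIFO] = S_IFIFO,
};
/* usage: mode = (wire_mode & ~S_IFMT) | type2fmt[type]; */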
@@ -148,13 +144,12 @@ static __be32 *
148xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
149{ 145{
150 unsigned int type, major, minor; 146 unsigned int type, major, minor;
151 int fmode; 147 umode_t fmode;
152 148
153 type = ntohl(*p++); 149 type = ntohl(*p++);
154 if (type >= NF3BAD) 150 if (type > NF3FIFO)
155 type = NF3BAD; 151 type = NF3NON;
156 fmode = nfs_type2fmt[type].mode; 152 fmode = nfs_type2fmt[type];
157 fattr->type = nfs_type2fmt[type].nfs2type;
158 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 153 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
159 fattr->nlink = ntohl(*p++); 154 fattr->nlink = ntohl(*p++);
160 fattr->uid = ntohl(*p++); 155 fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
177 p = xdr_decode_time3(p, &fattr->ctime); 172 p = xdr_decode_time3(p, &fattr->ctime);
178 173
179 /* Update the mode bits */ 174 /* Update the mode bits */
180 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); 175 fattr->valid |= NFS_ATTR_FATTR_V3;
181 return p; 176 return p;
182} 177}
183 178
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
233 p = xdr_decode_hyper(p, &fattr->pre_size); 228 p = xdr_decode_hyper(p, &fattr->pre_size);
234 p = xdr_decode_time3(p, &fattr->pre_mtime); 229 p = xdr_decode_time3(p, &fattr->pre_mtime);
235 p = xdr_decode_time3(p, &fattr->pre_ctime); 230 p = xdr_decode_time3(p, &fattr->pre_ctime);
236 fattr->valid |= NFS_ATTR_WCC; 231 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
232 | NFS_ATTR_FATTR_PREMTIME
233 | NFS_ATTR_FATTR_PRECTIME;
237 return p; 234 return p;
238} 235}
239 236
@@ -716,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
716 if (args->npages != 0) 713 if (args->npages != 0)
717 xdr_encode_pages(buf, args->pages, 0, args->len); 714 xdr_encode_pages(buf, args->pages, 0, args->len);
718 else 715 else
719 req->rq_slen += args->len; 716 req->rq_slen = xdr_adjust_iovec(req->rq_svec,
717 p + XDR_QUADLEN(args->len));
720 718
721 err = nfsacl_encode(buf, base, args->inode, 719 err = nfsacl_encode(buf, base, args->inode,
722 (args->mask & NFS_ACL) ? 720 (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..4674f8092da8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
193 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
194} 194}
195 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp) 196static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{ 197{
206 int res; 198 int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
208 might_sleep(); 200 might_sleep();
209 201
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 202 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE); 203 nfs_wait_bit_killable, TASK_KILLABLE);
212 return res; 204 return res;
213} 205}
214 206
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1439 if (calldata->arg.seqid == NULL) 1431 if (calldata->arg.seqid == NULL)
1440 goto out_free_calldata; 1432 goto out_free_calldata;
1441 calldata->arg.fmode = 0; 1433 calldata->arg.fmode = 0;
1442 calldata->arg.bitmask = server->attr_bitmask; 1434 calldata->arg.bitmask = server->cache_consistency_bitmask;
1443 calldata->res.fattr = &calldata->fattr; 1435 calldata->res.fattr = &calldata->fattr;
1444 calldata->res.seqid = calldata->arg.seqid; 1436 calldata->res.seqid = calldata->arg.seqid;
1445 calldata->res.server = server; 1437 calldata->res.server = server;
@@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1509 attr.ia_mode = nd->intent.open.create_mode; 1501 attr.ia_mode = nd->intent.open.create_mode;
1510 attr.ia_valid = ATTR_MODE; 1502 attr.ia_valid = ATTR_MODE;
1511 if (!IS_POSIXACL(dir)) 1503 if (!IS_POSIXACL(dir))
1512 attr.ia_mode &= ~current->fs->umask; 1504 attr.ia_mode &= ~current_umask();
1513 } else { 1505 } else {
1514 attr.ia_valid = 0; 1506 attr.ia_valid = 0;
1515 BUG_ON(nd->intent.open.flags & O_CREAT); 1507 BUG_ON(nd->intent.open.flags & O_CREAT);
@@ -1580,6 +1572,15 @@ out_drop:
1580 return 0; 1572 return 0;
1581} 1573}
1582 1574
1575void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1576{
1577 if (ctx->state == NULL)
1578 return;
1579 if (is_sync)
1580 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
1581 else
1582 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
1583}
1583 1584
1584static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1585{ 1586{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1600 server->caps |= NFS_CAP_HARDLINKS; 1601 server->caps |= NFS_CAP_HARDLINKS;
1601 if (res.has_symlinks != 0) 1602 if (res.has_symlinks != 0)
1602 server->caps |= NFS_CAP_SYMLINKS; 1603 server->caps |= NFS_CAP_SYMLINKS;
1604 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
1605 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1603 server->acl_bitmask = res.acl_bitmask; 1607 server->acl_bitmask = res.acl_bitmask;
1604 } 1608 }
1605 return status; 1609 return status;
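_nfs4_server_capabilities() above derives cache_consistency_bitmask by masking the server's advertised attribute set down to the four attributes needed to validate caches after CLOSE/WRITE/COMMIT/REMOVE (change, size, ctime, mtime), shrinking the post-op GETATTR payload. A sketch of that derivation; the bit values are placeholders, not the FATTR4_* wire constants:

#include <stdint.h>

#define W0_CHANGE (1u << 0)
#define W0_SIZE   (1u << 1)
#define W1_CTIME  (1u << 0)
#define W1_MTIME  (1u << 1)

static void build_cc_bitmask(uint32_t dst[2], const uint32_t supported[2])
{
	/* keep only what the server supports AND cache validation needs */
	dst[0] = supported[0] & (W0_CHANGE | W0_SIZE);
	dst[1] = supported[1] & (W1_CTIME | W1_MTIME);
}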
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2079 struct nfs_removeargs *args = msg->rpc_argp; 2083 struct nfs_removeargs *args = msg->rpc_argp;
2080 struct nfs_removeres *res = msg->rpc_resp; 2084 struct nfs_removeres *res = msg->rpc_resp;
2081 2085
2082 args->bitmask = server->attr_bitmask; 2086 args->bitmask = server->cache_consistency_bitmask;
2083 res->server = server; 2087 res->server = server;
2084 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2085} 2089}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2323 .pages = &page, 2327 .pages = &page,
2324 .pgbase = 0, 2328 .pgbase = 0,
2325 .count = count, 2329 .count = count,
2326 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2330 .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
2327 }; 2331 };
2328 struct nfs4_readdir_res res; 2332 struct nfs4_readdir_res res;
2329 struct rpc_message msg = { 2333 struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
2552{ 2556{
2553 struct nfs_server *server = NFS_SERVER(data->inode); 2557 struct nfs_server *server = NFS_SERVER(data->inode);
2554 2558
2555 data->args.bitmask = server->attr_bitmask; 2559 data->args.bitmask = server->cache_consistency_bitmask;
2556 data->res.server = server; 2560 data->res.server = server;
2557 data->timestamp = jiffies; 2561 data->timestamp = jiffies;
2558 2562
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
2575{ 2579{
2576 struct nfs_server *server = NFS_SERVER(data->inode); 2580 struct nfs_server *server = NFS_SERVER(data->inode);
2577 2581
2578 data->args.bitmask = server->attr_bitmask; 2582 data->args.bitmask = server->cache_consistency_bitmask;
2579 data->res.server = server; 2583 data->res.server = server;
2580 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 2584 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
2581} 2585}
@@ -2590,12 +2594,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
2590 unsigned long timestamp = (unsigned long)data; 2594 unsigned long timestamp = (unsigned long)data;
2591 2595
2592 if (task->tk_status < 0) { 2596 if (task->tk_status < 0) {
2593 switch (task->tk_status) { 2597 /* Unless we're shutting down, schedule state recovery! */
2594 case -NFS4ERR_STALE_CLIENTID: 2598 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
2595 case -NFS4ERR_EXPIRED: 2599 nfs4_schedule_state_recovery(clp);
2596 case -NFS4ERR_CB_PATH_DOWN:
2597 nfs4_schedule_state_recovery(clp);
2598 }
2599 return; 2600 return;
2600 } 2601 }
2601 spin_lock(&clp->cl_lock); 2602 spin_lock(&clp->cl_lock);
@@ -3678,6 +3679,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3678 return len; 3679 return len;
3679} 3680}
3680 3681
3682static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
3683{
3684 if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
3685 (fattr->valid & NFS_ATTR_FATTR_FSID) &&
3686 (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
3687 return;
3688
3689 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
3690 NFS_ATTR_FATTR_NLINK;
3691 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
3692 fattr->nlink = 2;
3693}
3694
3681int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 3695int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3682 struct nfs4_fs_locations *fs_locations, struct page *page) 3696 struct nfs4_fs_locations *fs_locations, struct page *page)
3683{ 3697{
@@ -3704,6 +3718,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3704 fs_locations->server = server; 3718 fs_locations->server = server;
3705 fs_locations->nlocations = 0; 3719 fs_locations->nlocations = 0;
3706 status = rpc_call_sync(server->client, &msg, 0); 3720 status = rpc_call_sync(server->client, &msg, 0);
3721 nfs_fixup_referral_attributes(&fs_locations->fattr);
3707 dprintk("%s: returned status = %d\n", __func__, status); 3722 dprintk("%s: returned status = %d\n", __func__, status);
3708 return status; 3723 return status;
3709} 3724}
@@ -3767,6 +3782,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3767 .commit_done = nfs4_commit_done, 3782 .commit_done = nfs4_commit_done,
3768 .lock = nfs4_proc_lock, 3783 .lock = nfs4_proc_lock,
3769 .clear_acl_cache = nfs4_zap_acl_attr, 3784 .clear_acl_cache = nfs4_zap_acl_attr,
3785 .close_context = nfs4_close_context,
3770}; 3786};
3771 3787
3772/* 3788/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, 65 unsigned short port;
66 nfs_callback_tcpport, cred); 66 int status;
67
68 port = nfs_callback_tcpport;
69 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6;
71
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
67 if (status == 0) 73 if (status == 0)
68 status = nfs4_proc_setclientid_confirm(clp, cred); 74 status = nfs4_proc_setclientid_confirm(clp, cred);
69 if (status == 0) 75 if (status == 0)
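The nfs4_init_client() change above advertises the IPv6 callback port in SETCLIENTID when the transport to the server is AF_INET6, since an IPv4 callback address would be unreachable there. A trivial selection sketch:

#include <sys/socket.h>

static unsigned short callback_port(sa_family_t family,
				    unsigned short v4_port,
				    unsigned short v6_port)
{
	return family == AF_INET6 ? v6_port : v4_port;
}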
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
522 decode_lookup_maxsz + \ 522 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 523 decode_fs_locations_maxsz)
524 524
525static struct { 525static const umode_t nfs_type2fmt[] = {
526 unsigned int mode; 526 [NF4BAD] = 0,
527 unsigned int nfs2type; 527 [NF4REG] = S_IFREG,
528} nfs_type2fmt[] = { 528 [NF4DIR] = S_IFDIR,
529 { 0, NFNON }, 529 [NF4BLK] = S_IFBLK,
530 { S_IFREG, NFREG }, 530 [NF4CHR] = S_IFCHR,
531 { S_IFDIR, NFDIR }, 531 [NF4LNK] = S_IFLNK,
532 { S_IFBLK, NFBLK }, 532 [NF4SOCK] = S_IFSOCK,
533 { S_IFCHR, NFCHR }, 533 [NF4FIFO] = S_IFIFO,
534 { S_IFLNK, NFLNK }, 534 [NF4ATTRDIR] = 0,
535 { S_IFSOCK, NFSOCK }, 535 [NF4NAMEDATTR] = 0,
536 { S_IFIFO, NFFIFO },
537 { 0, NFNON },
538 { 0, NFNON },
539}; 536};
540 537
541struct compound_hdr { 538struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2160static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2157static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2161{ 2158{
2162 __be32 *p; 2159 __be32 *p;
2160 int ret = 0;
2163 2161
2164 *type = 0; 2162 *type = 0;
2165 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2163 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2172 return -EIO; 2170 return -EIO;
2173 } 2171 }
2174 bitmap[0] &= ~FATTR4_WORD0_TYPE; 2172 bitmap[0] &= ~FATTR4_WORD0_TYPE;
2173 ret = NFS_ATTR_FATTR_TYPE;
2175 } 2174 }
2176 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); 2175 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2177 return 0; 2176 return ret;
2178} 2177}
2179 2178
2180static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2179static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2181{ 2180{
2182 __be32 *p; 2181 __be32 *p;
2182 int ret = 0;
2183 2183
2184 *change = 0; 2184 *change = 0;
2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2188 READ_BUF(8); 2188 READ_BUF(8);
2189 READ64(*change); 2189 READ64(*change);
2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2191 ret = NFS_ATTR_FATTR_CHANGE;
2191 } 2192 }
2192 dprintk("%s: change attribute=%Lu\n", __func__, 2193 dprintk("%s: change attribute=%Lu\n", __func__,
2193 (unsigned long long)*change); 2194 (unsigned long long)*change);
2194 return 0; 2195 return ret;
2195} 2196}
2196 2197
2197static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2198static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2198{ 2199{
2199 __be32 *p; 2200 __be32 *p;
2201 int ret = 0;
2200 2202
2201 *size = 0; 2203 *size = 0;
2202 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2204 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2205 READ_BUF(8); 2207 READ_BUF(8);
2206 READ64(*size); 2208 READ64(*size);
2207 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2209 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2210 ret = NFS_ATTR_FATTR_SIZE;
2208 } 2211 }
2209 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2212 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2210 return 0; 2213 return ret;
2211} 2214}
2212 2215
2213static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2216static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2245static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2248static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2246{ 2249{
2247 __be32 *p; 2250 __be32 *p;
2251 int ret = 0;
2248 2252
2249 fsid->major = 0; 2253 fsid->major = 0;
2250 fsid->minor = 0; 2254 fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2255 READ64(fsid->major); 2259 READ64(fsid->major);
2256 READ64(fsid->minor); 2260 READ64(fsid->minor);
2257 bitmap[0] &= ~FATTR4_WORD0_FSID; 2261 bitmap[0] &= ~FATTR4_WORD0_FSID;
2262 ret = NFS_ATTR_FATTR_FSID;
2258 } 2263 }
2259 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, 2264 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
2260 (unsigned long long)fsid->major, 2265 (unsigned long long)fsid->major,
2261 (unsigned long long)fsid->minor); 2266 (unsigned long long)fsid->minor);
2262 return 0; 2267 return ret;
2263} 2268}
2264 2269
2265static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2270static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2297static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2302static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2298{ 2303{
2299 __be32 *p; 2304 __be32 *p;
2305 int ret = 0;
2300 2306
2301 *fileid = 0; 2307 *fileid = 0;
2302 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2308 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2305 READ_BUF(8); 2311 READ_BUF(8);
2306 READ64(*fileid); 2312 READ64(*fileid);
2307 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2313 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2314 ret = NFS_ATTR_FATTR_FILEID;
2308 } 2315 }
2309 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2316 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2310 return 0; 2317 return ret;
2311} 2318}
2312 2319
2313static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2320static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2314{ 2321{
2315 __be32 *p; 2322 __be32 *p;
2323 int ret = 0;
2316 2324
2317 *fileid = 0; 2325 *fileid = 0;
2318 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2326 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2321 READ_BUF(8); 2329 READ_BUF(8);
2322 READ64(*fileid); 2330 READ64(*fileid);
2323 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2331 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2332 ret = NFS_ATTR_FATTR_FILEID;
2324 } 2333 }
2325 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2334 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2326 return 0; 2335 return ret;
2327} 2336}
2328 2337
2329static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2338static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2479 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) 2488 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2480 res->nlocations++; 2489 res->nlocations++;
2481 } 2490 }
2491 if (res->nlocations != 0)
2492 status = NFS_ATTR_FATTR_V4_REFERRAL;
2482out: 2493out:
2483 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2494 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2484 return status; 2495 return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
2580 return status; 2591 return status;
2581} 2592}
2582 2593
2583static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) 2594static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
2584{ 2595{
2596 uint32_t tmp;
2585 __be32 *p; 2597 __be32 *p;
2598 int ret = 0;
2586 2599
2587 *mode = 0; 2600 *mode = 0;
2588 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 2601 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
2589 return -EIO; 2602 return -EIO;
2590 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 2603 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
2591 READ_BUF(4); 2604 READ_BUF(4);
2592 READ32(*mode); 2605 READ32(tmp);
2593 *mode &= ~S_IFMT; 2606 *mode = tmp & ~S_IFMT;
2594 bitmap[1] &= ~FATTR4_WORD1_MODE; 2607 bitmap[1] &= ~FATTR4_WORD1_MODE;
2608 ret = NFS_ATTR_FATTR_MODE;
2595 } 2609 }
2596 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 2610 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
2597 return 0; 2611 return ret;
2598} 2612}
2599 2613
2600static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 2614static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
2601{ 2615{
2602 __be32 *p; 2616 __be32 *p;
2617 int ret = 0;
2603 2618
2604 *nlink = 1; 2619 *nlink = 1;
2605 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 2620 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2608 READ_BUF(4); 2623 READ_BUF(4);
2609 READ32(*nlink); 2624 READ32(*nlink);
2610 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 2625 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
2626 ret = NFS_ATTR_FATTR_NLINK;
2611 } 2627 }
2612 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 2628 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
2613 return 0; 2629 return ret;
2614} 2630}
2615 2631
2616static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 2632static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
2617{ 2633{
2618 uint32_t len; 2634 uint32_t len;
2619 __be32 *p; 2635 __be32 *p;
2636 int ret = 0;
2620 2637
2621 *uid = -2; 2638 *uid = -2;
2622 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 2639 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2626 READ32(len); 2643 READ32(len);
2627 READ_BUF(len); 2644 READ_BUF(len);
2628 if (len < XDR_MAX_NETOBJ) { 2645 if (len < XDR_MAX_NETOBJ) {
2629 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) 2646 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
2647 ret = NFS_ATTR_FATTR_OWNER;
2648 else
2630 dprintk("%s: nfs_map_name_to_uid failed!\n", 2649 dprintk("%s: nfs_map_name_to_uid failed!\n",
2631 __func__); 2650 __func__);
2632 } else 2651 } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2635 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2654 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2636 } 2655 }
2637 dprintk("%s: uid=%d\n", __func__, (int)*uid); 2656 dprintk("%s: uid=%d\n", __func__, (int)*uid);
2638 return 0; 2657 return ret;
2639} 2658}
2640 2659
2641static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 2660static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
2642{ 2661{
2643 uint32_t len; 2662 uint32_t len;
2644 __be32 *p; 2663 __be32 *p;
2664 int ret = 0;
2645 2665
2646 *gid = -2; 2666 *gid = -2;
2647 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 2667 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2651 READ32(len); 2671 READ32(len);
2652 READ_BUF(len); 2672 READ_BUF(len);
2653 if (len < XDR_MAX_NETOBJ) { 2673 if (len < XDR_MAX_NETOBJ) {
2654 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) 2674 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
2675 ret = NFS_ATTR_FATTR_GROUP;
2676 else
2655 dprintk("%s: nfs_map_group_to_gid failed!\n", 2677 dprintk("%s: nfs_map_group_to_gid failed!\n",
2656 __func__); 2678 __func__);
2657 } else 2679 } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2682 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2661 } 2683 }
2662 dprintk("%s: gid=%d\n", __func__, (int)*gid); 2684 dprintk("%s: gid=%d\n", __func__, (int)*gid);
2663 return 0; 2685 return ret;
2664} 2686}
2665 2687
2666static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 2688static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
2667{ 2689{
2668 uint32_t major = 0, minor = 0; 2690 uint32_t major = 0, minor = 0;
2669 __be32 *p; 2691 __be32 *p;
2692 int ret = 0;
2670 2693
2671 *rdev = MKDEV(0,0); 2694 *rdev = MKDEV(0,0);
2672 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) 2695 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
2681 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 2704 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
2682 *rdev = tmp; 2705 *rdev = tmp;
2683 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; 2706 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
2707 ret = NFS_ATTR_FATTR_RDEV;
2684 } 2708 }
2685 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 2709 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
2686 return 0; 2710 return ret;
2687} 2711}
2688 2712
2689static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2713static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2740static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 2764static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
2741{ 2765{
2742 __be32 *p; 2766 __be32 *p;
2767 int ret = 0;
2743 2768
2744 *used = 0; 2769 *used = 0;
2745 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 2770 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
2748 READ_BUF(8); 2773 READ_BUF(8);
2749 READ64(*used); 2774 READ64(*used);
2750 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 2775 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
2776 ret = NFS_ATTR_FATTR_SPACE_USED;
2751 } 2777 }
2752 dprintk("%s: space used=%Lu\n", __func__, 2778 dprintk("%s: space used=%Lu\n", __func__,
2753 (unsigned long long)*used); 2779 (unsigned long long)*used);
2754 return 0; 2780 return ret;
2755} 2781}
2756 2782
2757static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 2783static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
2778 return -EIO; 2804 return -EIO;
2779 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { 2805 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
2780 status = decode_attr_time(xdr, time); 2806 status = decode_attr_time(xdr, time);
2807 if (status == 0)
2808 status = NFS_ATTR_FATTR_ATIME;
2781 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; 2809 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
2782 } 2810 }
2783 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); 2811 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
2794 return -EIO; 2822 return -EIO;
2795 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { 2823 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
2796 status = decode_attr_time(xdr, time); 2824 status = decode_attr_time(xdr, time);
2825 if (status == 0)
2826 status = NFS_ATTR_FATTR_CTIME;
2797 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; 2827 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
2798 } 2828 }
2799 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); 2829 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
2810 return -EIO; 2840 return -EIO;
2811 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { 2841 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
2812 status = decode_attr_time(xdr, time); 2842 status = decode_attr_time(xdr, time);
2843 if (status == 0)
2844 status = NFS_ATTR_FATTR_MTIME;
2813 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; 2845 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
2814 } 2846 }
2815 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); 2847 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2994 uint32_t attrlen, 3026 uint32_t attrlen,
2995 bitmap[2] = {0}, 3027 bitmap[2] = {0},
2996 type; 3028 type;
2997 int status, fmode = 0; 3029 int status;
3030 umode_t fmode = 0;
2998 uint64_t fileid; 3031 uint64_t fileid;
2999 3032
3000 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3033 status = decode_op_hdr(xdr, OP_GETATTR);
3001 goto xdr_error; 3034 if (status < 0)
3002 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
3003 goto xdr_error; 3035 goto xdr_error;
3004 3036
3005 fattr->bitmap[0] = bitmap[0]; 3037 status = decode_attr_bitmap(xdr, bitmap);
3006 fattr->bitmap[1] = bitmap[1]; 3038 if (status < 0)
3039 goto xdr_error;
3007 3040
3008 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 3041 status = decode_attr_length(xdr, &attrlen, &savep);
3042 if (status < 0)
3009 goto xdr_error; 3043 goto xdr_error;
3010 3044
3011 3045
3012 if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) 3046 status = decode_attr_type(xdr, bitmap, &type);
3047 if (status < 0)
3013 goto xdr_error; 3048 goto xdr_error;
3014 fattr->type = nfs_type2fmt[type].nfs2type; 3049 fattr->mode = 0;
3015 fmode = nfs_type2fmt[type].mode; 3050 if (status != 0) {
3051 fattr->mode |= nfs_type2fmt[type];
3052 fattr->valid |= status;
3053 }
3016 3054
3017 if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) 3055 status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
3056 if (status < 0)
3018 goto xdr_error; 3057 goto xdr_error;
3019 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3058 fattr->valid |= status;
3059
3060 status = decode_attr_size(xdr, bitmap, &fattr->size);
3061 if (status < 0)
3020 goto xdr_error; 3062 goto xdr_error;
3021 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) 3063 fattr->valid |= status;
3064
3065 status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
3066 if (status < 0)
3022 goto xdr_error; 3067 goto xdr_error;
3023 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3068 fattr->valid |= status;
3069
3070 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3071 if (status < 0)
3024 goto xdr_error; 3072 goto xdr_error;
3025 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 3073 fattr->valid |= status;
3074
3075 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3026 struct nfs4_fs_locations, 3076 struct nfs4_fs_locations,
3027 fattr))) != 0) 3077 fattr));
3078 if (status < 0)
3028 goto xdr_error; 3079 goto xdr_error;
3029 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3080 fattr->valid |= status;
3081
3082 status = decode_attr_mode(xdr, bitmap, &fmode);
3083 if (status < 0)
3030 goto xdr_error; 3084 goto xdr_error;
3031 fattr->mode |= fmode; 3085 if (status != 0) {
3032 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3086 fattr->mode |= fmode;
3087 fattr->valid |= status;
3088 }
3089
3090 status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
3091 if (status < 0)
3033 goto xdr_error; 3092 goto xdr_error;
3034 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) 3093 fattr->valid |= status;
3094
3095 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
3096 if (status < 0)
3035 goto xdr_error; 3097 goto xdr_error;
3036 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) 3098 fattr->valid |= status;
3099
3100 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
3101 if (status < 0)
3037 goto xdr_error; 3102 goto xdr_error;
3038 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3103 fattr->valid |= status;
3104
3105 status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
3106 if (status < 0)
3039 goto xdr_error; 3107 goto xdr_error;
3040 if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) 3108 fattr->valid |= status;
3109
3110 status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
3111 if (status < 0)
3041 goto xdr_error; 3112 goto xdr_error;
3042 if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) 3113 fattr->valid |= status;
3114
3115 status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
3116 if (status < 0)
3043 goto xdr_error; 3117 goto xdr_error;
3044 if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) 3118 fattr->valid |= status;
3119
3120 status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
3121 if (status < 0)
3045 goto xdr_error; 3122 goto xdr_error;
3046 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3123 fattr->valid |= status;
3124
3125 status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
3126 if (status < 0)
3047 goto xdr_error; 3127 goto xdr_error;
3048 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) 3128 fattr->valid |= status;
3129
3130 status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
3131 if (status < 0)
3049 goto xdr_error; 3132 goto xdr_error;
3050 if (fattr->fileid == 0 && fileid != 0) 3133 if (status != 0 && !(fattr->valid & status)) {
3051 fattr->fileid = fileid; 3134 fattr->fileid = fileid;
3052 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3135 fattr->valid |= status;
3053 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3136 }
3137
3138 status = verify_attr_len(xdr, savep, attrlen);
3054xdr_error: 3139xdr_error:
3055 dprintk("%s: xdr returned %d\n", __func__, -status); 3140 dprintk("%s: xdr returned %d\n", __func__, -status);
3056 return status; 3141 return status;
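
With that convention in place, decode_getfattr() no longer sets NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4 wholesale once verify_attr_len() succeeds; it ORs each positive return into fattr->valid as it goes, so the mask records exactly which attributes the server supplied. Every per-attribute step above reduces to the same idiom:

	status = decode_attr_size(xdr, bitmap, &fattr->size);
	if (status < 0)			/* XDR error: abort the whole decode */
		goto xdr_error;
	fattr->valid |= status;		/* 0 (absent) or NFS_ATTR_FATTR_SIZE */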
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4078 status = decode_setattr(&xdr, res); 4163 status = decode_setattr(&xdr, res);
4079 if (status) 4164 if (status)
4080 goto out; 4165 goto out;
4081 status = decode_getfattr(&xdr, res->fattr, res->server); 4166 decode_getfattr(&xdr, res->fattr, res->server);
4082 if (status == NFS4ERR_DELAY)
4083 status = 0;
4084out: 4167out:
4085 return status; 4168 return status;
4086} 4169}
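
Note that nfs4_xdr_dec_setattr() now discards the decode_getfattr() result outright instead of special-casing NFS4ERR_DELAY: the SETATTR itself has already succeeded, and the trailing GETATTR is best-effort. A sketch of the resulting contract:

	/* post-op attributes are advisory only */
	decode_getfattr(&xdr, res->fattr, res->server);
	/* a decode failure just leaves fattr->valid reflecting whatever
	 * was recovered; it never fails the SETATTR reply itself */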
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d9ef602fbc5a..e3ed5908820b 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -129,7 +129,7 @@ enum {
129 Opt_err 129 Opt_err
130}; 130};
131 131
132static match_table_t __initconst tokens = { 132static const match_table_t tokens __initconst = {
133 {Opt_port, "port=%u"}, 133 {Opt_port, "port=%u"},
134 {Opt_rsize, "rsize=%u"}, 134 {Opt_rsize, "rsize=%u"},
135 {Opt_wsize, "wsize=%u"}, 135 {Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 176 kref_put(&req->wb_kref, nfs_free_request);
177} 177}
178 178
179static int nfs_wait_bit_killable(void *word)
180{
181 int ret = 0;
182
183 if (fatal_signal_pending(current))
184 ret = -ERESTARTSYS;
185 else
186 schedule();
187 return ret;
188}
189
190/** 179/**
191 * nfs_wait_on_request - Wait for a request to complete. 180 * nfs_wait_on_request - Wait for a request to complete.
192 * @req: request to wait upon. 181 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
663 .commit_setup = nfs_proc_commit_setup, 663 .commit_setup = nfs_proc_commit_setup,
664 .lock = nfs_proc_lock, 664 .lock = nfs_proc_lock,
665 .lock_check_bounds = nfs_lock_check_bounds, 665 .lock_check_bounds = nfs_lock_check_bounds,
666 .close_context = nfs_close_context,
666}; 667};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
24 24
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h"
27 28
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 29#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 30
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
111 } 112 }
112} 113}
113 114
114static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 115int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 116 struct page *page)
116{ 117{
117 LIST_HEAD(one_request); 118 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
139 140
140static void nfs_readpage_release(struct nfs_page *req) 141static void nfs_readpage_release(struct nfs_page *req)
141{ 142{
143 struct inode *d_inode = req->wb_context->path.dentry->d_inode;
144
145 if (PageUptodate(req->wb_page))
146 nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
147
142 unlock_page(req->wb_page); 148 unlock_page(req->wb_page);
143 149
144 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 150 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
510 } else 516 } else
511 ctx = get_nfs_open_context(nfs_file_open_context(file)); 517 ctx = get_nfs_open_context(nfs_file_open_context(file));
512 518
519 if (!IS_SYNC(inode)) {
520 error = nfs_readpage_from_fscache(ctx, inode, page);
521 if (error == 0)
522 goto out;
523 }
524
513 error = nfs_readpage_async(ctx, inode, page); 525 error = nfs_readpage_async(ctx, inode, page);
514 526
527out:
515 put_nfs_open_context(ctx); 528 put_nfs_open_context(ctx);
516 return error; 529 return error;
517out_unlock: 530out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
584 return -EBADF; 597 return -EBADF;
585 } else 598 } else
586 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); 599 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
600
601 /* attempt to read as many of the pages as possible from the cache
602 * - this returns -ENOBUFS immediately if the cookie is negative
603 */
604 ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
605 pages, &nr_pages);
606 if (ret == 0)
607 goto read_complete; /* all pages were read */
608
587 if (rsize < PAGE_CACHE_SIZE) 609 if (rsize < PAGE_CACHE_SIZE)
588 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 610 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
589 else 611 else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
594 nfs_pageio_complete(&pgio); 616 nfs_pageio_complete(&pgio);
595 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 617 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
596 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 618 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
619read_complete:
597 put_nfs_open_context(desc.ctx); 620 put_nfs_open_context(desc.ctx);
598out: 621out:
599 return ret; 622 return ret;
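
The fscache read hooks appear to follow a single convention: return 0 when the request was satisfied from the local cache, and a negative value (-ENOBUFS when no cache cookie is attached, per the comment above) to mean "fall back to the network". The nfs_readpage() fast path is the one-page instance of it:

	if (!IS_SYNC(inode)) {
		error = nfs_readpage_from_fscache(ctx, inode, page);
		if (error == 0)
			goto out;	/* page filled from the cache */
		/* -ENOBUFS and friends: read over the wire instead */
	}
	error = nfs_readpage_async(ctx, inode, page);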
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..d2d67781c579 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
651 seq_printf(m, "\n\tbytes:\t"); 663 seq_printf(m, "\n\tbytes:\t");
652 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 664 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
653 seq_printf(m, "%Lu ", totals.bytes[i]); 665 seq_printf(m, "%Lu ", totals.bytes[i]);
666#ifdef CONFIG_NFS_FSCACHE
667 if (nfss->options & NFS_OPTION_FSCACHE) {
668 seq_printf(m, "\n\tfsc:\t");
669 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
670 seq_printf(m, "%Lu ", totals.fscache[i]);
671 }
672#endif
654 seq_printf(m, "\n"); 673 seq_printf(m, "\n");
655 674
656 rpc_print_iostats(m, nfss->client); 675 rpc_print_iostats(m, nfss->client);
@@ -664,9 +683,12 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
664 */ 683 */
665static void nfs_umount_begin(struct super_block *sb) 684static void nfs_umount_begin(struct super_block *sb)
666{ 685{
667 struct nfs_server *server = NFS_SB(sb); 686 struct nfs_server *server;
668 struct rpc_clnt *rpc; 687 struct rpc_clnt *rpc;
669 688
689 lock_kernel();
690
691 server = NFS_SB(sb);
670 /* -EIO all pending I/O */ 692 /* -EIO all pending I/O */
671 rpc = server->client_acl; 693 rpc = server->client_acl;
672 if (!IS_ERR(rpc)) 694 if (!IS_ERR(rpc))
@@ -674,6 +696,8 @@ static void nfs_umount_begin(struct super_block *sb)
674 rpc = server->client; 696 rpc = server->client;
675 if (!IS_ERR(rpc)) 697 if (!IS_ERR(rpc))
676 rpc_killall_tasks(rpc); 698 rpc_killall_tasks(rpc);
699
700 unlock_kernel();
677} 701}
678 702
679/* 703/*
@@ -1018,6 +1042,7 @@ static int nfs_parse_mount_options(char *raw,
1018 case Opt_rdma: 1042 case Opt_rdma:
1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1043 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1044 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1045 xprt_load_transport(p);
1021 break; 1046 break;
1022 case Opt_acl: 1047 case Opt_acl:
1023 mnt->flags &= ~NFS_MOUNT_NOACL; 1048 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1043,6 +1068,24 @@ static int nfs_parse_mount_options(char *raw,
1043 case Opt_noresvport: 1068 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT; 1069 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break; 1070 break;
1071 case Opt_fscache:
1072 mnt->options |= NFS_OPTION_FSCACHE;
1073 kfree(mnt->fscache_uniq);
1074 mnt->fscache_uniq = NULL;
1075 break;
1076 case Opt_nofscache:
1077 mnt->options &= ~NFS_OPTION_FSCACHE;
1078 kfree(mnt->fscache_uniq);
1079 mnt->fscache_uniq = NULL;
1080 break;
1081 case Opt_fscache_uniq:
1082 string = match_strdup(args);
1083 if (!string)
1084 goto out_nomem;
1085 kfree(mnt->fscache_uniq);
1086 mnt->fscache_uniq = string;
1087 mnt->options |= NFS_OPTION_FSCACHE;
1088 break;
1046 1089
1047 /* 1090 /*
1048 * options that take numeric values 1091 * options that take numeric values
@@ -1190,7 +1233,6 @@ static int nfs_parse_mount_options(char *raw,
1190 goto out_nomem; 1233 goto out_nomem;
1191 token = match_token(string, 1234 token = match_token(string,
1192 nfs_xprt_protocol_tokens, args); 1235 nfs_xprt_protocol_tokens, args);
1193 kfree(string);
1194 1236
1195 switch (token) { 1237 switch (token) {
1196 case Opt_xprt_udp: 1238 case Opt_xprt_udp:
@@ -1205,12 +1247,14 @@ static int nfs_parse_mount_options(char *raw,
1205 /* vector side protocols to TCP */ 1247 /* vector side protocols to TCP */
1206 mnt->flags |= NFS_MOUNT_TCP; 1248 mnt->flags |= NFS_MOUNT_TCP;
1207 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1249 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1250 xprt_load_transport(string);
1208 break; 1251 break;
1209 default: 1252 default:
1210 errors++; 1253 errors++;
1211 dfprintk(MOUNT, "NFS: unrecognized " 1254 dfprintk(MOUNT, "NFS: unrecognized "
1212 "transport protocol\n"); 1255 "transport protocol\n");
1213 } 1256 }
1257 kfree(string);
1214 break; 1258 break;
1215 case Opt_mountproto: 1259 case Opt_mountproto:
1216 string = match_strdup(args); 1260 string = match_strdup(args);
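
Moving kfree(string) below the switch is not cosmetic: the Opt_xprt_rdma arm now passes the option string to xprt_load_transport(string), so freeing it right after match_token(), as before, would hand the transport loader a dangling pointer. The corrected lifetime, in sketch form:

	string = match_strdup(args);
	if (string == NULL)
		goto out_nomem;
	token = match_token(string, nfs_xprt_protocol_tokens, args);
	switch (token) {
	case Opt_xprt_rdma:
		xprt_load_transport(string);	/* still needs the string */
		break;
	default:
		break;
	}
	kfree(string);				/* freed only after its last use */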
@@ -1868,8 +1912,6 @@ static void nfs_clone_super(struct super_block *sb,
1868 nfs_initialise_sb(sb); 1912 nfs_initialise_sb(sb);
1869} 1913}
1870 1914
1871#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
1872
1873static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) 1915static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
1874{ 1916{
1875 const struct nfs_server *a = s->s_fs_info; 1917 const struct nfs_server *a = s->s_fs_info;
@@ -2034,6 +2076,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2034 if (!s->s_root) { 2076 if (!s->s_root) {
2035 /* initial superblock/root creation */ 2077 /* initial superblock/root creation */
2036 nfs_fill_super(s, data); 2078 nfs_fill_super(s, data);
2079 nfs_fscache_get_super_cookie(s, data);
2037 } 2080 }
2038 2081
2039 mntroot = nfs_get_root(s, mntfh); 2082 mntroot = nfs_get_root(s, mntfh);
@@ -2054,6 +2097,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2054out: 2097out:
2055 kfree(data->nfs_server.hostname); 2098 kfree(data->nfs_server.hostname);
2056 kfree(data->mount_server.hostname); 2099 kfree(data->mount_server.hostname);
2100 kfree(data->fscache_uniq);
2057 security_free_mnt_opts(&data->lsm_opts); 2101 security_free_mnt_opts(&data->lsm_opts);
2058out_free_fh: 2102out_free_fh:
2059 kfree(mntfh); 2103 kfree(mntfh);
@@ -2067,8 +2111,7 @@ out_err_nosb:
2067error_splat_root: 2111error_splat_root:
2068 dput(mntroot); 2112 dput(mntroot);
2069error_splat_super: 2113error_splat_super:
2070 up_write(&s->s_umount); 2114 deactivate_locked_super(s);
2071 deactivate_super(s);
2072 goto out; 2115 goto out;
2073} 2116}
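
This error-path change (repeated verbatim in the nfs4 variants below) folds the unlock/release pair into deactivate_locked_super(), which is assumed to drop the active superblock reference while s_umount is still write-held rather than after it has been released:

	error_splat_super:
		/* old: up_write(&s->s_umount); deactivate_super(s);
		 *      the sb was briefly unlocked but still active */
		deactivate_locked_super(s);	/* ref dropped under s_umount */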
2074 2117
@@ -2081,6 +2124,7 @@ static void nfs_kill_super(struct super_block *s)
2081 2124
2082 bdi_unregister(&server->backing_dev_info); 2125 bdi_unregister(&server->backing_dev_info);
2083 kill_anon_super(s); 2126 kill_anon_super(s);
2127 nfs_fscache_release_super_cookie(s);
2084 nfs_free_server(server); 2128 nfs_free_server(server);
2085} 2129}
2086 2130
@@ -2163,8 +2207,7 @@ out_err_noserver:
2163 return error; 2207 return error;
2164 2208
2165error_splat_super: 2209error_splat_super:
2166 up_write(&s->s_umount); 2210 deactivate_locked_super(s);
2167 deactivate_super(s);
2168 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2211 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2169 return error; 2212 return error;
2170} 2213}
@@ -2388,6 +2431,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2388 if (!s->s_root) { 2431 if (!s->s_root) {
2389 /* initial superblock/root creation */ 2432 /* initial superblock/root creation */
2390 nfs4_fill_super(s); 2433 nfs4_fill_super(s);
2434 nfs_fscache_get_super_cookie(s, data);
2391 } 2435 }
2392 2436
2393 mntroot = nfs4_get_root(s, mntfh); 2437 mntroot = nfs4_get_root(s, mntfh);
@@ -2409,6 +2453,7 @@ out:
2409 kfree(data->client_address); 2453 kfree(data->client_address);
2410 kfree(data->nfs_server.export_path); 2454 kfree(data->nfs_server.export_path);
2411 kfree(data->nfs_server.hostname); 2455 kfree(data->nfs_server.hostname);
2456 kfree(data->fscache_uniq);
2412 security_free_mnt_opts(&data->lsm_opts); 2457 security_free_mnt_opts(&data->lsm_opts);
2413out_free_fh: 2458out_free_fh:
2414 kfree(mntfh); 2459 kfree(mntfh);
@@ -2422,8 +2467,7 @@ out_free:
2422error_splat_root: 2467error_splat_root:
2423 dput(mntroot); 2468 dput(mntroot);
2424error_splat_super: 2469error_splat_super:
2425 up_write(&s->s_umount); 2470 deactivate_locked_super(s);
2426 deactivate_super(s);
2427 goto out; 2471 goto out;
2428} 2472}
2429 2473
@@ -2435,6 +2479,7 @@ static void nfs4_kill_super(struct super_block *sb)
2435 kill_anon_super(sb); 2479 kill_anon_super(sb);
2436 2480
2437 nfs4_renewd_prepare_shutdown(server); 2481 nfs4_renewd_prepare_shutdown(server);
2482 nfs_fscache_release_super_cookie(sb);
2438 nfs_free_server(server); 2483 nfs_free_server(server);
2439} 2484}
2440 2485
@@ -2516,8 +2561,7 @@ out_err_noserver:
2516 return error; 2561 return error;
2517 2562
2518error_splat_super: 2563error_splat_super:
2519 up_write(&s->s_umount); 2564 deactivate_locked_super(s);
2520 deactivate_super(s);
2521 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2565 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2522 return error; 2566 return error;
2523} 2567}
@@ -2601,8 +2645,7 @@ out_err_noserver:
2601 return error; 2645 return error;
2602 2646
2603error_splat_super: 2647error_splat_super:
2604 up_write(&s->s_umount); 2648 deactivate_locked_super(s);
2605 deactivate_super(s);
2606 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2649 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2607 return error; 2650 return error;
2608} 2651}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
314{ 314{
315 struct inode *inode = mapping->host; 315 struct inode *inode = mapping->host;
316 unsigned long *bitlock = &NFS_I(inode)->flags;
316 struct nfs_pageio_descriptor pgio; 317 struct nfs_pageio_descriptor pgio;
317 int err; 318 int err;
318 319
320 /* Stop dirtying of new pages while we sync */
321 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
322 nfs_wait_bit_killable, TASK_KILLABLE);
323 if (err)
324 goto out_err;
325
319 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 326 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
320 327
321 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 328 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
322 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 329 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
323 nfs_pageio_complete(&pgio); 330 nfs_pageio_complete(&pgio);
331
332 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
333 smp_mb__after_clear_bit();
334 wake_up_bit(bitlock, NFS_INO_FLUSHING);
335
324 if (err < 0) 336 if (err < 0)
325 return err; 337 goto out_err;
326 if (pgio.pg_error < 0) 338 err = pgio.pg_error;
327 return pgio.pg_error; 339 if (err < 0)
340 goto out_err;
328 return 0; 341 return 0;
342out_err:
343 return err;
329} 344}
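
NFS_INO_FLUSHING acts as a bit-lock on the inode flags word: a writer entering nfs_writepages() sleeps killably until the bit is clear, and the unlock side must pair clear_bit_unlock() with a barrier and an explicit wake-up or sleepers can be missed. The generic shape of the pattern used above (nfs_wait_bit_killable is evidently the helper whose private copy the pagelist.c hunk deletes, now shared across call sites):

	err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
			       nfs_wait_bit_killable, TASK_KILLABLE);
	if (err)			/* -ERESTARTSYS on a fatal signal */
		return err;

	/* ... flush dirty pages ... */

	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
	smp_mb__after_clear_bit();	/* order the clear before the wake test */
	wake_up_bit(bitlock, NFS_INO_FLUSHING);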
330 345
331/* 346/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
404 struct nfs_inode *nfsi = NFS_I(inode); 419 struct nfs_inode *nfsi = NFS_I(inode);
405 420
406 spin_lock(&inode->i_lock); 421 spin_lock(&inode->i_lock);
407 nfsi->ncommit++;
408 set_bit(PG_CLEAN, &(req)->wb_flags); 422 set_bit(PG_CLEAN, &(req)->wb_flags);
409 radix_tree_tag_set(&nfsi->nfs_page_tree, 423 radix_tree_tag_set(&nfsi->nfs_page_tree,
410 req->wb_index, 424 req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
524} 538}
525 539
526#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 540#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
541static int
542nfs_need_commit(struct nfs_inode *nfsi)
543{
544 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
545}
546
527/* 547/*
528 * nfs_scan_commit - Scan an inode for commit requests 548 * nfs_scan_commit - Scan an inode for commit requests
529 * @inode: NFS inode to scan 549 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
538nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 558nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
539{ 559{
540 struct nfs_inode *nfsi = NFS_I(inode); 560 struct nfs_inode *nfsi = NFS_I(inode);
541 int res = 0;
542 561
543 if (nfsi->ncommit != 0) { 562 if (!nfs_need_commit(nfsi))
544 res = nfs_scan_list(nfsi, dst, idx_start, npages, 563 return 0;
545 NFS_PAGE_TAG_COMMIT); 564
546 nfsi->ncommit -= res; 565 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 }
548 return res;
549} 566}
550#else 567#else
568static inline int nfs_need_commit(struct nfs_inode *nfsi)
569{
570 return 0;
571}
572
551static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 573static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
552{ 574{
553 return 0; 575 return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
820 data->args.stable = NFS_UNSTABLE; 842 data->args.stable = NFS_UNSTABLE;
821 if (how & FLUSH_STABLE) { 843 if (how & FLUSH_STABLE) {
822 data->args.stable = NFS_DATA_SYNC; 844 data->args.stable = NFS_DATA_SYNC;
823 if (!NFS_I(inode)->ncommit) 845 if (!nfs_need_commit(NFS_I(inode)))
824 data->args.stable = NFS_FILE_SYNC; 846 data->args.stable = NFS_FILE_SYNC;
825 } 847 }
826 848
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1425{ 1447{
1426 struct writeback_control wbc = { 1448 struct writeback_control wbc = {
1427 .bdi = mapping->backing_dev_info, 1449 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1450 .sync_mode = WB_SYNC_ALL,
1429 .nr_to_write = LONG_MAX, 1451 .nr_to_write = LONG_MAX,
1430 .range_start = 0, 1452 .range_start = 0,
1431 .range_end = LLONG_MAX, 1453 .range_end = LLONG_MAX,
1432 .for_writepages = 1, 1454 .for_writepages = 1,
1433 }; 1455 };
1434 int ret;
1435 1456
1436 ret = __nfs_write_mapping(mapping, &wbc, how);
1437 if (ret < 0)
1438 return ret;
1439 wbc.sync_mode = WB_SYNC_ALL;
1440 return __nfs_write_mapping(mapping, &wbc, how); 1457 return __nfs_write_mapping(mapping, &wbc, how);
1441} 1458}
1442 1459
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
1config NFSD 1config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select EXPORTFS 7 select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/major.h> 20#include <linux/major.h>
21#include <linux/magic.h>
21 22
22#include <linux/sunrpc/svc.h> 23#include <linux/sunrpc/svc.h>
23#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
202 struct nfsd3_writeres *resp) 203 struct nfsd3_writeres *resp)
203{ 204{
204 __be32 nfserr; 205 __be32 nfserr;
206 unsigned long cnt = argp->len;
205 207
206 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 208 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
207 SVCFH_fmt(&argp->fh), 209 SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
214 nfserr = nfsd_write(rqstp, &resp->fh, NULL, 216 nfserr = nfsd_write(rqstp, &resp->fh, NULL,
215 argp->offset, 217 argp->offset,
216 rqstp->rq_vec, argp->vlen, 218 rqstp->rq_vec, argp->vlen,
217 argp->len, 219 &cnt,
218 &resp->committed); 220 &resp->committed);
219 resp->count = argp->count; 221 resp->count = cnt;
220 RETURN_STATUS(nfserr); 222 RETURN_STATUS(nfserr);
221} 223}
222 224
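
nfsd_write() evidently now takes the byte count by reference and updates it in place, so callers seed the count with the requested length and read back what was actually written; nfsd4_write below makes the same conversion. In outline:

	unsigned long cnt = argp->len;		/* in: bytes requested */

	nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset,
			    rqstp->rq_vec, argp->vlen, &cnt,
			    &resp->committed);
	resp->count = cnt;			/* out: bytes actually written */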
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
569 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; 571 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
570 572
571 /* Note that we don't care for remote fs's here */ 573 /* Note that we don't care for remote fs's here */
572 if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { 574 if (sb->s_magic == MSDOS_SUPER_MAGIC) {
573 resp->f_properties = NFS3_FSF_BILLYBOY; 575 resp->f_properties = NFS3_FSF_BILLYBOY;
574 } 576 }
575 resp->f_maxfilesize = sb->s_maxbytes; 577 resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
610 resp->p_link_max = EXT2_LINK_MAX; 612 resp->p_link_max = EXT2_LINK_MAX;
611 resp->p_name_max = EXT2_NAME_LEN; 613 resp->p_name_max = EXT2_NAME_LEN;
612 break; 614 break;
613 case 0x4d44: /* MSDOS_SUPER_MAGIC */ 615 case MSDOS_SUPER_MAGIC:
614 resp->p_case_insensitive = 1; 616 resp->p_case_insensitive = 1;
615 resp->p_case_preserving = 0; 617 resp->p_case_preserving = 0;
616 break; 618 break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
219{ 219{
220 __be32 *p; 220 __be32 *p;
221 int len = cb_rec->cbr_fhlen; 221 int len = cb_rec->cbr_fh.fh_size;
222 222
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
224 WRITE32(OP_CB_RECALL); 224 WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 227 WRITE32(cb_rec->cbr_trunc);
228 WRITE32(len); 228 WRITE32(len);
229 WRITEMEM(cb_rec->cbr_fhval, len); 229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
230 return 0; 230 return 0;
231} 231}
232 232
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
361/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 362 * And why is cb_set an atomic? */
363 363
364static int do_probe_callback(void *data) 364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
365{ 365{
366 struct nfs4_client *clp = data;
367 struct sockaddr_in addr; 366 struct sockaddr_in addr;
368 struct nfs4_callback *cb = &clp->cl_callback; 367 struct nfs4_callback *cb = &clp->cl_callback;
369 struct rpc_timeout timeparms = { 368 struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 383 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal, 384 .client_name = clp->cl_principal,
386 }; 385 };
387 struct rpc_message msg = {
388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
389 .rpc_argp = clp,
390 };
391 struct rpc_clnt *client; 386 struct rpc_clnt *client;
392 int status;
393 387
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { 388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
395 status = nfserr_cb_path_down; 389 return ERR_PTR(-EINVAL);
396 goto out_err;
397 }
398 390
399 /* Initialize address */ 391 /* Initialize address */
400 memset(&addr, 0, sizeof(addr)); 392 memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
404 396
405 /* Create RPC client */ 397 /* Create RPC client */
406 client = rpc_create(&args); 398 client = rpc_create(&args);
399 if (IS_ERR(client))
400 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client));
402 return client;
403
404}
405
406static int do_probe_callback(void *data)
407{
408 struct nfs4_client *clp = data;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp,
413 };
414 struct rpc_clnt *client;
415 int status;
416
417 client = setup_callback_client(clp);
407 if (IS_ERR(client)) { 418 if (IS_ERR(client)) {
408 dprintk("NFSD: couldn't create callback client\n");
409 status = PTR_ERR(client); 419 status = PTR_ERR(client);
420 dprintk("NFSD: couldn't create callback client: %d\n",
421 status);
410 goto out_err; 422 goto out_err;
411 } 423 }
412 424
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
422out_release_client: 434out_release_client:
423 rpc_shutdown_client(client); 435 rpc_shutdown_client(client);
424out_err: 436out_err:
425 dprintk("NFSD: warning: no callback path to client %.*s\n", 437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
426 (int)clp->cl_name.len, clp->cl_name.data); 438 (int)clp->cl_name.len, clp->cl_name.data, status);
427 put_nfs4_client(clp); 439 put_nfs4_client(clp);
428 return status; 440 return 0;
429} 441}
430 442
431/* 443/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
451 463
452/* 464/*
453 * called with dp->dl_count inc'ed. 465 * called with dp->dl_count inc'ed.
454 * nfs4_lock_state() may or may not have been called.
455 */ 466 */
456void 467void
457nfsd4_cb_recall(struct nfs4_delegation *dp) 468nfsd4_cb_recall(struct nfs4_delegation *dp)
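
Splitting setup_callback_client() out of do_probe_callback() also switches its error reporting to the usual pointer-encoding convention: the helper returns either a valid struct rpc_clnt * or an ERR_PTR()-encoded errno, never NULL, so the caller tests it like this:

	struct rpc_clnt *client = setup_callback_client(clp);

	if (IS_ERR(client)) {
		status = PTR_ERR(client);	/* recover the errno */
		goto out_err;
	}
	/* client is usable from here on */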
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
93 open->op_truncate = 0; 93 open->op_truncate = 0;
94 94
95 if (open->op_create) { 95 if (open->op_create) {
96 /* FIXME: check session persistence and pnfs flags.
97 * The nfsv4.1 spec requires the following semantics:
98 *
99 * Persistent | pNFS | Server REQUIRED | Client Allowed
100 * Reply Cache | server | |
101 * -------------+--------+-----------------+--------------------
102 * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
103 * | | | (SHOULD)
104 * | | and EXCLUSIVE4 | or EXCLUSIVE4
105 * | | | (SHOULD NOT)
106 * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
107 * yes | no | GUARDED4 | GUARDED4
108 * yes | yes | GUARDED4 | GUARDED4
109 */
110
96 /* 111 /*
97 * Note: create modes (UNCHECKED,GUARDED...) are the same 112 * Note: create modes (UNCHECKED,GUARDED...) are the same
98 * in NFSv4 as in v3. 113 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
103 (u32 *)open->op_verf.data, 118 (u32 *)open->op_verf.data,
104 &open->op_truncate, &created); 119 &open->op_truncate, &created);
105 120
106 /* If we ever decide to use different attrs to store the 121 /*
107 * verifier in nfsd_create_v3, then we'll need to change this 122 * Following rfc 3530 14.2.16, use the returned bitmask
123 * to indicate which attributes we used to store the
124 * verifier:
108 */ 125 */
109 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) 126 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
110 open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | 127 open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
111 FATTR4_WORD1_TIME_MODIFY); 128 FATTR4_WORD1_TIME_MODIFY);
112 } else { 129 } else {
113 status = nfsd_lookup(rqstp, current_fh, 130 status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
118 goto out; 135 goto out;
119 136
120 set_change_info(&open->op_cinfo, current_fh); 137 set_change_info(&open->op_cinfo, current_fh);
121
122 /* set reply cache */
123 fh_dup2(current_fh, &resfh); 138 fh_dup2(current_fh, &resfh);
124 open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
125 memcpy(open->op_stateowner->so_replay.rp_openfh,
126 &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
127 139
140 /* set reply cache */
141 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
142 &resfh.fh_handle);
128 if (!created) 143 if (!created)
129 status = do_open_permission(rqstp, current_fh, open, 144 status = do_open_permission(rqstp, current_fh, open,
130 NFSD_MAY_NOP); 145 NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
150 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 165 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
151 166
152 /* set replay cache */ 167 /* set replay cache */
153 open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; 168 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
154 memcpy(open->op_stateowner->so_replay.rp_openfh, 169 &current_fh->fh_handle);
155 &current_fh->fh_handle.fh_base,
156 current_fh->fh_handle.fh_size);
157 170
158 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 171 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
159 (open->op_iattr.ia_size == 0); 172 (open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
164 return status; 177 return status;
165} 178}
166 179
180static void
181copy_clientid(clientid_t *clid, struct nfsd4_session *session)
182{
183 struct nfsd4_sessionid *sid =
184 (struct nfsd4_sessionid *)session->se_sessionid.data;
185
186 clid->cl_boot = sid->clientid.cl_boot;
187 clid->cl_id = sid->clientid.cl_id;
188}
167 189
168static __be32 190static __be32
169nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 191nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
170 struct nfsd4_open *open) 192 struct nfsd4_open *open)
171{ 193{
172 __be32 status; 194 __be32 status;
195 struct nfsd4_compoundres *resp;
196
173 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 197 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
174 (int)open->op_fname.len, open->op_fname.data, 198 (int)open->op_fname.len, open->op_fname.data,
175 open->op_stateowner); 199 open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
178 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 202 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
179 return nfserr_inval; 203 return nfserr_inval;
180 204
205 if (nfsd4_has_session(cstate))
206 copy_clientid(&open->op_clientid, cstate->session);
207
181 nfs4_lock_state(); 208 nfs4_lock_state();
182 209
183 /* check seqid for replay. set nfs4_owner */ 210 /* check seqid for replay. set nfs4_owner */
184 status = nfsd4_process_open1(open); 211 resp = rqstp->rq_resp;
212 status = nfsd4_process_open1(&resp->cstate, open);
185 if (status == nfserr_replay_me) { 213 if (status == nfserr_replay_me) {
186 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 214 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
187 fh_put(&cstate->current_fh); 215 fh_put(&cstate->current_fh);
188 cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; 216 fh_copy_shallow(&cstate->current_fh.fh_handle,
189 memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, 217 &rp->rp_openfh);
190 rp->rp_openfh_len);
191 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 218 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
192 if (status) 219 if (status)
193 dprintk("nfsd4_open: replay failed" 220 dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
209 236
210 switch (open->op_claim_type) { 237 switch (open->op_claim_type) {
211 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 238 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
212 status = nfserr_inval;
213 if (open->op_create)
214 goto out;
215 /* fall through */
216 case NFS4_OPEN_CLAIM_NULL: 239 case NFS4_OPEN_CLAIM_NULL:
217 /* 240 /*
218 * (1) set CURRENT_FH to the file being opened, 241 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
455 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 478 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
456 return nfserr_inval; 479 return nfserr_inval;
457 480
458 getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 481 getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
459 getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 482 getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
483 getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
460 484
461 getattr->ga_fhp = &cstate->current_fh; 485 getattr->ga_fhp = &cstate->current_fh;
462 return nfs_ok; 486 return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
520 544
521 nfs4_lock_state(); 545 nfs4_lock_state();
522 /* check stateid */ 546 /* check stateid */
523 if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, 547 if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
524 &read->rd_stateid, 548 RD_STATE, &read->rd_filp))) {
525 CHECK_FH | RD_STATE, &read->rd_filp))) {
526 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); 549 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
527 goto out; 550 goto out;
528 } 551 }
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
548 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 571 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
549 return nfserr_inval; 572 return nfserr_inval;
550 573
551 readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 574 readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
552 readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 575 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
576 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
553 577
554 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 578 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
555 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 579 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
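
The nfsd_suppattrsN() helpers replace the fixed NFSD_SUPPORTED_ATTRS_WORDn masks so the supported-attribute words can vary with the minor version (NFSv4.1 adds a third bitmap word). A plausible shape for them, assuming per-minorversion mask constants along these lines:

	static inline u32 nfsd_suppattrs0(u32 minorversion)
	{
		return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
				    : NFSD4_SUPPORTED_ATTRS_WORD0;
	}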
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
653 677
654 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 678 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
655 nfs4_lock_state(); 679 nfs4_lock_state();
656 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 680 status = nfs4_preprocess_stateid_op(cstate,
657 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); 681 &setattr->sa_stateid, WR_STATE, NULL);
658 nfs4_unlock_state(); 682 nfs4_unlock_state();
659 if (status) { 683 if (status) {
660 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); 684 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
685 struct file *filp = NULL; 709 struct file *filp = NULL;
686 u32 *p; 710 u32 *p;
687 __be32 status = nfs_ok; 711 __be32 status = nfs_ok;
712 unsigned long cnt;
688 713
689 /* no need to check permission - this will be done in nfsd_write() */ 714 /* no need to check permission - this will be done in nfsd_write() */
690 715
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
692 return nfserr_inval; 717 return nfserr_inval;
693 718
694 nfs4_lock_state(); 719 nfs4_lock_state();
695 status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, 720 status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
696 CHECK_FH | WR_STATE, &filp);
697 if (filp) 721 if (filp)
698 get_file(filp); 722 get_file(filp);
699 nfs4_unlock_state(); 723 nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
703 return status; 727 return status;
704 } 728 }
705 729
706 write->wr_bytes_written = write->wr_buflen; 730 cnt = write->wr_buflen;
707 write->wr_how_written = write->wr_stable_how; 731 write->wr_how_written = write->wr_stable_how;
708 p = (u32 *)write->wr_verifier.data; 732 p = (u32 *)write->wr_verifier.data;
709 *p++ = nfssvc_boot.tv_sec; 733 *p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
711 735
712 status = nfsd_write(rqstp, &cstate->current_fh, filp, 736 status = nfsd_write(rqstp, &cstate->current_fh, filp,
713 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 737 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
714 write->wr_buflen, &write->wr_how_written); 738 &cnt, &write->wr_how_written);
715 if (filp) 739 if (filp)
716 fput(filp); 740 fput(filp);
717 741
742 write->wr_bytes_written = cnt;
743
718 if (status == nfserr_symlink) 744 if (status == nfserr_symlink)
719 status = nfserr_inval; 745 status = nfserr_inval;
720 return status; 746 return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
737 if (status) 763 if (status)
738 return status; 764 return status;
739 765
740 if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) 766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
741 || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
742 return nfserr_attrnotsupp; 769 return nfserr_attrnotsupp;
743 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
744 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
766 if (status) 793 if (status)
767 goto out_kfree; 794 goto out_kfree;
768 795
769 p = buf + 3; 796 /* skip bitmap */
797 p = buf + 1 + ntohl(buf[0]);
770 status = nfserr_not_same; 798 status = nfserr_not_same;
771 if (ntohl(*p++) != verify->ve_attrlen) 799 if (ntohl(*p++) != verify->ve_attrlen)
772 goto out_kfree; 800 goto out_kfree;
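
The replaced buf + 3 hard-coded a two-word attribute bitmap (one count word plus two mask words). The bitmap is variable-length on the wire, and NFSv4.1 replies can carry a third mask word, so the verify path has to skip it by its encoded count:

	/* GETATTR attr data layout:
	 *   word 0       bitmap word count N
	 *   words 1..N   bitmap mask words
	 *   word N+1     attribute byte length
	 *   then         the attribute values themselves
	 */
	p = buf + 1 + ntohl(buf[0]);	/* step over count word + N mask words */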
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
813 nfsdstats.nfs4_opcount[opnum]++; 841 nfsdstats.nfs4_opcount[opnum]++;
814} 842}
815 843
816static void cstate_free(struct nfsd4_compound_state *cstate)
817{
818 if (cstate == NULL)
819 return;
820 fh_put(&cstate->current_fh);
821 fh_put(&cstate->save_fh);
822 BUG_ON(cstate->replay_owner);
823 kfree(cstate);
824}
825
826static struct nfsd4_compound_state *cstate_alloc(void)
827{
828 struct nfsd4_compound_state *cstate;
829
830 cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
831 if (cstate == NULL)
832 return NULL;
833 fh_init(&cstate->current_fh, NFS4_FHSIZE);
834 fh_init(&cstate->save_fh, NFS4_FHSIZE);
835 cstate->replay_owner = NULL;
836 return cstate;
837}
838
839typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 844typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
840 void *); 845 void *);
846enum nfsd4_op_flags {
847 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
848 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */
 849 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops required first in compound */
850};
841 851
842struct nfsd4_operation { 852struct nfsd4_operation {
843 nfsd4op_func op_func; 853 nfsd4op_func op_func;
844 u32 op_flags; 854 u32 op_flags;
845/* Most ops require a valid current filehandle; a few don't: */
846#define ALLOWED_WITHOUT_FH 1
847/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
848#define ALLOWED_ON_ABSENT_FS 2
849 char *op_name; 855 char *op_name;
850}; 856};
851 857
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
854static const char *nfsd4_op_name(unsigned opnum); 860static const char *nfsd4_op_name(unsigned opnum);
855 861
856/* 862/*
863 * This is a replay of a compound for which no cache entry pages
864 * were used. Encode the sequence operation, and if cachethis is FALSE
865 * encode the uncache rep error on the next operation.
866 */
867static __be32
868nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
869 struct nfsd4_compoundres *resp)
870{
871 struct nfsd4_op *op;
872
873 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
874 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
875
876 /* Encode the replayed sequence operation */
877 BUG_ON(resp->opcnt != 1);
878 op = &args->ops[resp->opcnt - 1];
879 nfsd4_encode_operation(resp, op);
880
881 /*return nfserr_retry_uncached_rep in next operation. */
882 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
883 op = &args->ops[resp->opcnt++];
884 op->status = nfserr_retry_uncached_rep;
885 nfsd4_encode_operation(resp, op);
886 }
887 return op->status;
888}
889
890/*
891 * Enforce NFSv4.1 COMPOUND ordering rules.
892 *
893 * TODO:
894 * - enforce NFS4ERR_NOT_ONLY_OP,
895 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
896 */
897static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
898{
899 if (args->minorversion && args->opcnt > 0) {
900 struct nfsd4_op *op = &args->ops[0];
901 return (op->status == nfserr_op_illegal) ||
902 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
903 }
904 return true;
905}
906
907/*
857 * COMPOUND call. 908 * COMPOUND call.
858 */ 909 */
859static __be32 910static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
863{ 914{
864 struct nfsd4_op *op; 915 struct nfsd4_op *op;
865 struct nfsd4_operation *opdesc; 916 struct nfsd4_operation *opdesc;
866 struct nfsd4_compound_state *cstate = NULL; 917 struct nfsd4_compound_state *cstate = &resp->cstate;
867 int slack_bytes; 918 int slack_bytes;
868 __be32 status; 919 __be32 status;
869 920
870 resp->xbuf = &rqstp->rq_res; 921 resp->xbuf = &rqstp->rq_res;
871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 922 resp->p = rqstp->rq_res.head[0].iov_base +
923 rqstp->rq_res.head[0].iov_len;
872 resp->tagp = resp->p; 924 resp->tagp = resp->p;
873 /* reserve space for: taglen, tag, and opcnt */ 925 /* reserve space for: taglen, tag, and opcnt */
874 resp->p += 2 + XDR_QUADLEN(args->taglen); 926 resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
877 resp->tag = args->tag; 929 resp->tag = args->tag;
878 resp->opcnt = 0; 930 resp->opcnt = 0;
879 resp->rqstp = rqstp; 931 resp->rqstp = rqstp;
932 resp->cstate.minorversion = args->minorversion;
933 resp->cstate.replay_owner = NULL;
934 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
935 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
936 /* Use the deferral mechanism only for NFSv4.0 compounds */
937 rqstp->rq_usedeferral = (args->minorversion == 0);
880 938
881 /* 939 /*
882 * According to RFC3010, this takes precedence over all other errors. 940 * According to RFC3010, this takes precedence over all other errors.
883 */ 941 */
884 status = nfserr_minor_vers_mismatch; 942 status = nfserr_minor_vers_mismatch;
885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 943 if (args->minorversion > nfsd_supported_minorversion)
886 goto out; 944 goto out;
887 945
888 status = nfserr_resource; 946 if (!nfs41_op_ordering_ok(args)) {
889 cstate = cstate_alloc(); 947 op = &args->ops[0];
890 if (cstate == NULL) 948 op->status = nfserr_sequence_pos;
891 goto out; 949 goto encode_op;
950 }
892 951
893 status = nfs_ok; 952 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 953 while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
897 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", 956 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
898 resp->opcnt, args->opcnt, op->opnum, 957 resp->opcnt, args->opcnt, op->opnum,
899 nfsd4_op_name(op->opnum)); 958 nfsd4_op_name(op->opnum));
900
901 /* 959 /*
902 * The XDR decode routines may have pre-set op->status; 960 * The XDR decode routines may have pre-set op->status;
903 * for example, if there is a miscellaneous XDR error 961 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
938 BUG_ON(op->status == nfs_ok); 996 BUG_ON(op->status == nfs_ok);
939 997
940encode_op: 998encode_op:
999 /* Only from SEQUENCE or CREATE_SESSION */
1000 if (resp->cstate.status == nfserr_replay_cache) {
1001 dprintk("%s NFS4.1 replay from cache\n", __func__);
1002 if (nfsd4_not_cached(resp))
1003 status = nfsd4_enc_uncached_replay(args, resp);
1004 else
1005 status = op->status;
1006 goto out;
1007 }
941 if (op->status == nfserr_replay_me) { 1008 if (op->status == nfserr_replay_me) {
942 op->replay = &cstate->replay_owner->so_replay; 1009 op->replay = &cstate->replay_owner->so_replay;
943 nfsd4_encode_replay(resp, op); 1010 nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
961 1028
962 nfsd4_increment_op_stats(op->opnum); 1029 nfsd4_increment_op_stats(op->opnum);
963 } 1030 }
1031 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1032 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1033 status = nfserr_jukebox;
1034 }
964 1035
965 cstate_free(cstate); 1036 resp->cstate.status = status;
1037 fh_put(&resp->cstate.current_fh);
1038 fh_put(&resp->cstate.save_fh);
1039 BUG_ON(resp->cstate.replay_owner);
966out: 1040out:
967 nfsd4_release_compoundargs(args); 1041 nfsd4_release_compoundargs(args);
1042 /* Reset deferral mechanism for RPC deferrals */
1043 rqstp->rq_usedeferral = 1;
968 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1044 dprintk("nfsv4 compound returned %d\n", ntohl(status));
969 return status; 1045 return status;
970} 1046}
971 1047
972static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { 1048static struct nfsd4_operation nfsd4_ops[] = {
973 [OP_ACCESS] = { 1049 [OP_ACCESS] = {
974 .op_func = (nfsd4op_func)nfsd4_access, 1050 .op_func = (nfsd4op_func)nfsd4_access,
975 .op_name = "OP_ACCESS", 1051 .op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..b5348405046b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,67 +221,42 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
247 child = list_entry(dentries->next, struct dentry_list, list); 233 while (!list_empty(&names)) {
248 status = f(dir, child->dentry); 234 entry = list_entry(names.next, struct name_list, list);
235
236 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
237 if (IS_ERR(dentry)) {
238 status = PTR_ERR(dentry);
239 break;
240 }
241 status = f(dir, dentry);
242 dput(dentry);
249 if (status) 243 if (status)
250 goto out; 244 break;
251 list_del(&child->list); 245 list_del(&entry->list);
252 dput(child->dentry); 246 kfree(entry);
253 kfree(child);
254 } 247 }
248 mutex_unlock(&dir->d_inode->i_mutex);
255out: 249out:
256 while (!list_empty(dentries)) { 250 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 251 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 252 list_del(&entry->list);
259 dput(child->dentry); 253 kfree(entry);
260 kfree(child);
261 } 254 }
262 nfs4_reset_creds(original_cred); 255 nfs4_reset_creds(original_cred);
263 return status; 256 return status;
264} 257}
265 258
266static int 259static int
267nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
268{
269 int status;
270
271 if (!S_ISREG(dir->d_inode->i_mode)) {
272 printk("nfsd4: non-file found in client recovery directory\n");
273 return -EINVAL;
274 }
275 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
276 status = vfs_unlink(dir->d_inode, dentry);
277 mutex_unlock(&dir->d_inode->i_mutex);
278 return status;
279}
280
281static int
282nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
283{
284 int status;
285
286 /* For now this directory should already be empty, but we empty it of
287 * any regular files anyway, just in case the directory was created by
288 * a kernel from the future.... */
289 nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
290 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
291 status = vfs_rmdir(dir->d_inode, dentry);
292 mutex_unlock(&dir->d_inode->i_mutex);
293 return status;
294}
295
296static int
297nfsd4_unlink_clid_dir(char *name, int namlen) 260nfsd4_unlink_clid_dir(char *name, int namlen)
298{ 261{
299 struct dentry *dentry; 262 struct dentry *dentry;
@@ -301,20 +264,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
301 264
302 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 265 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
303 266
304 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 267 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
305 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 268 dentry = lookup_one_len(name, rec_dir.dentry, namlen);
306 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
307 if (IS_ERR(dentry)) { 269 if (IS_ERR(dentry)) {
308 status = PTR_ERR(dentry); 270 status = PTR_ERR(dentry);
309 return status; 271 goto out_unlock;
310 } 272 }
311 status = -ENOENT; 273 status = -ENOENT;
312 if (!dentry->d_inode) 274 if (!dentry->d_inode)
313 goto out; 275 goto out;
314 276 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
315 status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
316out: 277out:
317 dput(dentry); 278 dput(dentry);
279out_unlock:
280 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
318 return status; 281 return status;
319} 282}
320 283
@@ -353,10 +316,11 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 316{
354 int status; 317 int status;
355 318
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 319 /* note: we currently use this path only for minorversion 0 */
320 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 321 return 0;
358 322
359 status = nfsd4_clear_clid_dir(parent, child); 323 status = vfs_rmdir(parent->d_inode, child);
360 if (status) 324 if (status)
361 printk("failed to remove client recovery directory %s\n", 325 printk("failed to remove client recovery directory %s\n",
362 child->d_name.name); 326 child->d_name.name);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..3b711f5147a7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
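
Dropping the kref for a bare atomic_t lets the final put use atomic_dec_and_lock(), so uncontended puts never touch recall_lock; a minimal sketch of the same idiom with generic names (not the patch's actual types):

	static void put_obj(struct obj *o)
	{
		/* The lock is taken only when the count is about to hit zero. */
		if (atomic_dec_and_lock(&o->refcount, &table_lock)) {
			list_del(&o->hash);	/* unhash while still locked */
			spin_unlock(&table_lock);
			kfree(o);
		}
	}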
152 142
153static int num_delegations; 143static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,290 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
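
Reading the assignments above together, the 16-byte sessionid (NFS4_MAX_SESSIONID_LEN) comes out laid out as below; the byte offsets are inferred from the field types, not spelled out in the patch:

	bytes  0..7	clientid	(cl_boot, cl_id of the owning client)
	bytes  8..11	sequence	(current_sessionid++; also the hash key)
	bytes 12..15	reserved	(always zero)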
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
 423 * slot usage to make room for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
 466	/* Set the max response cached size to our default, which is
 467	 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
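
The "one piece" allocation relies on the usual trailing-array idiom; assuming se_slots is declared as a flexible array member at the end of struct nfsd4_session, the shape is:

	struct nfsd4_session {
		...
		struct nfsd4_slot	se_slots[];	/* slot table, allocated inline */
	};

	new = kzalloc(sizeof(*new) + nslots * sizeof(struct nfsd4_slot), GFP_KERNEL);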
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses);
584}
585
314static inline void 586static inline void
315renew_client(struct nfs4_client *clp) 587renew_client(struct nfs4_client *clp)
316{ 588{
@@ -330,8 +602,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 602{
331 if (clid->cl_boot == boot_time) 603 if (clid->cl_boot == boot_time)
332 return 0; 604 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 605 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 606 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 607 return 1;
336} 608}
337 609
@@ -376,6 +648,8 @@ static inline void
376free_client(struct nfs4_client *clp) 648free_client(struct nfs4_client *clp)
377{ 649{
378 shutdown_callback_client(clp); 650 shutdown_callback_client(clp);
651 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
652 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 653 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 654 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 655 kfree(clp->cl_principal);
@@ -420,7 +694,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 694 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 695 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 696 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 697 release_openowner(sop);
698 }
699 while (!list_empty(&clp->cl_sessions)) {
700 struct nfsd4_session *ses;
701 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
702 se_perclnt);
703 release_session(ses);
424 } 704 }
425 put_nfs4_client(clp); 705 put_nfs4_client(clp);
426} 706}
@@ -439,6 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 719 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 720 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 721 INIT_LIST_HEAD(&clp->cl_delegations);
722 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 723 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 724 return clp;
444} 725}
@@ -568,25 +849,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 849 return NULL;
569} 850}
570 851
852/*
853 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
 854 * parameter. Matching is based on the fact that at least one of the
855 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
856 *
857 * FIXME: we need to unify the clientid namespaces for nfsv4.x
858 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
859 * and SET_CLIENTID{,_CONFIRM}
860 */
861static inline int
862match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
863{
864 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
865 return use_exchange_id == has_exchange_flags;
866}
867
571static struct nfs4_client * 868static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 869find_confirmed_client_by_str(const char *dname, unsigned int hashval,
870 bool use_exchange_id)
573{ 871{
574 struct nfs4_client *clp; 872 struct nfs4_client *clp;
575 873
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 874 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 875 if (same_name(clp->cl_recdir, dname) &&
876 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 877 return clp;
579 } 878 }
580 return NULL; 879 return NULL;
581} 880}
582 881
583static struct nfs4_client * 882static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 883find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
884 bool use_exchange_id)
585{ 885{
586 struct nfs4_client *clp; 886 struct nfs4_client *clp;
587 887
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 888 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 889 if (same_name(clp->cl_recdir, dname) &&
890 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 891 return clp;
591 } 892 }
592 return NULL; 893 return NULL;
@@ -685,6 +986,534 @@ out_err:
685 return; 986 return;
686} 987}
687 988
989void
990nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
991{
992 struct nfsd4_compoundres *resp = rqstp->rq_resp;
993
994 resp->cstate.statp = statp;
995}
996
997/*
998 * Dereference the result pages.
999 */
1000static void
1001nfsd4_release_respages(struct page **respages, short resused)
1002{
1003 int i;
1004
1005 dprintk("--> %s\n", __func__);
1006 for (i = 0; i < resused; i++) {
1007 if (!respages[i])
1008 continue;
1009 put_page(respages[i]);
1010 respages[i] = NULL;
1011 }
1012}
1013
1014static void
1015nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
1016{
1017 int i;
1018
1019 for (i = 0; i < count; i++) {
1020 topages[i] = frompages[i];
1021 if (!topages[i])
1022 continue;
1023 get_page(topages[i]);
1024 }
1025}
1026
1027/*
1028 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
1029 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
1030 * length of the XDR response is less than se_fmaxresp_cached
 1031 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for
 1032 * part of the reply (e.g. readdir).
1033 *
 1034 * Store the base and length of the rq_res.head[0] page
1035 * of the NFSv4.1 data, just past the rpc header.
1036 */
1037void
1038nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1039{
1040 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1041 struct svc_rqst *rqstp = resp->rqstp;
1042 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1043 struct nfsd4_op *op = &args->ops[resp->opcnt];
1044 struct kvec *resv = &rqstp->rq_res.head[0];
1045
1046 dprintk("--> %s entry %p\n", __func__, entry);
1047
1048 /* Don't cache a failed OP_SEQUENCE. */
1049 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1050 return;
1051
1052 nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
1053 entry->ce_opcnt = resp->opcnt;
1054 entry->ce_status = resp->cstate.status;
1055
1056 /*
1057 * Don't need a page to cache just the sequence operation - the slot
1058 * does this for us!
1059 */
1060
1061 if (nfsd4_not_cached(resp)) {
1062 entry->ce_resused = 0;
1063 entry->ce_rpchdrlen = 0;
1064 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
1065 resp->cstate.slot->sl_cache_entry.ce_cachethis);
1066 return;
1067 }
1068 entry->ce_resused = rqstp->rq_resused;
1069 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1070 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1071 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1072 entry->ce_resused);
1073 entry->ce_datav.iov_base = resp->cstate.statp;
1074 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1075 (char *)page_address(rqstp->rq_respages[0]));
 1076	/* Current request rpc header length */
1077 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1078 (char *)page_address(rqstp->rq_respages[0]);
1079}
1080
1081/*
1082 * We keep the rpc header, but take the nfs reply from the replycache.
1083 */
1084static int
1085nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1086 struct nfsd4_cache_entry *entry)
1087{
1088 struct svc_rqst *rqstp = resp->rqstp;
1089 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1090 int len;
1091
 1092	/* Current request rpc header length */
1093 len = (char *)resp->cstate.statp -
1094 (char *)page_address(rqstp->rq_respages[0]);
1095 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1096 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1097 entry->ce_datav.iov_len);
1098 return 0;
1099 }
1100 /* copy the cached reply nfsd data past the current rpc header */
1101 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1102 entry->ce_datav.iov_len);
1103 resv->iov_len = len + entry->ce_datav.iov_len;
1104 return 1;
1105}
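
Net effect on the head iovec after a successful copy (sketch; len is the current rpc header length computed above):

	resv->iov_base
	|<-------- len -------->|<-- entry->ce_datav.iov_len -->|
	[ current rpc header    ][ cached NFSv4.1 reply data    ]
	resv->iov_len = len + entry->ce_datav.iov_len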
1106
1107/*
1108 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
 1109 * cached page. Replace any further replay pages from the cache.
1110 */
1111__be32
1112nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1113 struct nfsd4_sequence *seq)
1114{
1115 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1116 __be32 status;
1117
1118 dprintk("--> %s entry %p\n", __func__, entry);
1119
1120 /*
1121 * If this is just the sequence operation, we did not keep
1122 * a page in the cache entry because we can just use the
1123 * slot info stored in struct nfsd4_sequence that was checked
1124 * against the slot in nfsd4_sequence().
1125 *
1126 * This occurs when seq->cachethis is FALSE, or when the client
1127 * session inactivity timer fires and a solo sequence operation
1128 * is sent (lease renewal).
1129 */
1130 if (seq && nfsd4_not_cached(resp)) {
1131 seq->maxslots = resp->cstate.session->se_fnumslots;
1132 return nfs_ok;
1133 }
1134
1135 if (!nfsd41_copy_replay_data(resp, entry)) {
1136 /*
1137 * Not enough room to use the replay rpc header, send the
1138 * cached header. Release all the allocated result pages.
1139 */
1140 svc_free_res_pages(resp->rqstp);
1141 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1142 entry->ce_resused);
1143 } else {
1144 /* Release all but the first allocated result page */
1145
1146 resp->rqstp->rq_resused--;
1147 svc_free_res_pages(resp->rqstp);
1148
1149 nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
1150 &entry->ce_respages[1],
1151 entry->ce_resused - 1);
1152 }
1153
1154 resp->rqstp->rq_resused = entry->ce_resused;
1155 resp->opcnt = entry->ce_opcnt;
1156 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
1157 status = entry->ce_status;
1158
1159 return status;
1160}
1161
1162/*
1163 * Set the exchange_id flags returned by the server.
1164 */
1165static void
1166nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1167{
1168 /* pNFS is not supported */
1169 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1170
1171 /* Referrals are supported, Migration is not. */
1172 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1173
1174 /* set the wire flags to return to client. */
1175 clid->flags = new->cl_exchange_flags;
1176}
1177
1178__be32
1179nfsd4_exchange_id(struct svc_rqst *rqstp,
1180 struct nfsd4_compound_state *cstate,
1181 struct nfsd4_exchange_id *exid)
1182{
1183 struct nfs4_client *unconf, *conf, *new;
1184 int status;
1185 unsigned int strhashval;
1186 char dname[HEXDIR_LEN];
1187 nfs4_verifier verf = exid->verifier;
1188 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1189
1190 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1191 " ip_addr=%u flags %x, spa_how %d\n",
1192 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1193 ip_addr, exid->flags, exid->spa_how);
1194
1195 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1196 return nfserr_inval;
1197
1198 /* Currently only support SP4_NONE */
1199 switch (exid->spa_how) {
1200 case SP4_NONE:
1201 break;
1202 case SP4_SSV:
1203 return nfserr_encr_alg_unsupp;
1204 default:
1205 BUG(); /* checked by xdr code */
1206 case SP4_MACH_CRED:
1207 return nfserr_serverfault; /* no excuse :-/ */
1208 }
1209
1210 status = nfs4_make_rec_clidname(dname, &exid->clname);
1211
1212 if (status)
1213 goto error;
1214
1215 strhashval = clientstr_hashval(dname);
1216
1217 nfs4_lock_state();
1218 status = nfs_ok;
1219
1220 conf = find_confirmed_client_by_str(dname, strhashval, true);
1221 if (conf) {
1222 if (!same_verf(&verf, &conf->cl_verifier)) {
1223 /* 18.35.4 case 8 */
1224 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1225 status = nfserr_not_same;
1226 goto out;
1227 }
1228 /* Client reboot: destroy old state */
1229 expire_client(conf);
1230 goto out_new;
1231 }
1232 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1233 /* 18.35.4 case 9 */
1234 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1235 status = nfserr_perm;
1236 goto out;
1237 }
1238 expire_client(conf);
1239 goto out_new;
1240 }
1241 if (ip_addr != conf->cl_addr &&
1242 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1243 /* Client collision. 18.35.4 case 3 */
1244 status = nfserr_clid_inuse;
1245 goto out;
1246 }
1247 /*
1248 * Set bit when the owner id and verifier map to an already
1249 * confirmed client id (18.35.3).
1250 */
1251 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1252
1253 /*
1254 * Falling into 18.35.4 case 2, possible router replay.
1255 * Leave confirmed record intact and return same result.
1256 */
1257 copy_verf(conf, &verf);
1258 new = conf;
1259 goto out_copy;
1260 } else {
1261 /* 18.35.4 case 7 */
1262 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1263 status = nfserr_noent;
1264 goto out;
1265 }
1266 }
1267
1268 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
1269 if (unconf) {
1270 /*
1271 * Possible retry or client restart. Per 18.35.4 case 4,
1272 * a new unconfirmed record should be generated regardless
1273 * of whether any properties have changed.
1274 */
1275 expire_client(unconf);
1276 }
1277
1278out_new:
1279 /* Normal case */
1280 new = create_client(exid->clname, dname);
1281 if (new == NULL) {
1282 status = nfserr_resource;
1283 goto out;
1284 }
1285
1286 copy_verf(new, &verf);
1287 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1288 new->cl_addr = ip_addr;
1289 gen_clid(new);
1290 gen_confirm(new);
1291 add_to_unconfirmed(new, strhashval);
1292out_copy:
1293 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1294 exid->clientid.cl_id = new->cl_clientid.cl_id;
1295
1296 new->cl_slot.sl_seqid = 0;
1297 exid->seqid = 1;
1298 nfsd4_set_ex_flags(new, exid);
1299
1300 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1301 new->cl_slot.sl_seqid, new->cl_exchange_flags);
1302 status = nfs_ok;
1303
1304out:
1305 nfs4_unlock_state();
1306error:
1307 dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
1308 return status;
1309}
1310
1311static int
1312check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1313{
1314 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1315 slot->sl_seqid);
1316
1317 /* The slot is in use, and no response has been sent. */
1318 if (slot->sl_inuse) {
1319 if (seqid == slot->sl_seqid)
1320 return nfserr_jukebox;
1321 else
1322 return nfserr_seq_misordered;
1323 }
1324 /* Normal */
1325 if (likely(seqid == slot->sl_seqid + 1))
1326 return nfs_ok;
1327 /* Replay */
1328 if (seqid == slot->sl_seqid)
1329 return nfserr_replay_cache;
1330 /* Wraparound */
1331 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1332 return nfs_ok;
1333 /* Misordered replay or misordered new request */
1334 return nfserr_seq_misordered;
1335}
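
Worked through for a slot whose sl_seqid is 7 (hypothetical values), the checks above resolve as:

	seqid 8, slot idle		-> nfs_ok		(next request, in order)
	seqid 7, slot idle		-> nfserr_replay_cache	(retransmit; serve cached reply)
	seqid 7, slot in use		-> nfserr_jukebox	(original still in progress)
	seqid 1, sl_seqid 0xffffffff	-> nfs_ok		(wraparound)
	anything else			-> nfserr_seq_misordered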
1336
1337__be32
1338nfsd4_create_session(struct svc_rqst *rqstp,
1339 struct nfsd4_compound_state *cstate,
1340 struct nfsd4_create_session *cr_ses)
1341{
1342 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1343 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1344 struct nfs4_client *conf, *unconf;
1345 struct nfsd4_slot *slot = NULL;
1346 int status = 0;
1347
1348 nfs4_lock_state();
1349 unconf = find_unconfirmed_client(&cr_ses->clientid);
1350 conf = find_confirmed_client(&cr_ses->clientid);
1351
1352 if (conf) {
1353 slot = &conf->cl_slot;
1354 status = check_slot_seqid(cr_ses->seqid, slot);
1355 if (status == nfserr_replay_cache) {
1356 dprintk("Got a create_session replay! seqid= %d\n",
1357 slot->sl_seqid);
1358 cstate->slot = slot;
1359 cstate->status = status;
1360 /* Return the cached reply status */
1361 status = nfsd4_replay_cache_entry(resp, NULL);
1362 goto out;
1363 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
1364 status = nfserr_seq_misordered;
1365 dprintk("Sequence misordered!\n");
1366 dprintk("Expected seqid= %d but got seqid= %d\n",
1367 slot->sl_seqid, cr_ses->seqid);
1368 goto out;
1369 }
1370 conf->cl_slot.sl_seqid++;
1371 } else if (unconf) {
1372 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1373 (ip_addr != unconf->cl_addr)) {
1374 status = nfserr_clid_inuse;
1375 goto out;
1376 }
1377
1378 slot = &unconf->cl_slot;
1379 status = check_slot_seqid(cr_ses->seqid, slot);
1380 if (status) {
1381 /* an unconfirmed replay returns misordered */
1382 status = nfserr_seq_misordered;
1383 goto out;
1384 }
1385
1386 slot->sl_seqid++; /* from 0 to 1 */
1387 move_to_confirmed(unconf);
1388
1389 /*
1390 * We do not support RDMA or persistent sessions
1391 */
1392 cr_ses->flags &= ~SESSION4_PERSIST;
1393 cr_ses->flags &= ~SESSION4_RDMA;
1394
1395 conf = unconf;
1396 } else {
1397 status = nfserr_stale_clientid;
1398 goto out;
1399 }
1400
1401 status = alloc_init_session(rqstp, conf, cr_ses);
1402 if (status)
1403 goto out;
1404
1405 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1406 NFS4_MAX_SESSIONID_LEN);
1407 cr_ses->seqid = slot->sl_seqid;
1408
1409 slot->sl_inuse = true;
1410 cstate->slot = slot;
1411 /* Ensure a page is used for the cache */
1412 slot->sl_cache_entry.ce_cachethis = 1;
1413out:
1414 nfs4_unlock_state();
1415 dprintk("%s returns %d\n", __func__, ntohl(status));
1416 return status;
1417}
1418
1419__be32
1420nfsd4_destroy_session(struct svc_rqst *r,
1421 struct nfsd4_compound_state *cstate,
1422 struct nfsd4_destroy_session *sessionid)
1423{
1424 struct nfsd4_session *ses;
1425 u32 status = nfserr_badsession;
1426
1427 /* Notes:
 1428	 * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid
1429 * - Should we return nfserr_back_chan_busy if waiting for
1430 * callbacks on to-be-destroyed session?
1431 * - Do we need to clear any callback info from previous session?
1432 */
1433
1434 dump_sessionid(__func__, &sessionid->sessionid);
1435 spin_lock(&sessionid_lock);
1436 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1437 if (!ses) {
1438 spin_unlock(&sessionid_lock);
1439 goto out;
1440 }
1441
1442 unhash_session(ses);
1443 spin_unlock(&sessionid_lock);
1444
1445 /* wait for callbacks */
1446 shutdown_callback_client(ses->se_client);
1447 nfsd4_put_session(ses);
1448 status = nfs_ok;
1449out:
1450 dprintk("%s returns %d\n", __func__, ntohl(status));
1451 return status;
1452}
1453
1454__be32
1455nfsd4_sequence(struct svc_rqst *rqstp,
1456 struct nfsd4_compound_state *cstate,
1457 struct nfsd4_sequence *seq)
1458{
1459 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1460 struct nfsd4_session *session;
1461 struct nfsd4_slot *slot;
1462 int status;
1463
1464 if (resp->opcnt != 1)
1465 return nfserr_sequence_pos;
1466
1467 spin_lock(&sessionid_lock);
1468 status = nfserr_badsession;
1469 session = find_in_sessionid_hashtbl(&seq->sessionid);
1470 if (!session)
1471 goto out;
1472
1473 status = nfserr_badslot;
1474 if (seq->slotid >= session->se_fnumslots)
1475 goto out;
1476
1477 slot = &session->se_slots[seq->slotid];
1478 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1479
1480 status = check_slot_seqid(seq->seqid, slot);
1481 if (status == nfserr_replay_cache) {
1482 cstate->slot = slot;
1483 cstate->session = session;
1484 /* Return the cached reply status and set cstate->status
1485 * for nfsd4_svc_encode_compoundres processing */
1486 status = nfsd4_replay_cache_entry(resp, seq);
1487 cstate->status = nfserr_replay_cache;
1488 goto replay_cache;
1489 }
1490 if (status)
1491 goto out;
1492
1493 /* Success! bump slot seqid */
1494 slot->sl_inuse = true;
1495 slot->sl_seqid = seq->seqid;
1496 slot->sl_cache_entry.ce_cachethis = seq->cachethis;
1497 /* Always set the cache entry cachethis for solo sequence */
1498 if (nfsd4_is_solo_sequence(resp))
1499 slot->sl_cache_entry.ce_cachethis = 1;
1500
1501 cstate->slot = slot;
1502 cstate->session = session;
1503
1504replay_cache:
1505 /* Renew the clientid on success and on replay.
1506 * Hold a session reference until done processing the compound:
1507 * nfsd4_put_session called only if the cstate slot is set.
1508 */
1509 renew_client(session->se_client);
1510 nfsd4_get_session(session);
1511out:
1512 spin_unlock(&sessionid_lock);
1513 dprintk("%s: return %d\n", __func__, ntohl(status));
1514 return status;
1515}
1516
688__be32 1517__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1518nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1519 struct nfsd4_setclientid *setclid)
@@ -716,14 +1545,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1545 strhashval = clientstr_hashval(dname);
717 1546
718 nfs4_lock_state(); 1547 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1548 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1549 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1550 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1551 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1552 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1553 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1554 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1555 goto out;
728 } 1556 }
729 } 1557 }
@@ -732,7 +1560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1560 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1561 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1562 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1563 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1564 status = nfserr_resource;
737 if (!conf) { 1565 if (!conf) {
738 /* 1566 /*
@@ -887,7 +1715,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1715 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1716 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1717 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1718 hash, false);
891 if (conf) { 1719 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1720 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1721 expire_client(conf);
@@ -923,11 +1751,13 @@ alloc_init_file(struct inode *ino)
923 1751
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1752 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1753 if (fp) {
926 kref_init(&fp->fi_ref); 1754 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1755 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1756 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1757 INIT_LIST_HEAD(&fp->fi_delegations);
1758 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1759 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1760 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1761 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1762 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1763 fp->fi_had_conflict = false;
@@ -1037,48 +1867,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1867 return sop;
1038} 1868}
1039 1869
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1870static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1871init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1872 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1888,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1888 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1889 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1890 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1891 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1892 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1893 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1894 stp->st_openstp = NULL;
1106} 1895}
1107 1896
1108static void 1897static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1898move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1899{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1900 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1931,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1931 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1932 struct nfs4_file *fp;
1162 1933
1934 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1935 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1936 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1937 get_nfs4_file(fp);
1938 spin_unlock(&recall_lock);
1166 return fp; 1939 return fp;
1167 } 1940 }
1168 } 1941 }
1942 spin_unlock(&recall_lock);
1169 return NULL; 1943 return NULL;
1170} 1944}
1171 1945
1172static inline int access_valid(u32 x) 1946static inline int access_valid(u32 x, u32 minorversion)
1173{ 1947{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1948 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1949 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1950 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1951 return 0;
1952 x &= ~NFS4_SHARE_ACCESS_MASK;
1953 if (minorversion && x) {
1954 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1955 return 0;
1956 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1957 return 0;
1958 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1959 }
1960 if (x)
1177 return 0; 1961 return 0;
1178 return 1; 1962 return 1;
1179} 1963}
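
For reference, the v4.1 share_access word that access_valid() unpacks carries three fields; the values below are the standard RFC 5661 constants, assumed rather than quoted from this patch:

	NFS4_SHARE_ACCESS_MASK (0x000f):  READ=0x1, WRITE=0x2, BOTH=0x3
	NFS4_SHARE_WANT_MASK   (0xff00):  NO_PREFERENCE=0x0000 .. WANT_CANCEL=0x0500
	NFS4_SHARE_WHEN_MASK   (0xf0000): SIGNAL_DELEG_WHEN_RESRC_AVAIL=0x10000,
					  PUSH_DELEG_WHEN_UNCONTENDED=0x20000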
@@ -1409,7 +2193,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2193
1410 2194
1411__be32 2195__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2196nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2197 struct nfsd4_open *open)
1413{ 2198{
1414 clientid_t *clientid = &open->op_clientid; 2199 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2200 struct nfs4_client *clp = NULL;
@@ -1432,10 +2217,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2217 return nfserr_expired;
1433 goto renew; 2218 goto renew;
1434 } 2219 }
2220 /* When sessions are used, skip open sequenceid processing */
2221 if (nfsd4_has_session(cstate))
2222 goto renew;
1435 if (!sop->so_confirmed) { 2223 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2224 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2225 clp = sop->so_client;
1438 release_stateowner(sop); 2226 release_openowner(sop);
1439 open->op_stateowner = NULL; 2227 open->op_stateowner = NULL;
1440 goto renew; 2228 goto renew;
1441 } 2229 }
@@ -1709,6 +2497,7 @@ out:
1709__be32 2497__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2498nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2499{
2500 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2501 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2502 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2503 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2505,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2505 __be32 status;
1717 2506
1718 status = nfserr_inval; 2507 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2508 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2509 || !deny_valid(open->op_share_deny))
1721 goto out; 2510 goto out;
1722 /* 2511 /*
@@ -1764,12 +2553,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2553 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2554 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2555 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2556 release_open_stateid(stp);
1768 goto out; 2557 goto out;
1769 } 2558 }
2559 if (nfsd4_has_session(&resp->cstate))
2560 update_stateid(&stp->st_stateid);
1770 } 2561 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2562 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2563
2564 if (nfsd4_has_session(&resp->cstate))
2565 open->op_stateowner->so_confirmed = 1;
2566
1773 /* 2567 /*
1774 * Attempt to hand out a delegation. No error return, because the 2568 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2569 * OPEN succeeds even if we fail.
@@ -1790,7 +2584,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2584 * To finish the open response, we just need to set the rflags.
1791 */ 2585 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2586 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2587 if (!open->op_stateowner->so_confirmed &&
2588 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2589 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2590
1796 return status; 2591 return status;
@@ -1898,7 +2693,7 @@ nfs4_laundromat(void)
1898 } 2693 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2694 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2695 sop->so_id);
1901 release_stateowner(sop); 2696 release_openowner(sop);
1902 } 2697 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2698 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2699 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2778,7 @@ out:
1983static inline __be32 2778static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2779check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2780{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2781 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2782 return nfs_ok;
1991 else if (locks_in_grace()) { 2783 else if (locks_in_grace()) {
 1992		/* Answer in remaining cases depends on existence of 2784
@@ -2005,14 +2797,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2797 * that are not able to provide mandatory locking.
2006 */ 2798 */
2007static inline int 2799static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2800grace_disallows_io(struct inode *inode)
2009{ 2801{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2802 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2803}
2013 2804
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2805static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2806{
2807 /*
2808 * When sessions are used the stateid generation number is ignored
2809 * when it is zero.
2810 */
2811 if ((flags & HAS_SESSION) && in->si_generation == 0)
2812 goto out;
2813
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2814 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2815 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2816 return nfserr_bad_stateid;
@@ -2028,74 +2826,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2826 */
2029 if (in->si_generation < ref->si_generation) 2827 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2828 return nfserr_old_stateid;
2829out:
2031 return nfs_ok; 2830 return nfs_ok;
2032} 2831}
2033 2832
2833static int is_delegation_stateid(stateid_t *stateid)
2834{
2835 return stateid->si_fileid == 0;
2836}
2837
2034/* 2838/*
2035* Checks for stateid operations 2839* Checks for stateid operations
2036*/ 2840*/
2037__be32 2841__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2842nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2843 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2844{
2040 struct nfs4_stateid *stp = NULL; 2845 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2846 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2847 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2848 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2849 __be32 status;
2045 2850
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2851 if (filpp)
2050 *filpp = NULL; 2852 *filpp = NULL;
2051 2853
2052 if (io_during_grace_disallowed(ino, flags)) 2854 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2855 return nfserr_grace;
2054 2856
2857 if (nfsd4_has_session(cstate))
2858 flags |= HAS_SESSION;
2859
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2860 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2861 return check_special_stateids(current_fh, stateid, flags);
2057 2862
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2863 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2864 if (STALE_STATEID(stateid))
2061 goto out; 2865 goto out;
2062 2866
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2867 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2868 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2869 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2870 if (!dp)
2068 goto out; 2871 goto out;
2069 } 2872 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2873 flags);
2874 if (status)
2875 goto out;
2876 status = nfs4_check_delegmode(dp, flags);
2877 if (status)
2878 goto out;
2879 renew_client(dp->dl_client);
2880 if (filpp)
2881 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2882 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2883 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2884 if (!stp)
2074 goto out; 2885 goto out;
2075 } 2886 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2887 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2888 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2889 goto out;
2080 stidp = &stp->st_stateid; 2890 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2891 flags);
2082 status = check_stateid_generation(stateid, stidp); 2892 if (status)
2083 if (status) 2893 goto out;
2084 goto out; 2894 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2895 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2896 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2897 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2898 if (filpp)
2090 *filpp = stp->st_vfs_file; 2899 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2900 }
2100 status = nfs_ok; 2901 status = nfs_ok;
2101out: 2902out:
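
The rewritten nfs4_preprocess_stateid_op splits the delegation and open/lock paths cleanly and threads a HAS_SESSION flag down into the generation check, where a zero si_generation from a v4.1 client means "use the current stateid". A standalone model of that generation rule (the struct and error values are simplified stand-ins for stateid_t and the nfserr_* codes):

    #include <stdint.h>

    enum { HAS_SESSION = 0x01 };            /* stand-in for nfsd's flag */
    enum { NFS_OK = 0, BAD_STATEID = 10025, OLD_STATEID = 10024 };

    struct stateid { uint32_t si_generation; };   /* simplified stateid_t */

    /* Model of check_stateid_generation() after this hunk: a v4.1
     * client may send generation 0 to mean "whatever is current". */
    static int check_generation(const struct stateid *in,
                                const struct stateid *ref, int flags)
    {
        if ((flags & HAS_SESSION) && in->si_generation == 0)
            return NFS_OK;
        if (in->si_generation > ref->si_generation)
            return BAD_STATEID;   /* stateid from the future: client bug */
        if (in->si_generation < ref->si_generation)
            return OLD_STATEID;   /* stateid has since been replaced */
        return NFS_OK;
    }
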
@@ -2113,10 +2914,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2914 * Checks for sequence id mutating operations.
2114 */ 2915 */
2115static __be32 2916static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2917nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2918 stateid_t *stateid, int flags,
2919 struct nfs4_stateowner **sopp,
2920 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2921{
2118 struct nfs4_stateid *stp; 2922 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2923 struct nfs4_stateowner *sop;
2924 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2925 __be32 status;
2121 2926
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2927 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2939,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2939
2135 if (STALE_STATEID(stateid)) 2940 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2941 return nfserr_stale_stateid;
2942
2943 if (nfsd4_has_session(cstate))
2944 flags |= HAS_SESSION;
2945
2137 /* 2946 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2947 * We return BAD_STATEID if filehandle doesn't match stateid,
2139 * the confirmed flag is incorrectly set, or the generation 2948
@@ -2166,8 +2975,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2975 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2976 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2977 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2978 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2979 !same_clid(&clp->cl_clientid, lockclid))
2980 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2981 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2982 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2983 if (status)
@@ -2190,7 +3000,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3000 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3001 * generation number wraparound.
2192 */ 3002 */
2193 if (seqid != sop->so_seqid) 3003 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3004 goto check_replay;
2195 3005
2196 if (sop->so_confirmed && flags & CONFIRM) { 3006 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3013,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3013 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3014 return nfserr_bad_stateid;
2205 } 3015 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3016 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3017 if (status)
2208 return status; 3018 return status;
2209 renew_client(sop->so_client); 3019 renew_client(sop->so_client);
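
In the seqid-mutating path the same HAS_SESSION flag disables both the client-id cross-check for new lock owners and the per-owner seqid replay check: under sessions, replay detection belongs to the slot table rather than to owner seqids. A one-line model of the guard (names are illustrative):

    #include <stdint.h>

    enum { HAS_SESSION = 0x01 };

    /* Model: v4.0 serializes seqid-mutating ops through a per-owner
     * counter; under sessions the slot table already provides
     * exactly-once semantics, so the counter comparison is skipped. */
    static int seqid_is_replay(uint32_t seqid, uint32_t so_seqid, int flags)
    {
        if (flags & HAS_SESSION)
            return 0;
        return seqid != so_seqid;
    }
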
@@ -2239,7 +3049,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3049
2240 nfs4_lock_state(); 3050 nfs4_lock_state();
2241 3051
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3052 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3053 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3054 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3055 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3114,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3114 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3115 cstate->current_fh.fh_dentry->d_name.name);
2306 3116
2307 if (!access_valid(od->od_share_access) 3117 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3118 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3119 return nfserr_inval;
2310 3120
2311 nfs4_lock_state(); 3121 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3122 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3123 od->od_seqid,
2314 &od->od_stateid, 3124 &od->od_stateid,
2315 OPEN_STATE, 3125 OPEN_STATE,
@@ -2362,7 +3172,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3172
2363 nfs4_lock_state(); 3173 nfs4_lock_state();
2364 /* check close_lru for replay */ 3174 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3175 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3176 close->cl_seqid,
2367 &close->cl_stateid, 3177 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3178 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3183,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3183 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3184
2375 /* release_stateid() calls nfsd_close() if needed */ 3185 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3186 release_open_stateid(stp);
2377 3187
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3188 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3189 * released by the laundromat service after the lease period
@@ -2394,16 +3204,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3204nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3205 struct nfsd4_delegreturn *dr)
2396{ 3206{
3207 struct nfs4_delegation *dp;
3208 stateid_t *stateid = &dr->dr_stateid;
3209 struct inode *inode;
2397 __be32 status; 3210 __be32 status;
3211 int flags = 0;
2398 3212
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3213 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3214 return status;
3215 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3216
3217 if (nfsd4_has_session(cstate))
3218 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3219 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3220 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3221 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3222 goto out;
3223 status = nfserr_stale_stateid;
3224 if (STALE_STATEID(stateid))
3225 goto out;
3226 status = nfserr_bad_stateid;
3227 if (!is_delegation_stateid(stateid))
3228 goto out;
3229 dp = find_delegation_stateid(inode, stateid);
3230 if (!dp)
3231 goto out;
3232 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3233 if (status)
3234 goto out;
3235 renew_client(dp->dl_client);
3236
3237 unhash_delegation(dp);
2406out: 3238out:
3239 nfs4_unlock_state();
3240
2407 return status; 3241 return status;
2408} 3242}
2409 3243
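
DELEGRETURN no longer funnels through nfs4_preprocess_stateid_op (the old DELEG_RET flag is gone); it open-codes the special/stale/type checks and unhashes the delegation itself, all under nfs4_lock_state(). The type test relies on how nfsd mints stateids: delegation stateids carry a zero si_fileid. A standalone model, with the field order following the dprintk removed earlier in this diff:

    #include <stdint.h>

    /* Simplified stateid_t */
    struct stateid4 {
        uint32_t si_boot;
        uint32_t si_stateownerid;
        uint32_t si_fileid;
        uint32_t si_generation;
    };

    /* Model of is_delegation_stateid(): nfsd mints delegation stateids
     * with a zero file id, so the type test is one field compare. */
    static int is_delegation(const struct stateid4 *sid)
    {
        return sid->si_fileid == 0;
    }
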
@@ -2684,11 +3518,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3518 struct nfs4_file *fp;
2685 3519
2686 status = nfserr_stale_clientid; 3520 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3521 if (!nfsd4_has_session(cstate) &&
3522 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3523 goto out;
2689 3524
2690 /* validate and update open stateid and open seqid */ 3525 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3526 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3527 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3528 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3529 OPEN_STATE,
@@ -2715,7 +3550,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3550 goto out;
2716 } else { 3551 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3552 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3553 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3554 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3555 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3556 LOCK_STATE,
@@ -2788,7 +3623,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3623 }
2789out: 3624out:
2790 if (status && lock->lk_is_new && lock_sop) 3625 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3626 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3627 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3628 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3629 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3673,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3673 nfs4_lock_state();
2839 3674
2840 status = nfserr_stale_clientid; 3675 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3676 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3677 goto out;
2843 3678
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3679 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3746,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3746
2912 nfs4_lock_state(); 3747 nfs4_lock_state();
2913 3748
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3749 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3750 locku->lu_seqid,
2916 &locku->lu_stateid, 3751 &locku->lu_stateid,
2917 LOCK_STATE, 3752 LOCK_STATE,
@@ -3037,7 +3872,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3872 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3873 * for openowners. */
3039 list_del(&sop->so_perclient); 3874 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3875 release_lockowner(sop);
3041 } 3876 }
3042out: 3877out:
3043 nfs4_unlock_state(); 3878 nfs4_unlock_state();
@@ -3051,12 +3886,12 @@ alloc_reclaim(void)
3051} 3886}
3052 3887
3053int 3888int
3054nfs4_has_reclaimed_state(const char *name) 3889nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3890{
3056 unsigned int strhashval = clientstr_hashval(name); 3891 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3892 struct nfs4_client *clp;
3058 3893
3059 clp = find_confirmed_client_by_str(name, strhashval); 3894 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3895 return clp ? 1 : 0;
3061} 3896}
3062 3897
@@ -3153,6 +3988,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3988 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3990 }
3991 for (i = 0; i < SESSION_HASH_SIZE; i++)
3992 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3993 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3994 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3995 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b73549d293be 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
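
The SEQUENCE decoder can pull its whole argument list with one READ_BUF because every field of SEQUENCE4args is fixed-size: a 16-byte session id (NFS4_MAX_SESSIONID_LEN) followed by four 4-byte words, 32 bytes total. A sketch of the wire image (field order per the NFSv4.1 spec; XDR big-endian conversion elided):

    #include <stdint.h>

    #define NFS4_MAX_SESSIONID_LEN 16   /* fixed-size opaque per the spec */

    /* Wire image of SEQUENCE4args: all fixed-size, hence the single
     * READ_BUF(NFS4_MAX_SESSIONID_LEN + 16) == 32 bytes above. */
    struct sequence4args_wire {
        uint8_t  sessionid[NFS4_MAX_SESSIONID_LEN];
        uint32_t seqid;
        uint32_t slotid;
        uint32_t maxslots;
        uint32_t cachethis;   /* XDR bool */
    };
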
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
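
Versioned decode tables mean operation filtering happens entirely at decode time: the compound's minorversion selects a whole decoder vector, and v4.0-only ops (OPEN_CONFIRM, RENEW, SETCLIENTID, RELEASE_LOCKOWNER, ...) map to nfsd4_decode_notsupp in the v4.1 table, never reaching the state machinery. That is also why a single shared encode vector suffices later on. A minimal model of the lookup (types simplified; the NULL return stands in for "op illegal"):

    #include <stddef.h>

    typedef int (*dec_fn)(void *argp, void *op);

    struct minorversion_ops {
        const dec_fn *decoders;
        size_t nops;
    };

    /* Model of the dispatch: minorversion picks an entire vector, so
     * unknown minorversions and out-of-range ops fail up front. */
    static dec_fn lookup_decoder(const struct minorversion_ops *tbl,
                                 size_t ntbl, unsigned int minorversion,
                                 unsigned int opnum)
    {
        if (minorversion >= ntbl || opnum >= tbl[minorversion].nops)
            return NULL;
        return tbl[minorversion].decoders[opnum];
    }
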
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -1843,6 +2214,15 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1843 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 2214 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1844 if (IS_ERR(dentry)) 2215 if (IS_ERR(dentry))
1845 return nfserrno(PTR_ERR(dentry)); 2216 return nfserrno(PTR_ERR(dentry));
2217 if (!dentry->d_inode) {
2218 /*
2219 * nfsd_buffered_readdir drops the i_mutex between
2220 * readdir and calling this callback, leaving a window
2221 * where this directory entry could have gone away.
2222 */
2223 dput(dentry);
2224 return nfserr_noent;
2225 }
1846 2226
1847 exp_get(exp); 2227 exp_get(exp);
1848 /* 2228 /*
@@ -1905,6 +2285,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1905 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); 2285 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
1906 int buflen; 2286 int buflen;
1907 __be32 *p = cd->buffer; 2287 __be32 *p = cd->buffer;
2288 __be32 *cookiep;
1908 __be32 nfserr = nfserr_toosmall; 2289 __be32 nfserr = nfserr_toosmall;
1909 2290
1910 /* In nfsv4, "." and ".." never make it onto the wire.. */ 2291 /* In nfsv4, "." and ".." never make it onto the wire.. */
@@ -1921,7 +2302,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1921 goto fail; 2302 goto fail;
1922 2303
1923 *p++ = xdr_one; /* mark entry present */ 2304 *p++ = xdr_one; /* mark entry present */
1924 cd->offset = p; /* remember pointer */ 2305 cookiep = p;
1925 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ 2306 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
1926 p = xdr_encode_array(p, name, namlen); /* name length & name */ 2307 p = xdr_encode_array(p, name, namlen); /* name length & name */
1927 2308
@@ -1935,6 +2316,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1935 goto fail; 2316 goto fail;
1936 case nfserr_dropit: 2317 case nfserr_dropit:
1937 goto fail; 2318 goto fail;
2319 case nfserr_noent:
2320 goto skip_entry;
1938 default: 2321 default:
1939 /* 2322 /*
1940 * If the client requested the RDATTR_ERROR attribute, 2323 * If the client requested the RDATTR_ERROR attribute,
@@ -1953,6 +2336,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1953 } 2336 }
1954 cd->buflen -= (p - cd->buffer); 2337 cd->buflen -= (p - cd->buffer);
1955 cd->buffer = p; 2338 cd->buffer = p;
2339 cd->offset = cookiep;
2340skip_entry:
1956 cd->common.err = nfs_ok; 2341 cd->common.err = nfs_ok;
1957 return 0; 2342 return 0;
1958fail: 2343fail:
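
The readdir change turns cookie publication into a commit step: the cookie slot is staged in cookiep and only stored to cd->offset after the attributes encode cleanly, so an entry whose dentry vanished in the i_mutex window (nfserr_noent) is silently skipped instead of failing the whole READDIR. A rough model of the commit-on-success pattern (structure heavily simplified):

    #include <stdint.h>

    struct dirlist {
        uint32_t *buffer;   /* next free word */
        int buflen;         /* words remaining */
        uint32_t *offset;   /* last committed cookie slot */
    };

    enum { ATTR_OK = 0, ATTR_NOENT = 1 };

    /* Model: the entry was encoded tentatively up to entry_end; on
     * ATTR_NOENT nothing is committed, so the next entry overwrites
     * the staged one. */
    static void finish_entry(struct dirlist *cd, uint32_t *entry_end,
                             uint32_t *cookiep, int attr_status)
    {
        if (attr_status == ATTR_NOENT)
            return;                       /* skip_entry */
        cd->buflen -= (int)(entry_end - cd->buffer);
        cd->buffer = entry_end;
        cd->offset = cookiep;   /* publish cookie only on success */
    }
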
@@ -2572,6 +2957,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2957}
2573 2958
2574static __be32 2959static __be32
2960nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2961 struct nfsd4_exchange_id *exid)
2962{
2963 ENCODE_HEAD;
2964 char *major_id;
2965 char *server_scope;
2966 int major_id_sz;
2967 int server_scope_sz;
2968 uint64_t minor_id = 0;
2969
2970 if (nfserr)
2971 return nfserr;
2972
2973 major_id = utsname()->nodename;
2974 major_id_sz = strlen(major_id);
2975 server_scope = utsname()->nodename;
2976 server_scope_sz = strlen(server_scope);
2977
2978 RESERVE_SPACE(
2979 8 /* eir_clientid */ +
2980 4 /* eir_sequenceid */ +
2981 4 /* eir_flags */ +
2982 4 /* spr_how (SP4_NONE) */ +
2983 8 /* so_minor_id */ +
2984 4 /* so_major_id.len */ +
2985 (XDR_QUADLEN(major_id_sz) * 4) +
2986 4 /* eir_server_scope.len */ +
2987 (XDR_QUADLEN(server_scope_sz) * 4) +
2988 4 /* eir_server_impl_id.count (0) */);
2989
2990 WRITEMEM(&exid->clientid, 8);
2991 WRITE32(exid->seqid);
2992 WRITE32(exid->flags);
2993
2994 /* state_protect4_r. Currently only support SP4_NONE */
2995 BUG_ON(exid->spa_how != SP4_NONE);
2996 WRITE32(exid->spa_how);
2997
2998 /* The server_owner struct */
2999 WRITE64(minor_id); /* Minor id */
3000 /* major id */
3001 WRITE32(major_id_sz);
3002 WRITEMEM(major_id, major_id_sz);
3003
3004 /* Server scope */
3005 WRITE32(server_scope_sz);
3006 WRITEMEM(server_scope, server_scope_sz);
3007
3008 /* Implementation id */
3009 WRITE32(0); /* zero length nfs_impl_id4 array */
3010 ADJUST_ARGS();
3011 return 0;
3012}
3013
3014static __be32
3015nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3016 struct nfsd4_create_session *sess)
3017{
3018 ENCODE_HEAD;
3019
3020 if (nfserr)
3021 return nfserr;
3022
3023 RESERVE_SPACE(24);
3024 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3025 WRITE32(sess->seqid);
3026 WRITE32(sess->flags);
3027 ADJUST_ARGS();
3028
3029 RESERVE_SPACE(28);
3030 WRITE32(0); /* headerpadsz */
3031 WRITE32(sess->fore_channel.maxreq_sz);
3032 WRITE32(sess->fore_channel.maxresp_sz);
3033 WRITE32(sess->fore_channel.maxresp_cached);
3034 WRITE32(sess->fore_channel.maxops);
3035 WRITE32(sess->fore_channel.maxreqs);
3036 WRITE32(sess->fore_channel.nr_rdma_attrs);
3037 ADJUST_ARGS();
3038
3039 if (sess->fore_channel.nr_rdma_attrs) {
3040 RESERVE_SPACE(4);
3041 WRITE32(sess->fore_channel.rdma_attrs);
3042 ADJUST_ARGS();
3043 }
3044
3045 RESERVE_SPACE(28);
3046 WRITE32(0); /* headerpadsz */
3047 WRITE32(sess->back_channel.maxreq_sz);
3048 WRITE32(sess->back_channel.maxresp_sz);
3049 WRITE32(sess->back_channel.maxresp_cached);
3050 WRITE32(sess->back_channel.maxops);
3051 WRITE32(sess->back_channel.maxreqs);
3052 WRITE32(sess->back_channel.nr_rdma_attrs);
3053 ADJUST_ARGS();
3054
3055 if (sess->back_channel.nr_rdma_attrs) {
3056 RESERVE_SPACE(4);
3057 WRITE32(sess->back_channel.rdma_attrs);
3058 ADJUST_ARGS();
3059 }
3060 return 0;
3061}
3062
3063static __be32
3064nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3065 struct nfsd4_destroy_session *destroy_session)
3066{
3067 return nfserr;
3068}
3069
3070__be32
3071nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3072 struct nfsd4_sequence *seq)
3073{
3074 ENCODE_HEAD;
3075
3076 if (nfserr)
3077 return nfserr;
3078
3079 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3080 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3081 WRITE32(seq->seqid);
3082 WRITE32(seq->slotid);
3083 WRITE32(seq->maxslots);
3084 /*
3085 * FIXME: for now:
3086 * target_maxslots = maxslots
3087 * status_flags = 0
3088 */
3089 WRITE32(seq->maxslots);
3090 WRITE32(0);
3091
3092 ADJUST_ARGS();
3093 return 0;
3094}
3095
3096static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3097nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3098{
2577 return nfserr; 3099 return nfserr;
@@ -2579,6 +3101,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3101
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3102typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3103
3104/*
3105 * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
3106 * since we don't need to filter out obsolete ops as this is
3107 * done in the decoding phase.
3108 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3109static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3110 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3111 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
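
The RESERVE_SPACE arithmetic in nfsd4_encode_exchange_id above is worth making concrete. For a hypothetical 10-byte nodename such as "nfs-server", each opaque rounds up to 12 bytes on the wire (XDR_QUADLEN pads to 4-byte units), giving 64 bytes total. A runnable check of that sum:

    #include <stdio.h>
    #include <string.h>

    #define XDR_QUADLEN(n) (((n) + 3) >> 2)  /* length in 4-byte XDR units */

    int main(void)
    {
        size_t name = strlen("nfs-server");  /* hypothetical nodename, 10 */
        size_t bytes = 8                     /* eir_clientid */
                     + 4                     /* eir_sequenceid */
                     + 4                     /* eir_flags */
                     + 4                     /* spr_how (SP4_NONE) */
                     + 8                     /* so_minor_id */
                     + 4 + XDR_QUADLEN(name) * 4  /* so_major_id */
                     + 4 + XDR_QUADLEN(name) * 4  /* eir_server_scope */
                     + 4;                    /* impl_id array count (0) */
        printf("%zu bytes reserved\n", bytes);    /* 64 */
        return 0;
    }
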
@@ -2617,8 +3144,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3144 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3145 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3146 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3147
3148 /* NFSv4.1 operations */
3149 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3152 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3153 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3154 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3158 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3159 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3161 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3162 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3163 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3164 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3165 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3166 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3167 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3168};
2621 3169
3170/*
3171 * Calculate the total amount of memory that the compound response has taken
3172 * after encoding the current operation.
3173 *
3174 * pad: add on 8 bytes for the next operation's op_code and status so that
3175 * there is room to cache a failure on the next operation.
3176 *
3177 * Compare this length to the session se_fmaxresp_cached.
3178 *
3179 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3180 * will be at least a page and will therefore hold the xdr_buf head.
3181 */
3182static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3183{
3184 int status = 0;
3185 struct xdr_buf *xb = &resp->rqstp->rq_res;
3186 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3187 struct nfsd4_session *session = NULL;
3188 struct nfsd4_slot *slot = resp->cstate.slot;
3189 u32 length, tlen = 0, pad = 8;
3190
3191 if (!nfsd4_has_session(&resp->cstate))
3192 return status;
3193
3194 session = resp->cstate.session;
3195 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3196 return status;
3197
3198 if (resp->opcnt >= args->opcnt)
3199 pad = 0; /* this is the last operation */
3200
3201 if (xb->page_len == 0) {
3202 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3203 } else {
3204 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3205 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3206
3207 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3208 }
3209 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3210 length, xb->page_len, tlen, pad);
3211
3212 if (length <= session->se_fmaxresp_cached)
3213 return status;
3214 else
3215 return nfserr_rep_too_big_to_cache;
3216}
3217
2622void 3218void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3219nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3220{
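
nfsd4_check_drc_limit totals what the compound reply occupies so far -- head bytes, full pages, and any partial tail -- plus an 8-byte pad reserving room for the next op's code and status, then compares that against the session's negotiated se_fmaxresp_cached; exceeding it fails the op with nfserr_rep_too_big_to_cache. A standalone model of the accounting (xdr_buf plumbing omitted):

    #include <stddef.h>

    /* Model: the reply spans the head iovec, any full pages, and a
     * partial tail; pad reserves 8 bytes for the next op's code and
     * status unless this was the compound's last op. */
    static size_t cached_reply_len(size_t head_used, size_t page_len,
                                   size_t tail_used, int is_last_op)
    {
        size_t pad = is_last_op ? 0 : 8;

        return head_used + page_len + tail_used + pad;
    }
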
@@ -2635,6 +3231,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3231 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3232 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3233 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3234 /* nfsd4_check_drc_limit guarantees enough room for error status */
3235 if (!op->status && nfsd4_check_drc_limit(resp))
3236 op->status = nfserr_rep_too_big_to_cache;
2638status: 3237status:
2639 /* 3238 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3239 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3334,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3334 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3335 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3336 BUG_ON(iov->iov_len > PAGE_SIZE);
3337 if (nfsd4_has_session(&resp->cstate)) {
3338 if (resp->cstate.status == nfserr_replay_cache &&
3339 !nfsd4_not_cached(resp)) {
3340 iov->iov_len = resp->cstate.iovlen;
3341 } else {
3342 nfsd4_store_cache_entry(resp);
3343 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3344 resp->cstate.slot->sl_inuse = 0;
3345 }
3346 if (resp->cstate.session)
3347 nfsd4_put_session(resp->cstate.session);
3348 }
2738 return 1; 3349 return 1;
2739} 3350}
2740 3351
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
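
With this parsing change the nfsd "versions" control file accepts dotted tokens: writing "+4.1" enables the minorversion and "-4.1" clears it, while plain "+4"/"-4" still flips NFSv4 as a whole (majors below 4 and minor 0 are rejected with -EINVAL). A user-space model of the new token path (strtol/strtoul stand in for the kernel's simple_strtol/simple_strtoul):

    #include <stdio.h>
    #include <stdlib.h>

    /* Model: for "+4.1", strtol consumes the "4" and leaves minorp at
     * ".1"; the digits after the dot become the minorversion to set
     * (or clear, for a '-' sign). */
    int main(void)
    {
        const char *vers = "+4.1";
        char sign = *vers;
        char *minorp;
        long num = strtol(vers + 1, &minorp, 0);            /* 4 */

        if (num >= 4 && *minorp == '.') {
            unsigned long minor = strtoul(minorp + 1, NULL, 0);  /* 1 */
            printf("%s minorversion %lu\n",
                   sign == '-' ? "clear" : "set", minor);
        }
        return 0;
    }
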
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -938,10 +969,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
938 char transport[16]; 969 char transport[16];
939 int port; 970 int port;
940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 971 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
972 if (port < 1 || port > 65535)
973 return -EINVAL;
941 err = nfsd_create_serv(); 974 err = nfsd_create_serv();
942 if (!err) { 975 if (!err) {
943 err = svc_create_xprt(nfsd_serv, 976 err = svc_create_xprt(nfsd_serv,
944 transport, port, 977 transport, PF_INET, port,
945 SVC_SOCK_ANONYMOUS); 978 SVC_SOCK_ANONYMOUS);
946 if (err == -ENOENT) 979 if (err == -ENOENT)
947 /* Give a reasonable perror msg for 980 /* Give a reasonable perror msg for
@@ -960,7 +993,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
960 char transport[16]; 993 char transport[16];
961 int port; 994 int port;
962 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 995 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
963 if (port == 0) 996 if (port < 1 || port > 65535)
964 return -EINVAL; 997 return -EINVAL;
965 if (nfsd_serv) { 998 if (nfsd_serv) {
966 xprt = svc_find_xprt(nfsd_serv, transport, 999 xprt = svc_find_xprt(nfsd_serv, transport,
@@ -1246,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1246 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1247 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1248 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1249 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1251 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
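
Note on the hunk above: nfsd_proc_write() now passes the byte count by reference (&cnt) instead of by value; together with the fs/nfsd/vfs.c hunk further down (where nfsd_vfs_write() does *cnt = host_err), the caller can learn how many bytes vfs_writev() actually wrote. A minimal user-space sketch of that in/out-count pattern; do_write() and its names are illustrative, not taken from this patch:

    #include <sys/uio.h>
    #include <errno.h>

    /* Caller passes the requested length in *cnt; on success *cnt is
     * overwritten with the number of bytes actually written, mirroring
     * what nfsd_vfs_write() now does with host_err. */
    static int do_write(int fd, const struct iovec *vec, int vlen,
                        unsigned long *cnt)
    {
            ssize_t n = writev(fd, vec, vlen);  /* may be a short write */

            if (n < 0)
                    return -errno;
            *cnt = (unsigned long)n;
            return 0;
    }
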
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
226 * Each session guarantees a negotiated per-slot memory cache for replies,
227 * which in turn consumes memory beyond what a v2/v3/v4.0 server needs. A
228 * dedicated NFSv4.1 server might want to use more memory for a DRC than a
229 * machine running multiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC, which varies
232 * according to the machine's free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
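
A quick worked example of the NFSD_DRC_SIZE_SHIFT arithmetic above: a right shift by 7 caps the DRC at 1/128, about 0.78%, of the free buffer pages. The free-page count below is purely illustrative:

    unsigned long free_pages = 1000000;          /* ~3.8 GiB of free buffer pages at 4 KiB/page */
    unsigned long drc_max    = free_pages >> 7;  /* 7812 pages, roughly 30 MiB */
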
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,12 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
233 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
234 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
235 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
236 280
237 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
238 return err; 282 return err;
@@ -244,7 +288,7 @@ static int nfsd_init_socks(int port)
244 if (!list_empty(&nfsd_serv->sv_permsocks)) 288 if (!list_empty(&nfsd_serv->sv_permsocks))
245 return 0; 289 return 0;
246 290
247 error = svc_create_xprt(nfsd_serv, "udp", port, 291 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
248 SVC_SOCK_DEFAULTS); 292 SVC_SOCK_DEFAULTS);
249 if (error < 0) 293 if (error < 0)
250 return error; 294 return error;
@@ -253,7 +297,7 @@ static int nfsd_init_socks(int port)
253 if (error < 0) 297 if (error < 0)
254 return error; 298 return error;
255 299
256 error = svc_create_xprt(nfsd_serv, "tcp", port, 300 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
257 SVC_SOCK_DEFAULTS); 301 SVC_SOCK_DEFAULTS);
258 if (error < 0) 302 if (error < 0)
259 return error; 303 return error;
@@ -376,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
376 return error; 420 return error;
377} 421}
378 422
379static inline void
380update_thread_usage(int busy_threads)
381{
382 unsigned long prev_call;
383 unsigned long diff;
384 int decile;
385
386 spin_lock(&nfsd_call_lock);
387 prev_call = nfsd_last_call;
388 nfsd_last_call = jiffies;
389 decile = busy_threads*10/nfsdstats.th_cnt;
390 if (decile>0 && decile <= 10) {
391 diff = nfsd_last_call - prev_call;
392 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
393 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
394 if (decile == 10)
395 nfsdstats.th_fullcnt++;
396 }
397 spin_unlock(&nfsd_call_lock);
398}
399 423
400/* 424/*
401 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -404,7 +428,6 @@ static int
404nfsd(void *vrqstp) 428nfsd(void *vrqstp)
405{ 429{
406 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 430 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
407 struct fs_struct *fsp;
408 int err, preverr = 0; 431 int err, preverr = 0;
409 432
410 /* Lock module and set up kernel thread */ 433 /* Lock module and set up kernel thread */
@@ -413,13 +436,11 @@ nfsd(void *vrqstp)
413 /* At this point, the thread shares current->fs 436 /* At this point, the thread shares current->fs
414 * with the init process. We need to create files with a 437 * with the init process. We need to create files with a
415 * umask of 0 instead of init's umask. */ 438 * umask of 0 instead of init's umask. */
416 fsp = copy_fs_struct(current->fs); 439 if (unshare_fs_struct() < 0) {
417 if (!fsp) {
418 printk("Unable to start nfsd thread: out of memory\n"); 440 printk("Unable to start nfsd thread: out of memory\n");
419 goto out; 441 goto out;
420 } 442 }
421 exit_fs(current); 443
422 current->fs = fsp;
423 current->fs->umask = 0; 444 current->fs->umask = 0;
424 445
425 /* 446 /*
@@ -464,8 +485,6 @@ nfsd(void *vrqstp)
464 continue; 485 continue;
465 } 486 }
466 487
467 update_thread_usage(atomic_read(&nfsd_busy));
468 atomic_inc(&nfsd_busy);
469 488
470 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
471 exp_readlock(); 490 exp_readlock();
@@ -474,8 +493,6 @@ nfsd(void *vrqstp)
474 493
475 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
476 exp_readunlock(); 495 exp_readunlock();
477 update_thread_usage(atomic_read(&nfsd_busy));
478 atomic_dec(&nfsd_busy);
479 } 496 }
480 497
481 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -543,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
543 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
544 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
545 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
546 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
547 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
548 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -574,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
574 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
575 return 1; 596 return 1;
576} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..b660435978d2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
116 } 116 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 118 /* successfully crossed mount point */
119 exp_put(exp); 119 /*
120 *expp = exp2; 120 * This is subtle: dentry is *not* under mnt at this point.
121 * The only reason we are safe is that original mnt is pinned
122 * down by exp, so we should dput before putting exp.
123 */
121 dput(dentry); 124 dput(dentry);
122 *dpp = mounts; 125 *dpp = mounts;
126 exp_put(exp);
127 *expp = exp2;
123 } else { 128 } else {
124 exp_put(exp2); 129 exp_put(exp2);
125 dput(mounts); 130 dput(mounts);
@@ -366,8 +371,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 371 }
367 372
368 /* Revoke setuid/setgid on chown */ 373 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 374 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 375 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
376 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 377 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 378 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 379 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +966,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 966static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 967nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 968 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 969 unsigned long *cnt, int *stablep)
964{ 970{
965 struct svc_export *exp; 971 struct svc_export *exp;
966 struct dentry *dentry; 972 struct dentry *dentry;
@@ -974,7 +980,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 980 err = nfserr_perm;
975 981
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 982 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 983 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 984 goto out;
979#endif 985#endif
980 986
@@ -1009,7 +1015,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1015 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1016 set_fs(oldfs);
1011 if (host_err >= 0) { 1017 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1018 *cnt = host_err;
1019 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1020 fsnotify_modify(file->f_path.dentry);
1014 } 1021 }
1015 1022
@@ -1056,7 +1063,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1063 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1064 if (host_err >= 0)
1058 err = 0; 1065 err = 0;
1059 else 1066 else
1060 err = nfserrno(host_err); 1067 err = nfserrno(host_err);
1061out: 1068out:
1062 return err; 1069 return err;
@@ -1098,7 +1105,7 @@ out:
1098 */ 1105 */
1099__be32 1106__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1107nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1108 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1109 int *stablep)
1103{ 1110{
1104 __be32 err = 0; 1111 __be32 err = 0;
@@ -1179,6 +1186,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1186 return 0;
1180} 1187}
1181 1188
1189/* An HPUX client sometimes creates a file in mode 000 and sets its size to 0.
1190 * Setting the size to 0 may fail on some file systems because the permission
1191 * check requires WRITE permission while the mode is 000.
1192 * We ignore the resize (to 0) of a file that has just been created, since its
1193 * size is already 0.
1194 *
1195 * Call this only after vfs_create() has been called.
1196 */
1197static void
1198nfsd_check_ignore_resizing(struct iattr *iap)
1199{
1200 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1201 iap->ia_valid &= ~ATTR_SIZE;
1202}
1203
1182/* 1204/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1205 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1206 * not yet implemented.
@@ -1274,6 +1296,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1296 switch (type) {
1275 case S_IFREG: 1297 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1298 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1299 if (!host_err)
1300 nfsd_check_ignore_resizing(iap);
1277 break; 1301 break;
1278 case S_IFDIR: 1302 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1303 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1451,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1451 /* setattr will sync the child (or not) */
1428 } 1452 }
1429 1453
1454 nfsd_check_ignore_resizing(iap);
1455
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1456 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1457 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1458 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1864,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
1864 return 0; 1890 return 0;
1865} 1891}
1866 1892
1867static int nfsd_buffered_readdir(struct file *file, filldir_t func, 1893static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
1868 struct readdir_cd *cdp, loff_t *offsetp) 1894 struct readdir_cd *cdp, loff_t *offsetp)
1869{ 1895{
1870 struct readdir_data buf; 1896 struct readdir_data buf;
1871 struct buffered_dirent *de; 1897 struct buffered_dirent *de;
@@ -1875,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1875 1901
1876 buf.dirent = (void *)__get_free_page(GFP_KERNEL); 1902 buf.dirent = (void *)__get_free_page(GFP_KERNEL);
1877 if (!buf.dirent) 1903 if (!buf.dirent)
1878 return -ENOMEM; 1904 return nfserrno(-ENOMEM);
1879 1905
1880 offset = *offsetp; 1906 offset = *offsetp;
1881 1907
1882 while (1) { 1908 while (1) {
1909 struct inode *dir_inode = file->f_path.dentry->d_inode;
1883 unsigned int reclen; 1910 unsigned int reclen;
1884 1911
1885 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1912 cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1898,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1898 if (!size) 1925 if (!size)
1899 break; 1926 break;
1900 1927
1928 /*
1929 * Various filldir functions may end up calling back into
1930 * lookup_one_len() and the file system's ->lookup() method.
1931 * These expect i_mutex to be held, as it would within readdir.
1932 */
1933 host_err = mutex_lock_killable(&dir_inode->i_mutex);
1934 if (host_err)
1935 break;
1936
1901 de = (struct buffered_dirent *)buf.dirent; 1937 de = (struct buffered_dirent *)buf.dirent;
1902 while (size > 0) { 1938 while (size > 0) {
1903 offset = de->offset; 1939 offset = de->offset;
1904 1940
1905 if (func(cdp, de->name, de->namlen, de->offset, 1941 if (func(cdp, de->name, de->namlen, de->offset,
1906 de->ino, de->d_type)) 1942 de->ino, de->d_type))
1907 goto done; 1943 break;
1908 1944
1909 if (cdp->err != nfs_ok) 1945 if (cdp->err != nfs_ok)
1910 goto done; 1946 break;
1911 1947
1912 reclen = ALIGN(sizeof(*de) + de->namlen, 1948 reclen = ALIGN(sizeof(*de) + de->namlen,
1913 sizeof(u64)); 1949 sizeof(u64));
1914 size -= reclen; 1950 size -= reclen;
1915 de = (struct buffered_dirent *)((char *)de + reclen); 1951 de = (struct buffered_dirent *)((char *)de + reclen);
1916 } 1952 }
1953 mutex_unlock(&dir_inode->i_mutex);
1954 if (size > 0) /* We bailed out early */
1955 break;
1956
1917 offset = vfs_llseek(file, 0, SEEK_CUR); 1957 offset = vfs_llseek(file, 0, SEEK_CUR);
1918 } 1958 }
1919 1959
1920 done:
1921 free_page((unsigned long)(buf.dirent)); 1960 free_page((unsigned long)(buf.dirent));
1922 1961
1923 if (host_err) 1962 if (host_err)
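
The nfsd_buffered_readdir() hunks above amount to this pattern: entries are read into a scratch page without the directory's i_mutex held, and the mutex is then taken (killably) only while the entries are replayed through the filldir callback, since the callback may re-enter the file system via lookup_one_len(). A simplified sketch of that shape; read_entries_into_buffer() and replay_entries() are hypothetical helpers standing in for the vfs_readdir call and the inner loop of the real code:

    for (;;) {
            int size = read_entries_into_buffer(file, &buf);   /* hypothetical: fills one page */
            if (size <= 0)
                    break;                                     /* EOF or error */

            if (mutex_lock_killable(&dir_inode->i_mutex))
                    break;                                     /* interrupted by a fatal signal */
            stopped = replay_entries(&buf, size, func, cdp);   /* hypothetical: calls func() per entry */
            mutex_unlock(&dir_inode->i_mutex);

            if (stopped)                                       /* filldir bailed out early */
                    break;
    }
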
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
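
To make the layout arithmetic above concrete, here is a standalone sketch that computes the group geometry for one illustrative configuration; the 4 KiB block size, 128-byte entry size, and 4-byte group descriptor are assumptions for the example, not values taken from this patch:

    #include <stdio.h>

    int main(void)
    {
            unsigned long blksize = 4096, entry_size = 128, desc_size = 4;
            unsigned long entries_per_group = blksize * 8;            /* one bitmap bit per entry */
            unsigned long entries_per_block = blksize / entry_size;   /* 32 */
            unsigned long blocks_per_group =                          /* +1 for the bitmap block */
                    (entries_per_group + entries_per_block - 1) / entries_per_block + 1;
            unsigned long groups_per_desc = blksize / desc_size;      /* 1024 */
            unsigned long blocks_per_desc_block = groups_per_desc * blocks_per_group + 1;

            printf("entries/group=%lu blocks/group=%lu blocks/desc-block=%lu\n",
                   entries_per_group, blocks_per_group, blocks_per_desc_block);
            /* prints: entries/group=32768 blocks/group=1025 blocks/desc-block=1049601 */
            return 0;
    }
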
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
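
nilfs_palloc_find_available_slot() above performs a two-phase scan: it first looks near the hinted target, then sweeps the rest of the bitmap with wrap-around, skipping words that are already all ones. A minimal non-atomic user-space sketch of the same idea (the kernel version additionally holds the per-group bitmap lock and so is safe against concurrent allocators):

    #include <limits.h>

    /* Assumes nbits is a multiple of the word size, as in the palloc
     * bitmaps, where a group bitmap fills a whole block. */
    static int find_and_set_zero_bit(unsigned long *map, int nbits, int hint)
    {
            const int wbits = sizeof(unsigned long) * CHAR_BIT;
            int i;

            for (i = 0; i < nbits; i++) {
                    int pos = (hint + i) % nbits;

                    if (map[pos / wbits] == ~0UL) {
                            /* word is full: jump to its last bit so the
                             * loop increment moves us to the next word */
                            i += wbits - 1 - (pos % wbits);
                            continue;
                    }
                    if (!(map[pos / wbits] & (1UL << (pos % wbits)))) {
                            map[pos / wbits] |= 1UL << (pos % wbits);
                            return pos;
                    }
            }
            return -1;      /* every slot taken, cf. -ENOSPC above */
    }
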
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392 printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35 return 1UL << (inode->i_blkbits + 3 /* log2(8) = log2(CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
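
The header above exposes a prepare/commit/abort triple for both allocation and free. The expected calling sequence, sketched under the assumption of a caller that already has the metadata inode and an entry-number hint; 'hint' and 'all_went_well' are illustrative, and error handling is trimmed:

    struct nilfs_palloc_req req = { .pr_entry_nr = hint };
    int err;

    err = nilfs_palloc_prepare_alloc_entry(inode, &req);
    if (err)
            return err;          /* e.g. -ENOSPC: nothing to undo */

    /* ... record req.pr_entry_nr in the caller's structures ... */

    if (all_went_well)
            nilfs_palloc_commit_alloc_entry(inode, &req);   /* dirties bitmap + descriptor */
    else
            nilfs_palloc_abort_alloc_entry(inode, &req);    /* clears the bit again */
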
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..064279e33bbb
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,788 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect the dirty buffers of a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bh: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bh.
325 *
326 * Return Value: On success, 0 is returned, and the buffer head of a newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the places pointed to by @bh and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to test and
382 * clear the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
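
A worked example of the heuristic above, assuming 4 KiB blocks so that the DAT has 32768 entries per group: an inode with i_ino = 42 falls in group 0 and, since 42 % 8 = 2, gets the target 2 * (32768 / 8) = 8192. Inodes are thus spread across eight evenly spaced starting points within their group, which reduces contention on any single region of the bitmap.
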
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691static struct lock_class_key nilfs_bmap_dat_lock_key;
692
693/**
694 * nilfs_bmap_read - read a bmap from an inode
695 * @bmap: bmap
696 * @raw_inode: on-disk inode
697 *
698 * Description: nilfs_bmap_read() initializes the bmap @bmap.
699 *
700 * Return Value: On success, 0 is returned. On error, the following negative
701 * error code is returned.
702 *
703 * %-ENOMEM - Insufficient amount of memory available.
704 */
705int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
706{
707 if (raw_inode == NULL)
708 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
709 else
710 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
711
712 init_rwsem(&bmap->b_sem);
713 bmap->b_state = 0;
714 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
715 switch (bmap->b_inode->i_ino) {
716 case NILFS_DAT_INO:
717 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
718 bmap->b_last_allocated_key = 0; /* XXX: use macro */
719 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
720 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
721 break;
722 case NILFS_CPFILE_INO:
723 case NILFS_SUFILE_INO:
724 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
725 bmap->b_last_allocated_key = 0; /* XXX: use macro */
726 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
727 break;
728 default:
729 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
730 bmap->b_last_allocated_key = 0; /* XXX: use macro */
731 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
732 break;
733 }
734
735 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
736 nilfs_btree_init(bmap,
737 NILFS_BMAP_LARGE_LOW,
738 NILFS_BMAP_LARGE_HIGH) :
739 nilfs_direct_init(bmap,
740 NILFS_BMAP_SMALL_LOW,
741 NILFS_BMAP_SMALL_HIGH);
742}
743
744/**
745 * nilfs_bmap_write - write back a bmap to an inode
746 * @bmap: bmap to store
747 * @raw_inode: on-disk inode in which to store the bmap
748 *
749 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
750 */
751void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
752{
753 down_write(&bmap->b_sem);
754 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
755 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
756 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
757 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
758
759 up_write(&bmap->b_sem);
760}
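/*
 * Illustrative sketch, not part of the original file: a typical life
 * cycle pairs the two helpers above around in-core use of the bmap.
 * The wrapper function and its arguments are hypothetical.
 */
#if 0 /* example only */
static int example_bmap_round_trip(struct nilfs_bmap *bmap,
				   struct nilfs_inode *raw_inode)
{
	int err;

	err = nilfs_bmap_read(bmap, raw_inode); /* NULL raw_inode => empty bmap */
	if (err)
		return err;	/* only -ENOMEM, per the kernel-doc above */

	/* ... insert/lookup/delete operations, serialized by b_sem ... */

	nilfs_bmap_write(bmap, raw_inode); /* copy back before the inode hits disk */
	return 0;
}
#endif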
761
762void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
763{
764 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
765 init_rwsem(&bmap->b_sem);
766 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
767 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
768 bmap->b_last_allocated_key = 0;
769 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
770 bmap->b_state = 0;
771 nilfs_btree_init_gc(bmap);
772}
773
774void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
775{
776 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
777 init_rwsem(&gcbmap->b_sem);
778 lockdep_set_class(&gcbmap->b_sem, &nilfs_bmap_dat_lock_key);
779 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
780}
781
782void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
783{
784 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
785 init_rwsem(&bmap->b_sem);
786 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
787 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
788}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
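/*
 * Note: the two members deliberately overlay.  Bmaps addressed by
 * virtual block numbers (the _v pointer operations in bmap.c) fill in
 * @bpr_req and go through the persistent allocator, while the DAT bmap
 * (the _p operations) deals in raw block numbers and uses only
 * @bpr_ptr.
 */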
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
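/*
 * Illustrative calling pattern, not taken from the original source:
 * each pointer operation is split into prepare/commit/abort so that a
 * caller can reserve resources up front and roll back cleanly when a
 * later step fails.  update_in_memory_state() is a hypothetical
 * fallible step.
 */
#if 0 /* example only */
	union nilfs_bmap_ptr_req req;
	int err;

	err = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, &req);
	if (err < 0)
		return err;			/* nothing reserved yet */

	err = update_in_memory_state();
	if (err < 0) {
		bmap->b_pops->bpop_abort_alloc_ptr(bmap, &req);
		return err;			/* reservation rolled back */
	}
	bmap->b_pops->bpop_commit_alloc_ptr(bmap, &req); /* must not fail */
#endif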
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
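/*
 * NILFS_BMAP_NEW_PTR_INIT sets only the most significant bit of an
 * unsigned long, so nilfs_bmap_is_new_ptr() is a plain test of that top
 * bit.  The DAT pointer operations in bmap.c appear to hand out "new"
 * (not yet assigned) pointers by counting up from this value, e.g.
 * 0x8000000000000000, 0x8000000000000001, ... on a 64-bit machine, on
 * the assumption that real block addresses never use the top bit.
 */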
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * union nilfs_bmap_union - block mapping union
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53 btnc->host = NULL; /* can safely set to host inode ? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
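/*
 * Illustrative sketch, not part of the original file: a typical caller
 * reads one node block through the cache and releases it with brelse()
 * when done.  @btnc and @blocknr stand for a node cache and a block
 * number obtained elsewhere.
 */
#if 0 /* example only */
	struct buffer_head *bh;
	int err;

	err = nilfs_btnode_get(btnc, blocknr, 0 /* translate via DAT */,
			       &bh, 0 /* existing block */);
	if (err)
		return err;	/* -ENOMEM, -EIO, ... */
	/* ... examine (struct nilfs_btree_node *)bh->b_data ... */
	brelse(bh);
#endif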
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing it once the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key
184 *  Prepare to move the contents of the block for the old key to the block
185 *  for the new key.  The old buffer will not be removed, but it might be
186 *  reused for the new buffer.  This function may return -ENOMEM on memory
187 *  allocation failure or -EIO on disk read failure.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload on kernels older than
207 * 2.6.23 because it is not exported to modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223 /*
224 * Note: page->index will not change to newkey until
225 * nilfs_btnode_commit_change_key() is called.
226 * To protect the page in this intermediate state, the
227 * page lock is held.
228 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key
256 * commit the change_key operation prepared by prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key
298 * Abort the change_key operation prepared by nilfs_btnode_prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
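/*
 * Illustrative sketch, not part of the original file: the three
 * change-key helpers above form a small transaction around any
 * bookkeeping that may still fail.  record_key_change() is a
 * hypothetical fallible step.
 */
#if 0 /* example only */
	struct nilfs_btnode_chkey_ctxt ctxt = {
		.oldkey = oldkey,
		.newkey = newkey,
		.bh     = bh,
	};
	int err;

	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
	if (err)
		return err;

	err = record_key_change();
	if (err) {
		nilfs_btnode_abort_change_key(btnc, &ctxt);
		return err;
	}
	nilfs_btnode_commit_change_key(btnc, &ctxt);
	/* ctxt.bh now refers to the buffer filed under newkey */
#endif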
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
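 * @bp_ctxt: change-key context used when a node block moves to a new key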
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
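/*
 * Worked example, for illustration: with left = {(k1,p1), (k2,p2), (k3,p3)}
 * and right = {(k7,p7)}, nilfs_btree_node_move_right(btree, left, right, 2)
 * leaves left = {(k1,p1)} and right = {(k2,p2), (k3,p3), (k7,p7)}.  Whole
 * suffixes and prefixes move, so the key order inside each node is kept.
 */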
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
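/*
 * Worked example, for illustration: for a leaf node holding keys
 * {2, 5, 9}, nilfs_btree_node_lookup(btree, node, 5, &i) returns nonzero
 * with i == 1.  Looking up the absent key 6 returns zero with i == 2,
 * the slot where 6 would be inserted.  On interior nodes a miss instead
 * backs i up so that it names the child subtree covering the key.
 */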
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
799
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
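/*
 * Editorial example of the borrow arithmetic above: if the underflowing
 * node has nchildren = 100 after the delete and its left sibling has
 * lnchildren = 140, then n = (100 + 140) / 2 - 100 = 20 entries move
 * right, leaving both nodes balanced at 120 children; bp_index is
 * shifted by the same n.
 */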
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
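/*
 * Editorial summary of the per-level sibling policy chosen above:
 *
 *	node still above minimum  ->  nilfs_btree_do_delete (stop here)
 *	left sibling above min    ->  nilfs_btree_borrow_left (stop)
 *	left sibling at minimum   ->  nilfs_btree_concat_left (continue up)
 *	right sibling above min   ->  nilfs_btree_borrow_right (stop)
 *	right sibling at minimum  ->  nilfs_btree_concat_right (continue up)
 *	no sibling (root's child) ->  nilfs_btree_shrink or do_delete
 *
 * Borrowing rebalances and terminates the upward walk; concatenation
 * removes a node and may propagate the underflow to the parent level.
 */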
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
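/*
 * Editorial note (a reading of the code above): nilfs_btree_check_delete()
 * returns nonzero when @key is the current maximum key and the second
 * largest key falls below bmap->b_low, i.e. when deleting @key would
 * leave all remaining keys in a range a direct (small) mapping could
 * represent, making the tree a candidate for conversion back.
 */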
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585 /* for data */
1586 /* cannot find near ptr */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619 return ret;
1621}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert a bmap to a B-tree and insert a new entry
1691 * @bmap: bmap to be converted
1692 * @key: key of the new entry
1693 * @ptr: pointer (data block) of the new entry
1694 * @keys: keys of the entries gathered from the original bmap
1695 * @ptrs: pointers of the entries gathered from the original bmap
1696 * @n: number of gathered entries
1697 * @low: lower bound of the key range
1698 * @high: upper bound of the key range
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
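/*
 * Editorial example of the capacity test above, assuming 4 KiB blocks:
 * NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16 = 255, so
 * up to 254 gathered entries plus the new one fit in a single level-1
 * child node (the root + child case, ni != NULL); if n + 1 fits in the
 * in-inode root itself, no child block is allocated at all (ni == NULL).
 * Anything larger is a bug in the caller, hence the BUG().
 */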
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928			printk(KERN_CRIT "%s: key = %llu, level = %d\n",
1929			       __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
2009
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
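/*
 * Editorial note on the switch above: the DAT file is what translates
 * virtual block numbers into physical ones for every other file, so its
 * own B-tree cannot use virtual pointers; it gets the "p" (physical)
 * operation table, while all other inodes use the "v" (virtual) table.
 */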
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
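/*
 * Editorial example, assuming a 4 KiB node: sizeof(struct
 * nilfs_btree_node) is 8 bytes and the extra pad is another 8, so
 * NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16 = 255 and
 * NILFS_BTREE_NODE_NCHILDREN_MIN(4096) = (255 - 1) / 2 + 1 = 128,
 * i.e. non-root nodes are kept at least half full.
 */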
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..300f1cdfa862
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,927 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
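/*
 * Editorial example of the two helpers above: with, say, 16 checkpoints
 * per block and mi_first_entry_offset = 2 (both are runtime properties
 * of the metadata file, not constants), checkpoint number 33 gives
 * tcno = 33 + 2 - 1 = 34, so blkoff = 34 / 16 = 2 and the in-block
 * offset = 34 % 16 = 2: the entry lives at index 2 of file block 2.
 */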
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the place pointed by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
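/*
 * Editorial sketch (hypothetical caller): a successful call must be
 * paired with nilfs_cpfile_put_checkpoint(), which drops the kmap and
 * the buffer reference taken above:
 *
 *	struct nilfs_checkpoint *cp;
 *	struct buffer_head *bh;
 *	int err = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
 *	if (!err) {
 *		... read or update *cp ...
 *		nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
 *	}
 */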
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number (exclusive)
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the range from @start to @end, excluding @end itself. Checkpoints
267 * which have already been deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_header;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_header;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364
365 out_header:
366 brelse(header_bh);
367
368 out_sem:
369 up_write(&NILFS_MDT(cpfile)->mi_sem);
370 return ret;
371}
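/*
 * Editorial example: nilfs_cpfile_delete_checkpoints(cpfile, 10, 13)
 * invalidates checkpoints 10, 11 and 12 (13 is excluded), silently
 * skipping entries that were already deleted, and frees any checkpoint
 * block whose valid-entry count drops to zero.
 */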
372
373static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
374 struct nilfs_checkpoint *cp,
375 struct nilfs_cpinfo *ci)
376{
377 ci->ci_flags = le32_to_cpu(cp->cp_flags);
378 ci->ci_cno = le64_to_cpu(cp->cp_cno);
379 ci->ci_create = le64_to_cpu(cp->cp_create);
380 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
381 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
382 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
383 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
384}
385
386static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
387 struct nilfs_cpinfo *ci, size_t nci)
388{
389 struct nilfs_checkpoint *cp;
390 struct buffer_head *bh;
391 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
392 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
393 void *kaddr;
394 int n, ret;
395 int ncps, i;
396
397 if (cno == 0)
398 return -ENOENT; /* checkpoint number 0 is invalid */
399 down_read(&NILFS_MDT(cpfile)->mi_sem);
400
401 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
402 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
403 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
404 if (ret < 0) {
405 if (ret != -ENOENT)
406 goto out;
407 continue; /* skip hole */
408 }
409
410 kaddr = kmap_atomic(bh->b_page, KM_USER0);
411 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
412 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
413 if (!nilfs_checkpoint_invalid(cp))
414 nilfs_cpfile_checkpoint_to_cpinfo(
415 cpfile, cp, &ci[n++]);
416 }
417 kunmap_atomic(kaddr, KM_USER0);
418 brelse(bh);
419 }
420
421 ret = n;
422 if (n > 0)
423 *cnop = ci[n - 1].ci_cno + 1;
424
425 out:
426 up_read(&NILFS_MDT(cpfile)->mi_sem);
427 return ret;
428}
429
430static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
431 struct nilfs_cpinfo *ci, size_t nci)
432{
433 struct buffer_head *bh;
434 struct nilfs_cpfile_header *header;
435 struct nilfs_checkpoint *cp;
436 __u64 curr = *cnop, next;
437 unsigned long curr_blkoff, next_blkoff;
438 void *kaddr;
439 int n = 0, ret;
440
441 down_read(&NILFS_MDT(cpfile)->mi_sem);
442
443 if (curr == 0) {
444 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
445 if (ret < 0)
446 goto out;
447 kaddr = kmap_atomic(bh->b_page, KM_USER0);
448 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
449 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
450 kunmap_atomic(kaddr, KM_USER0);
451 brelse(bh);
452 if (curr == 0) {
453 ret = 0;
454 goto out;
455 }
456 } else if (unlikely(curr == ~(__u64)0)) {
457 ret = 0;
458 goto out;
459 }
460
461 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
462 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
463 if (unlikely(ret < 0)) {
464 if (ret == -ENOENT)
465 ret = 0; /* No snapshots (started from a hole block) */
466 goto out;
467 }
468 kaddr = kmap_atomic(bh->b_page, KM_USER0);
469 while (n < nci) {
470 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
471 curr = ~(__u64)0; /* Terminator */
472 if (unlikely(nilfs_checkpoint_invalid(cp) ||
473 !nilfs_checkpoint_snapshot(cp)))
474 break;
475 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
476 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
477 if (next == 0)
478 break; /* reach end of the snapshot list */
479
480 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
481 if (curr_blkoff != next_blkoff) {
482 kunmap_atomic(kaddr, KM_USER0);
483 brelse(bh);
484 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
485 0, &bh);
486 if (unlikely(ret < 0)) {
487 WARN_ON(ret == -ENOENT);
488 goto out;
489 }
490 kaddr = kmap_atomic(bh->b_page, KM_USER0);
491 }
492 curr = next;
493 curr_blkoff = next_blkoff;
494 }
495 kunmap_atomic(kaddr, KM_USER0);
496 brelse(bh);
497 *cnop = curr;
498 ret = n;
499
500 out:
501 up_read(&NILFS_MDT(cpfile)->mi_sem);
502 return ret;
503}
504
505/**
506 * nilfs_cpfile_get_cpinfo - get information on checkpoints
507 * @cpfile: inode of checkpoint file
508 * @cnop: checkpoint number cursor; updated past the returned entries
509 * @mode: %NILFS_CHECKPOINT or %NILFS_SNAPSHOT
510 * @ci: array of checkpoint info to be filled in
511 * @nci: maximum number of entries to fill
512 */
513ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
514 struct nilfs_cpinfo *ci, size_t nci)
515{
516 switch (mode) {
517 case NILFS_CHECKPOINT:
518 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
519 case NILFS_SNAPSHOT:
520 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
521 default:
522 return -EINVAL;
523 }
524}
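/*
 * Editorial sketch (hypothetical caller): *cnop acts as a cursor, so
 * checkpoints can be listed in batches:
 *
 *	struct nilfs_cpinfo ci[16];
 *	__u64 cno = 1;	(checkpoint numbers start at 1)
 *	ssize_t n;
 *
 *	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
 *					    ci, 16)) > 0) {
 *		... consume ci[0] .. ci[n - 1];
 *		    cno now points one past ci[n - 1].ci_cno ...
 *	}
 */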
525
526/**
527 * nilfs_cpfile_delete_checkpoint - delete a single checkpoint
528 * @cpfile: inode of checkpoint file
529 * @cno: checkpoint number to be deleted
530 */
531int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
532{
533 struct nilfs_cpinfo ci;
534 __u64 tcno = cno;
535 ssize_t nci;
536 int ret;
537
538 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
539 if (nci < 0)
540 return nci;
541 else if (nci == 0 || ci.ci_cno != cno)
542 return -ENOENT;
543
544 /* cannot delete the latest checkpoint nor snapshots */
545 ret = nilfs_cpinfo_snapshot(&ci);
546 if (ret < 0)
547 return ret;
548 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
549 return -EPERM;
550
551 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
552}
553
554static struct nilfs_snapshot_list *
555nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
556 __u64 cno,
557 struct buffer_head *bh,
558 void *kaddr)
559{
560 struct nilfs_cpfile_header *header;
561 struct nilfs_checkpoint *cp;
562 struct nilfs_snapshot_list *list;
563
564 if (cno != 0) {
565 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
566 list = &cp->cp_snapshot_list;
567 } else {
568 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
569 list = &header->ch_snapshot_list;
570 }
571 return list;
572}
573
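/*
 * Editorial note on the two functions below: snapshots form a doubly
 * linked list ordered by checkpoint number.  The list head lives in the
 * cpfile header (ch_snapshot_list); each snapshot links to its
 * neighbours through cp_snapshot_list.ssl_next/ssl_prev, with 0 standing
 * for the header.  nilfs_cpfile_set_snapshot() walks the list backwards
 * from the header until it finds the insertion point for @cno and
 * splices the checkpoint in; nilfs_cpfile_clear_snapshot() unlinks it.
 */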
574static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
575{
576 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
577 struct nilfs_cpfile_header *header;
578 struct nilfs_checkpoint *cp;
579 struct nilfs_snapshot_list *list;
580 __u64 curr, prev;
581 unsigned long curr_blkoff, prev_blkoff;
582 void *kaddr;
583 int ret;
584
585 if (cno == 0)
586 return -ENOENT; /* checkpoint number 0 is invalid */
587 down_write(&NILFS_MDT(cpfile)->mi_sem);
588
589 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
590 if (ret < 0)
591 goto out_sem;
592 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
593 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
594 if (nilfs_checkpoint_invalid(cp)) {
595 ret = -ENOENT;
596 kunmap_atomic(kaddr, KM_USER0);
597 goto out_cp;
598 }
599 if (nilfs_checkpoint_snapshot(cp)) {
600 ret = 0;
601 kunmap_atomic(kaddr, KM_USER0);
602 goto out_cp;
603 }
604 kunmap_atomic(kaddr, KM_USER0);
605
606 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
607 if (ret < 0)
608 goto out_cp;
609 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
610 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
611 list = &header->ch_snapshot_list;
612 curr_bh = header_bh;
613 get_bh(curr_bh);
614 curr = 0;
615 curr_blkoff = 0;
616 prev = le64_to_cpu(list->ssl_prev);
617 while (prev > cno) {
618 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
619 curr = prev;
620 if (curr_blkoff != prev_blkoff) {
621 kunmap_atomic(kaddr, KM_USER0);
622 brelse(curr_bh);
623 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
624 0, &curr_bh);
625 if (ret < 0)
626 goto out_header;
627 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
628 }
629 curr_blkoff = prev_blkoff;
630 cp = nilfs_cpfile_block_get_checkpoint(
631 cpfile, curr, curr_bh, kaddr);
632 list = &cp->cp_snapshot_list;
633 prev = le64_to_cpu(list->ssl_prev);
634 }
635 kunmap_atomic(kaddr, KM_USER0);
636
637 if (prev != 0) {
638 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
639 &prev_bh);
640 if (ret < 0)
641 goto out_curr;
642 } else {
643 prev_bh = header_bh;
644 get_bh(prev_bh);
645 }
646
647 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
648 list = nilfs_cpfile_block_get_snapshot_list(
649 cpfile, curr, curr_bh, kaddr);
650 list->ssl_prev = cpu_to_le64(cno);
651 kunmap_atomic(kaddr, KM_USER0);
652
653 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
654 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
655 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
656 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
657 nilfs_checkpoint_set_snapshot(cp);
658 kunmap_atomic(kaddr, KM_USER0);
659
660 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
661 list = nilfs_cpfile_block_get_snapshot_list(
662 cpfile, prev, prev_bh, kaddr);
663 list->ssl_next = cpu_to_le64(cno);
664 kunmap_atomic(kaddr, KM_USER0);
665
666 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
667 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
668 le64_add_cpu(&header->ch_nsnapshots, 1);
669 kunmap_atomic(kaddr, KM_USER0);
670
671 nilfs_mdt_mark_buffer_dirty(prev_bh);
672 nilfs_mdt_mark_buffer_dirty(curr_bh);
673 nilfs_mdt_mark_buffer_dirty(cp_bh);
674 nilfs_mdt_mark_buffer_dirty(header_bh);
675 nilfs_mdt_mark_dirty(cpfile);
676
677 brelse(prev_bh);
678
679 out_curr:
680 brelse(curr_bh);
681
682 out_header:
683 brelse(header_bh);
684
685 out_cp:
686 brelse(cp_bh);
687
688 out_sem:
689 up_write(&NILFS_MDT(cpfile)->mi_sem);
690 return ret;
691}
692
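/*
 * Illustrative sketch, not part of the patch: nilfs_cpfile_set_snapshot()
 * above maintains a doubly linked list threaded through the on-disk
 * checkpoint entries, anchored in the cpfile header (checkpoint number 0)
 * and kept sorted by checkpoint number.  A simplified in-memory analogue
 * of the sorted insertion, with hypothetical types:
 */
struct snap {
	__u64 cno;
	struct snap *next, *prev;	/* head sentinel uses cno == 0 */
};

static void snap_insert_sorted(struct snap *head, struct snap *new)
{
	struct snap *pos = head->prev;	/* start from the largest cno */

	while (pos != head && pos->cno > new->cno)
		pos = pos->prev;	/* walk toward smaller numbers */
	new->next = pos->next;		/* link between pos and pos->next */
	new->prev = pos;
	pos->next->prev = new;
	pos->next = new;
}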
693static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
694{
695 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
696 struct nilfs_cpfile_header *header;
697 struct nilfs_checkpoint *cp;
698 struct nilfs_snapshot_list *list;
699 __u64 next, prev;
700 void *kaddr;
701 int ret;
702
703 if (cno == 0)
704 return -ENOENT; /* checkpoint number 0 is invalid */
705 down_write(&NILFS_MDT(cpfile)->mi_sem);
706
707 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
708 if (ret < 0)
709 goto out_sem;
710 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
711 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
712 if (nilfs_checkpoint_invalid(cp)) {
713 ret = -ENOENT;
714 kunmap_atomic(kaddr, KM_USER0);
715 goto out_cp;
716 }
717 if (!nilfs_checkpoint_snapshot(cp)) {
718 ret = 0;
719 kunmap_atomic(kaddr, KM_USER0);
720 goto out_cp;
721 }
722
723 list = &cp->cp_snapshot_list;
724 next = le64_to_cpu(list->ssl_next);
725 prev = le64_to_cpu(list->ssl_prev);
726 kunmap_atomic(kaddr, KM_USER0);
727
728 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
729 if (ret < 0)
730 goto out_cp;
731 if (next != 0) {
732 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
733 &next_bh);
734 if (ret < 0)
735 goto out_header;
736 } else {
737 next_bh = header_bh;
738 get_bh(next_bh);
739 }
740 if (prev != 0) {
741 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
742 &prev_bh);
743 if (ret < 0)
744 goto out_next;
745 } else {
746 prev_bh = header_bh;
747 get_bh(prev_bh);
748 }
749
750 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
751 list = nilfs_cpfile_block_get_snapshot_list(
752 cpfile, next, next_bh, kaddr);
753 list->ssl_prev = cpu_to_le64(prev);
754 kunmap_atomic(kaddr, KM_USER0);
755
756 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
757 list = nilfs_cpfile_block_get_snapshot_list(
758 cpfile, prev, prev_bh, kaddr);
759 list->ssl_next = cpu_to_le64(next);
760 kunmap_atomic(kaddr, KM_USER0);
761
762 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
763 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
764 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
765 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
766 nilfs_checkpoint_clear_snapshot(cp);
767 kunmap_atomic(kaddr, KM_USER0);
768
769 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
770 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
771 le64_add_cpu(&header->ch_nsnapshots, -1);
772 kunmap_atomic(kaddr, KM_USER0);
773
774 nilfs_mdt_mark_buffer_dirty(next_bh);
775 nilfs_mdt_mark_buffer_dirty(prev_bh);
776 nilfs_mdt_mark_buffer_dirty(cp_bh);
777 nilfs_mdt_mark_buffer_dirty(header_bh);
778 nilfs_mdt_mark_dirty(cpfile);
779
780 brelse(prev_bh);
781
782 out_next:
783 brelse(next_bh);
784
785 out_header:
786 brelse(header_bh);
787
788 out_cp:
789 brelse(cp_bh);
790
791 out_sem:
792 up_write(&NILFS_MDT(cpfile)->mi_sem);
793 return ret;
794}
795
796/**
797 * nilfs_cpfile_is_snapshot - determine whether a checkpoint is a snapshot
798 * @cpfile: inode of checkpoint file
799 * @cno: checkpoint number
800 *
801 * Description: nilfs_cpfile_is_snapshot() tests if checkpoint @cno is a snapshot.
802 *
803 * Return Value: On success, 1 is returned if the checkpoint specified by
804 * @cno is a snapshot, or 0 if not. On error, one of the following negative
805 * error codes is returned.
806 *
807 * %-EIO - I/O error.
808 *
809 * %-ENOMEM - Insufficient amount of memory available.
810 *
811 * %-ENOENT - No such checkpoint.
812 */
813int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
814{
815 struct buffer_head *bh;
816 struct nilfs_checkpoint *cp;
817 void *kaddr;
818 int ret;
819
820 if (cno == 0)
821 return -ENOENT; /* checkpoint number 0 is invalid */
822 down_read(&NILFS_MDT(cpfile)->mi_sem);
823
824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
825 if (ret < 0)
826 goto out;
827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
829 ret = nilfs_checkpoint_snapshot(cp);
830 kunmap_atomic(kaddr, KM_USER0);
831 brelse(bh);
832
833 out:
834 up_read(&NILFS_MDT(cpfile)->mi_sem);
835 return ret;
836}
837
838/**
839 * nilfs_cpfile_change_cpmode - change checkpoint mode
840 * @cpfile: inode of checkpoint file
841 * @cno: checkpoint number
842 * @mode: mode of checkpoint
843 *
844 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
845 * checkpoint specified by @cno. @mode must be NILFS_CHECKPOINT or NILFS_SNAPSHOT.
846 *
847 * Return Value: On success, 0 is returned. On error, one of the following
848 * negative error codes is returned.
849 *
850 * %-EIO - I/O error.
851 *
852 * %-ENOMEM - Insufficient amount of memory available.
853 *
854 * %-ENOENT - No such checkpoint.
855 */
856int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
857{
858 struct the_nilfs *nilfs;
859 int ret;
860
861 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
862
863 switch (mode) {
864 case NILFS_CHECKPOINT:
865 /*
866 * Check for protecting existing snapshot mounts:
867 * bd_mount_sem is used to make this operation atomic and
868 * exclusive with a new mount job. Though it doesn't cover
869 * umount, it's enough for the purpose.
870 */
871 down(&nilfs->ns_bdev->bd_mount_sem);
872 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
873 /* Current implementation does not have to protect
874 plain read-only mounts since they are exclusive
875 with a read/write mount and are protected from the
876 cleaner. */
877 ret = -EBUSY;
878 } else
879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
880 up(&nilfs->ns_bdev->bd_mount_sem);
881 return ret;
882 case NILFS_SNAPSHOT:
883 return nilfs_cpfile_set_snapshot(cpfile, cno);
884 default:
885 return -EINVAL;
886 }
887}
888
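/*
 * Illustrative sketch, not part of the patch: turning checkpoint @cno into
 * a snapshot and back through the mode interface above; "cpfile" is assumed
 * to be a valid cpfile inode and the function name is a placeholder.
 */
static int example_toggle_snapshot(struct inode *cpfile, __u64 cno)
{
	int err;

	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_SNAPSHOT);
	if (err)	/* e.g. -ENOENT if @cno does not exist */
		return err;
	/* may fail with -EBUSY while the snapshot is mounted */
	return nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_CHECKPOINT);
}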
889/**
890 * nilfs_cpfile_get_stat - get checkpoint statistics
891 * @cpfile: inode of checkpoint file
892 * @cpstat: pointer to a structure of checkpoint statistics
893 *
894 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
895 *
896 * Return Value: On success, 0 is returned, and checkpoint information is
897 * stored in the place pointed to by @cpstat. On error, one of the following
898 * negative error codes is returned.
899 *
900 * %-EIO - I/O error.
901 *
902 * %-ENOMEM - Insufficient amount of memory available.
903 */
904int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
905{
906 struct buffer_head *bh;
907 struct nilfs_cpfile_header *header;
908 void *kaddr;
909 int ret;
910
911 down_read(&NILFS_MDT(cpfile)->mi_sem);
912
913 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
914 if (ret < 0)
915 goto out_sem;
916 kaddr = kmap_atomic(bh->b_page, KM_USER0);
917 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
918 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
919 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
920 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
921 kunmap_atomic(kaddr, KM_USER0);
922 brelse(bh);
923
924 out_sem:
925 up_read(&NILFS_MDT(cpfile)->mi_sem);
926 return ret;
927}
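
/*
 * Illustrative sketch, not part of the patch: reading the checkpoint
 * statistics exposed above.  The field names follow the code in
 * nilfs_cpfile_get_stat(); the function name is a placeholder.
 */
static void example_print_cpstat(struct inode *cpfile)
{
	struct nilfs_cpstat cpstat;

	if (nilfs_cpfile_get_stat(cpfile, &cpstat) == 0)
		printk(KERN_DEBUG "cno=%llu ncps=%llu nsss=%llu\n",
		       (unsigned long long)cpstat.cs_cno,
		       (unsigned long long)cpstat.cs_ncps,
		       (unsigned long long)cpstat.cs_nsss);
}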
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
248
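/*
 * Illustrative sketch, not part of the patch: the DAT entry updates above
 * follow a two-phase prepare/commit (or abort) protocol.  A typical update
 * of the disk address behind a virtual block number could look like this;
 * the function name and the bail_out flag are placeholders.
 */
static int example_dat_update(struct inode *dat, __u64 vblocknr,
			      sector_t new_blocknr, int bail_out)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = vblocknr };
	int err;

	err = nilfs_dat_prepare_start(dat, &req);	/* phase 1: may fail */
	if (err)
		return err;
	if (bail_out) {			/* roll back without side effects */
		nilfs_dat_abort_start(dat, &req);
		return -EAGAIN;
	}
	nilfs_dat_commit_start(dat, &req, new_blocknr);	/* phase 2: no fail */
	return 0;
}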
249/**
250 * nilfs_dat_mark_dirty - mark the DAT entry of a virtual block number dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() marks the DAT entry of @vblocknr dirty.
255 *
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - The virtual block number has not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
393
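/*
 * Illustrative sketch, not part of the patch: resolving a virtual block
 * number to its current disk address with the translator above; the
 * function name and the zero "unmapped" sentinel are placeholders.
 */
static sector_t example_resolve(struct inode *dat, __u64 vblocknr)
{
	sector_t pbn = 0;
	int err = nilfs_dat_translate(dat, vblocknr, &pbn);

	return err ? 0 : pbn;	/* 0 here signals "unmapped" to the caller */
}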
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410 /* first and last virtual block numbers in this entry block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193 "entry in directory #%lu spans the page boundary"
194 "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
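/*
 * Illustrative note, not part of the patch: nilfs_readdir() above encodes
 * the directory position handed to filldir as a page index combined with
 * the byte offset of the entry inside that page:
 *
 *	pos = ((loff_t)n << PAGE_CACHE_SHIFT) | ((char *)de - kaddr);
 *
 * so a single loff_t addresses both the page cache page and the entry.
 */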
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_dir). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367 if (npages == 0)
368 goto out;
369
370 /* OFFSET_CACHE */
371 *res_page = NULL;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402 "dir %lu size %lld exceeds block cout %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
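/*
 * Illustrative note, not part of the patch: when nilfs_add_link() above
 * finds a live entry with surplus space (rec_len >= name_len + reclen),
 * it splits that entry in place.  With L = NILFS_DIR_REC_LEN(de->name_len)
 * and R = le16_to_cpu(de->rec_len):
 *
 *	before:  [ de: rec_len = R, live name ..................... ]
 *	after:   [ de: rec_len = L, live name ][ de1: rec_len = R - L ]
 *
 * and the new name is then written into de1.
 */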
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set up the first fragment of the directory ("." and ".." entries).
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
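/*
 * Illustrative note, not part of the patch: the helpers below assume the
 * direct bmap keeps its block pointers inline in b_u.u_data, right behind
 * a single nilfs_direct_node header slot:
 *
 *	u_data:  [ nilfs_direct_node ][ dptr 0 ][ dptr 1 ] ... [ dptr N-1 ]
 *
 * so nilfs_direct_get_ptr()/nilfs_direct_set_ptr() index that array
 * directly, using the file block offset as the key.
 */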
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
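
The switch at the end of nilfs_direct_init() is the only place the two operation tables above are selected: the DAT inode addresses its blocks physically (nilfs_direct_ops_p), while every other inode uses virtual block addresses translated through the DAT (nilfs_direct_ops_v). A standalone sketch of that dispatch, with illustrative demo_* names; the value 3 for the DAT inode number is an assumption matching upstream NILFS_DAT_INO:

#include <stdio.h>

struct demo_ops { const char *name; };

static const struct demo_ops demo_ops_v = { "virtual (translated via DAT)" };
static const struct demo_ops demo_ops_p = { "physical (the DAT itself)" };

static const struct demo_ops *demo_select_ops(unsigned long ino,
					      unsigned long dat_ino)
{
	/* mirrors the switch on b_inode->i_ino above */
	return (ino == dat_ino) ? &demo_ops_p : &demo_ops_v;
}

int main(void)
{
	unsigned long dat_ino = 3;	/* assumption: NILFS_DAT_INO */

	printf("ino 3  -> %s\n", demo_select_ops(3, dat_ino)->name);
	printf("ino 12 -> %s\n", demo_select_ops(12, dat_ino)->name);
	return 0;
}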
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @dn_pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
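
A worked example of the capacity implied by the macros above, assuming NILFS_BMAP_SIZE is 128 bytes (its value in the upstream headers): 128 / sizeof(__le64) - 1 leaves 15 direct slots, so a direct mapping covers file block offsets 0 through 14 before the bmap has to be converted to a B-tree.

#include <stdio.h>

#define DEMO_BMAP_SIZE	128	/* assumption: upstream NILFS_BMAP_SIZE */

int main(void)
{
	/* one __le64 slot is reserved, hence the trailing "- 1" */
	unsigned long nblocks = DEMO_BMAP_SIZE / sizeof(unsigned long long) - 1;

	/* 128 / 8 - 1 = 15 direct slots, keys 0 .. 14 */
	printf("direct blocks: %lu, key range: 0 .. %lu\n",
	       nblocks, nblocks - 1);
	return 0;
}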
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34	 * This is the only entry point that can catch write and sync
35	 * timing for both data blocks and intermediate blocks.
36	 *
37	 * This function should be revised when the writeback function
38	 * is implemented.
39 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
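
For reference, a minimal userspace sketch of the two branches above: fdatasync() reaches this function with datasync != 0 and triggers a dsync segment covering the data, while fsync() arrives with datasync == 0 and forces a full segment construction. The file name is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	if (fdatasync(fd) < 0)	/* datasync != 0 path: dsync segment */
		perror("fdatasync");
	if (fsync(fd) < 0)	/* datasync == 0 path: full construction */
		perror("fsync");
	close(fd);
	return 0;
}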
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
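
A userspace sketch of what drives this handler: the first store into a shared writable mapping takes a write fault, which gives the filesystem the chance above to fill holes inside a transaction before the page is marked dirty. Path and sizes are illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096) < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* write fault -> ->page_mkwrite() */
	munmap(p, 4096);
	close(fd);
	return 0;
}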
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULLs here: the current defaults are OK for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
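
The three functions above form an init / commit / clear lifecycle around a shadow copy of the DAT. A standalone model of that pattern (all names illustrative): work happens on the copy, and only a successful commit writes the result back over the live state.

#include <stdio.h>

struct demo_state { unsigned long blocks; unsigned int flags; };

static void demo_init_shadow(struct demo_state *shadow,
			     const struct demo_state *live)
{
	*shadow = *live;	/* like nilfs_init_gcdat_inode() */
}

static void demo_commit_shadow(struct demo_state *live,
			       const struct demo_state *shadow)
{
	*live = *shadow;	/* like nilfs_commit_gcdat_inode() */
}

int main(void)
{
	struct demo_state live = { .blocks = 100, .flags = 0 };
	struct demo_state shadow;

	demo_init_shadow(&shadow, &live);
	shadow.blocks = 80;	/* GC relocated some blocks on the copy */
	demo_commit_shadow(&live, &shadow);
	printf("live blocks after commit: %lu\n", live.blocks);
	return 0;
}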
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file implements the cache of on-disk blocks to be moved in
27 * garbage collection. The disk blocks are held with dummy inodes
28 * (called gcinodes), and this file provides a lookup function for the
29 * dummy inodes and a read function for their buffers.
30 *
31 * Since NILFS2 retains multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to take a checkpoint number
36 * argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes are released each time
39 * they are copied to a new log. Dirty blocks made in the current
40 * generation and the blocks to be moved by GC never overlap, because
41 * the dirty blocks form a new generation; rather, they must be
42 * written individually.
43 */
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
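
A hypothetical caller sketch showing how the helpers above pair up: submit the read, then wait for it and mark the buffer dirty before the block is copied into a new log. Error handling is abbreviated; the real caller keeps buffers on a list and waits for them later.

static int demo_gc_move_data_block(struct inode *gc_inode, sector_t blkoff,
				   sector_t pbn, __u64 vbn)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_gccache_submit_read_data(gc_inode, blkoff, pbn, vbn, &bh);
	if (err < 0)
		return err;	/* -EIO, -ENOMEM or -ENOENT */

	err = nilfs_gccache_wait_and_mark_dirty(bh);
	brelse(bh);
	return err;		/* -EEXIST means it was already dirty */
}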
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
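
A standalone illustration of the key mixing in ihash(): the inode number is shifted left two bits before the checkpoint number is added, so distinct (ino, cno) pairs usually land in distinct buckets. hash_long() is modelled here with a 32-bit multiplicative hash; the kernel's implementation differs in detail, and the hash width is an assumption.

#include <stdint.h>
#include <stdio.h>

#define DEMO_HASH_BITS	8	/* assumption: stands in for NILFS_GCINODE_HASH_BITS */

static unsigned long demo_hash_long(uint32_t val, unsigned int bits)
{
	/* 32-bit golden-ratio multiplicative hash, keeping the top bits */
	return (uint32_t)(val * UINT32_C(0x9e370001)) >> (32 - bits);
}

static unsigned long demo_ihash(unsigned long ino, unsigned long long cno)
{
	return demo_hash_long((uint32_t)((ino << 2) + cno), DEMO_HASH_BITS);
}

int main(void)
{
	printf("bucket(ino=12, cno=5) = %lu\n", demo_ihash(12, 5));
	printf("bucket(ino=12, cno=6) = %lu\n", demo_ihash(12, 6));
	return 0;
}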
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head contains newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and a buffer_head
40 * pointer that contains the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56 req.pr_entry_nr = 0; /* 0 says find free inode from beginning of
57 a group. dull code!! */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
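
Both ifile helpers above follow the same prepare / dependent-step / commit-or-abort idiom from the persistent allocator. A minimal standalone model of that control flow (all names illustrative):

#include <stdio.h>

struct demo_req { int reserved; int committed; };

static int demo_prepare(struct demo_req *req)
{
	req->reserved = 1;	/* like nilfs_palloc_prepare_*_entry() */
	return 0;
}

static void demo_commit(struct demo_req *req) { req->committed = 1; }
static void demo_abort(struct demo_req *req) { req->reserved = 0; }

static int demo_two_phase(struct demo_req *req, int dependent_step_ok)
{
	int ret = demo_prepare(req);

	if (ret)
		return ret;
	if (!dependent_step_ok) {	/* e.g. get_entry_block() failed */
		demo_abort(req);
		return -1;
	}
	demo_commit(req);
	return 0;
}

int main(void)
{
	struct demo_req req = { 0, 0 };

	printf("result: %d (committed=%d)\n",
	       demo_two_phase(&req, 1), req.committed);
	return 0;
}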
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
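
A hypothetical caller sketch for the two inline helpers above: map the raw on-disk inode, read a field, and always unmap afterwards. The buffer head would come from nilfs_ifile_get_inode_block().

static inline __le16 demo_read_links_count(struct inode *ifile, ino_t ino,
					   struct buffer_head *ibh)
{
	struct nilfs_inode *raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
	__le16 links = raw_inode->i_links_count;

	nilfs_ifile_unmap_inode(ifile, ino, ibh);	/* kunmap the page */
	return links;
}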
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode - inode struct of the target file
39 * @blkoff - file block number
40 * @bh_result - buffer head to be mapped on
41 * @create - indicate whether allocating the block or not when it has not
42 * been allocated yet.
43 *
44 * This function does not issue an actual read request for the specified
45 * data block; that is done by the VFS.
46 * Bulk read for direct-io is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
103 to proper value */
104 } else if (ret == -ENOENT) {
105		/* not found is not an error (e.g. a hole); must return
106		   without setting the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
115
116/**
117 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
118 * address_space_operations.
119 * @file - file struct of the file to be read
120 * @page - the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
129 * address_space_operations.
130 * @file - file struct of the file to be read
131 * @mapping - address_space struct used for reading multiple pages
132 * @pages - the pages to be read
133 * @nr_pages - number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
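
A worked example of the nr_dirty computation above: one page dirties PAGE_SIZE / block-size blocks at once. Values below are illustrative.

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* 4 KiB pages */
	unsigned int blkbits = 10;	/* 1 KiB filesystem blocks */
	unsigned int nr_dirty = 1U << (page_shift - blkbits);

	printf("blocks dirtied per page: %u\n", nr_dirty);	/* 4 */
	return 0;
}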
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
189
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
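
For orientation, a sketch of the bracketing the two functions above establish across one buffered write; the VFS drives these calls, and the ordering is the point:

/*
 *	nilfs_write_begin()
 *	    nilfs_transaction_begin()       open the transaction
 *	    block_write_begin()             may abort the transaction on error
 *	<data copied into the page by the VFS>
 *	nilfs_write_end()
 *	    generic_write_end()
 *	    nilfs_set_file_dirty()          account newly dirtied blocks
 *	    nilfs_transaction_commit()      close the transaction
 */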
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323		goto failed_acl; /* never occurs. When nilfs_init_acl()
324				    is supported, proper cancellation of
325				    the above jobs should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351 /* XXX: check error code? Is there any thing I can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436 ii->i_acl = NILFS_ACL_NOT_CACHED;
437 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 if (nilfs_read_inode_common(inode, raw_inode))
440 goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518 /* When extending inode, nilfs->ns_inode_size should be checked
519 for substitutions of appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538 /* XXX: call with has_bmap = 0 is a workaround to avoid
539 deadlock of bmap. This delays update of i_bmap to just
540 before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
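
A worked example of the chunk size above: with 4 KiB blocks, NILFS_MAX_TRUNCATE_BLOCKS (16384) bounds each pass to 64 MiB, with nilfs_relax_pressure_in_lock() run between passes.

#include <stdio.h>

int main(void)
{
	unsigned long max_blocks = 16384;	/* NILFS_MAX_TRUNCATE_BLOCKS */
	unsigned long block_size = 4096;	/* 4 KiB blocks */

	printf("bytes truncated per pass: %lu MiB\n",
	       max_blocks * block_size >> 20);	/* 64 */
	return 0;
}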
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* May construct a logical segment and may fail in sync mode.
609 But truncate has no return value. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* May construct a logical segment and may fail in sync mode.
636 But delete_inode has no return value. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671 /* Caller of this function MUST lock s_inode_lock */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
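
The lock juggling above is a drop-lock / fetch / retake-lock / recheck pattern: the spinlock cannot be held across the sleeping ifile read, so another thread may install i_bh in the window, and the freshly fetched result is discarded in that case. A standalone pthreads model of the same pattern (compile with -pthread; all names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t demo_lock;
static int *demo_cached;

static int *demo_fetch_slow(void)	/* stands in for the ifile read */
{
	return malloc(sizeof(int));
}

static int *demo_load(void)
{
	int *p;

	pthread_spin_lock(&demo_lock);
	if (!demo_cached) {
		pthread_spin_unlock(&demo_lock);	/* can't sleep locked */
		p = demo_fetch_slow();
		if (!p)
			return NULL;
		pthread_spin_lock(&demo_lock);
		if (!demo_cached)
			demo_cached = p;	/* we won the race */
		else
			free(p);		/* someone beat us to it */
	}
	p = demo_cached;
	pthread_spin_unlock(&demo_lock);
	return p;
}

int main(void)
{
	pthread_spin_init(&demo_lock, PTHREAD_PROCESS_PRIVATE);
	printf("loaded: %p\n", (void *)demo_load());
	return 0;
}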
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730			return -EINVAL; /* NILFS_I_DIRTY may remain set on
731					   the inode being freed */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on the given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies data from a nilfs_inode to a corresponding inode
768 * entry in the inode block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..d6759b92006f
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,665 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h>
29#include <linux/nilfs2_fs.h>
30#include "nilfs.h"
31#include "segment.h"
32#include "bmap.h"
33#include "cpfile.h"
34#include "sufile.h"
35#include "dat.h"
36
37
38static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
39 struct nilfs_argv *argv, int dir,
40 ssize_t (*dofunc)(struct the_nilfs *,
41 __u64 *, int,
42 void *, size_t, size_t))
43{
44 void *buf;
45 void __user *base = (void __user *)(unsigned long)argv->v_base;
46 size_t maxmembs, total, n;
47 ssize_t nr;
48 int ret, i;
49 __u64 pos, ppos;
50
51 if (argv->v_nmembs == 0)
52 return 0;
53
54 if (argv->v_size > PAGE_SIZE)
55 return -EINVAL;
56
57 buf = (void *)__get_free_pages(GFP_NOFS, 0);
58 if (unlikely(!buf))
59 return -ENOMEM;
60 maxmembs = PAGE_SIZE / argv->v_size;
61
62 ret = 0;
63 total = 0;
64 pos = argv->v_index;
65 for (i = 0; i < argv->v_nmembs; i += n) {
66 n = (argv->v_nmembs - i < maxmembs) ?
67 argv->v_nmembs - i : maxmembs;
68 if ((dir & _IOC_WRITE) &&
69 copy_from_user(buf, base + argv->v_size * i,
70 argv->v_size * n)) {
71 ret = -EFAULT;
72 break;
73 }
74 ppos = pos;
75 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
76 n);
77 if (nr < 0) {
78 ret = nr;
79 break;
80 }
81 if ((dir & _IOC_READ) &&
82 copy_to_user(base + argv->v_size * i, buf,
83 argv->v_size * nr)) {
84 ret = -EFAULT;
85 break;
86 }
87 total += nr;
88 if ((size_t)nr < n)
89 break;
90 if (pos == ppos)
91 pos += n;
92 }
93 argv->v_nmembs = total;
94
95 free_pages((unsigned long)buf, 0);
96 return ret;
97}
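
A worked example of the chunking above: with a one-page bounce buffer, a 4096-byte page and 32-byte records give 128 records per copy_from_user()/copy_to_user() round trip. Record and request sizes below are illustrative.

#include <stdio.h>

int main(void)
{
	size_t page_size = 4096;
	size_t v_size = 32;	/* illustrative record size */
	size_t nmembs = 1000;	/* illustrative request size */
	size_t maxmembs = page_size / v_size;

	printf("records per chunk: %zu, chunks needed: %zu\n",
	       maxmembs, (nmembs + maxmembs - 1) / maxmembs);	/* 128, 8 */
	return 0;
}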
98
99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
100 unsigned int cmd, void __user *argp)
101{
102 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
103 struct nilfs_transaction_info ti;
104 struct nilfs_cpmode cpmode;
105 int ret;
106
107 if (!capable(CAP_SYS_ADMIN))
108 return -EPERM;
109 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
110 return -EFAULT;
111
112 nilfs_transaction_begin(inode->i_sb, &ti, 0);
113 ret = nilfs_cpfile_change_cpmode(
114 cpfile, cpmode.cm_cno, cpmode.cm_mode);
115 if (unlikely(ret < 0)) {
116 nilfs_transaction_abort(inode->i_sb);
117 return ret;
118 }
119 nilfs_transaction_commit(inode->i_sb); /* never fails */
120 return ret;
121}
122
123static int
124nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
125 unsigned int cmd, void __user *argp)
126{
127 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
128 struct nilfs_transaction_info ti;
129 __u64 cno;
130 int ret;
131
132 if (!capable(CAP_SYS_ADMIN))
133 return -EPERM;
134 if (copy_from_user(&cno, argp, sizeof(cno)))
135 return -EFAULT;
136
137 nilfs_transaction_begin(inode->i_sb, &ti, 0);
138 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
139 if (unlikely(ret < 0)) {
140 nilfs_transaction_abort(inode->i_sb);
141 return ret;
142 }
143 nilfs_transaction_commit(inode->i_sb); /* never fails */
144 return ret;
145}
146
147static ssize_t
148nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
149 void *buf, size_t size, size_t nmembs)
150{
151 int ret;
152
153 down_read(&nilfs->ns_segctor_sem);
154 ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
155 nmembs);
156 up_read(&nilfs->ns_segctor_sem);
157 return ret;
158}
159
160static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
161 unsigned int cmd, void __user *argp)
162{
163 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
164 struct nilfs_cpstat cpstat;
165 int ret;
166
167 down_read(&nilfs->ns_segctor_sem);
168 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
169 up_read(&nilfs->ns_segctor_sem);
170 if (ret < 0)
171 return ret;
172
173 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
174 ret = -EFAULT;
175 return ret;
176}
177
178static ssize_t
179nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180 void *buf, size_t size, size_t nmembs)
181{
182 int ret;
183
184 down_read(&nilfs->ns_segctor_sem);
185 ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
186 up_read(&nilfs->ns_segctor_sem);
187 return ret;
188}
189
190static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
191 unsigned int cmd, void __user *argp)
192{
193 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
194 struct nilfs_sustat sustat;
195 int ret;
196
197 down_read(&nilfs->ns_segctor_sem);
198 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
199 up_read(&nilfs->ns_segctor_sem);
200 if (ret < 0)
201 return ret;
202
203 if (copy_to_user(argp, &sustat, sizeof(sustat)))
204 ret = -EFAULT;
205 return ret;
206}
207
208static ssize_t
209nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
210 void *buf, size_t size, size_t nmembs)
211{
212 int ret;
213
214 down_read(&nilfs->ns_segctor_sem);
215 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
216 up_read(&nilfs->ns_segctor_sem);
217 return ret;
218}
219
220static ssize_t
221nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
222 void *buf, size_t size, size_t nmembs)
223{
224 struct inode *dat = nilfs_dat_inode(nilfs);
225 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
226 struct nilfs_bdesc *bdescs = buf;
227 int ret, i;
228
229 down_read(&nilfs->ns_segctor_sem);
230 for (i = 0; i < nmembs; i++) {
231 ret = nilfs_bmap_lookup_at_level(bmap,
232 bdescs[i].bd_offset,
233 bdescs[i].bd_level + 1,
234 &bdescs[i].bd_blocknr);
235 if (ret < 0) {
236 if (ret != -ENOENT) {
237 up_read(&nilfs->ns_segctor_sem);
238 return ret;
239 }
240 bdescs[i].bd_blocknr = 0;
241 }
242 }
243 up_read(&nilfs->ns_segctor_sem);
244 return nmembs;
245}
246
247static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
248 unsigned int cmd, void __user *argp)
249{
250 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
251 struct nilfs_argv argv;
252 int ret;
253
254 if (copy_from_user(&argv, argp, sizeof(argv)))
255 return -EFAULT;
256
257 if (argv.v_size != sizeof(struct nilfs_bdesc))
258 return -EINVAL;
259
260 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
261 nilfs_ioctl_do_get_bdescs);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static int nilfs_ioctl_move_inode_block(struct inode *inode,
271 struct nilfs_vdesc *vdesc,
272 struct list_head *buffers)
273{
274 struct buffer_head *bh;
275 int ret;
276
277 if (vdesc->vd_flags == 0)
278 ret = nilfs_gccache_submit_read_data(
279 inode, vdesc->vd_offset, vdesc->vd_blocknr,
280 vdesc->vd_vblocknr, &bh);
281 else
282 ret = nilfs_gccache_submit_read_node(
283 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
284
285 if (unlikely(ret < 0)) {
286 if (ret == -ENOENT)
287 printk(KERN_CRIT
288 "%s: invalid virtual block address (%s): "
289 "ino=%llu, cno=%llu, offset=%llu, "
290 "blocknr=%llu, vblocknr=%llu\n",
291 __func__, vdesc->vd_flags ? "node" : "data",
292 (unsigned long long)vdesc->vd_ino,
293 (unsigned long long)vdesc->vd_cno,
294 (unsigned long long)vdesc->vd_offset,
295 (unsigned long long)vdesc->vd_blocknr,
296 (unsigned long long)vdesc->vd_vblocknr);
297 return ret;
298 }
299 bh->b_private = vdesc;
300 list_add_tail(&bh->b_assoc_buffers, buffers);
301 return 0;
302}
303
304static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
305 struct nilfs_argv *argv, void *buf)
306{
307 size_t nmembs = argv->v_nmembs;
308 struct inode *inode;
309 struct nilfs_vdesc *vdesc;
310 struct buffer_head *bh, *n;
311 LIST_HEAD(buffers);
312 ino_t ino;
313 __u64 cno;
314 int i, ret;
315
316 for (i = 0, vdesc = buf; i < nmembs; ) {
317 ino = vdesc->vd_ino;
318 cno = vdesc->vd_cno;
319 inode = nilfs_gc_iget(nilfs, ino, cno);
320 if (unlikely(inode == NULL)) {
321 ret = -ENOMEM;
322 goto failed;
323 }
324 do {
325 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
326 &buffers);
327 if (unlikely(ret < 0))
328 goto failed;
329 vdesc++;
330 } while (++i < nmembs &&
331 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
332 }
333
334 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
335 ret = nilfs_gccache_wait_and_mark_dirty(bh);
336 if (unlikely(ret < 0)) {
337 if (ret == -EEXIST) {
338 vdesc = bh->b_private;
339 printk(KERN_CRIT
340 "%s: conflicting %s buffer: "
341 "ino=%llu, cno=%llu, offset=%llu, "
342 "blocknr=%llu, vblocknr=%llu\n",
343 __func__,
344 vdesc->vd_flags ? "node" : "data",
345 (unsigned long long)vdesc->vd_ino,
346 (unsigned long long)vdesc->vd_cno,
347 (unsigned long long)vdesc->vd_offset,
348 (unsigned long long)vdesc->vd_blocknr,
349 (unsigned long long)vdesc->vd_vblocknr);
350 }
351 goto failed;
352 }
353 list_del_init(&bh->b_assoc_buffers);
354 bh->b_private = NULL;
355 brelse(bh);
356 }
357 return nmembs;
358
359 failed:
360 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
361 list_del_init(&bh->b_assoc_buffers);
362 bh->b_private = NULL;
363 brelse(bh);
364 }
365 return ret;
366}
367
368static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
369 struct nilfs_argv *argv, void *buf)
370{
371 size_t nmembs = argv->v_nmembs;
372 struct inode *cpfile = nilfs->ns_cpfile;
373 struct nilfs_period *periods = buf;
374 int ret, i;
375
376 for (i = 0; i < nmembs; i++) {
377 ret = nilfs_cpfile_delete_checkpoints(
378 cpfile, periods[i].p_start, periods[i].p_end);
379 if (ret < 0)
380 return ret;
381 }
382 return nmembs;
383}
384
385static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
386 struct nilfs_argv *argv, void *buf)
387{
388 size_t nmembs = argv->v_nmembs;
389 int ret;
390
391 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
392
393 return (ret < 0) ? ret : nmembs;
394}
395
396static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
397 struct nilfs_argv *argv, void *buf)
398{
399 size_t nmembs = argv->v_nmembs;
400 struct inode *dat = nilfs_dat_inode(nilfs);
401 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
402 struct nilfs_bdesc *bdescs = buf;
403 int ret, i;
404
405 for (i = 0; i < nmembs; i++) {
406 /* XXX: use macro or inline func to check liveness */
407 ret = nilfs_bmap_lookup_at_level(bmap,
408 bdescs[i].bd_offset,
409 bdescs[i].bd_level + 1,
410 &bdescs[i].bd_blocknr);
411 if (ret < 0) {
412 if (ret != -ENOENT)
413 return ret;
414 bdescs[i].bd_blocknr = 0;
415 }
416 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
417 /* skip dead block */
418 continue;
419 if (bdescs[i].bd_level == 0) {
420 ret = nilfs_mdt_mark_block_dirty(dat,
421 bdescs[i].bd_offset);
422 if (ret < 0) {
423 WARN_ON(ret == -ENOENT);
424 return ret;
425 }
426 } else {
427 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
428 bdescs[i].bd_level);
429 if (ret < 0) {
430 WARN_ON(ret == -ENOENT);
431 return ret;
432 }
433 }
434 }
435 return nmembs;
436}
437
438static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
439 struct nilfs_argv *argv, void *buf)
440{
441 size_t nmembs = argv->v_nmembs;
442 struct nilfs_sb_info *sbi = nilfs->ns_writer;
443 int ret;
444
445 if (unlikely(!sbi)) {
446	/* never happens because this is called for a writable mount */
447 WARN_ON(1);
448 return -EROFS;
449 }
450 ret = nilfs_segctor_add_segments_to_be_freed(
451 NILFS_SC(sbi), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv, void **kbufs)
458{
459 const char *msg;
460 int ret;
461
462 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
463 if (ret < 0) {
464 msg = "cannot read source blocks";
465 goto failed;
466 }
467
468 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
469 if (ret < 0) {
470 /*
471 * can safely abort because checkpoints can be removed
472 * independently.
473 */
474 msg = "cannot delete checkpoints";
475 goto failed;
476 }
477 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]);
478 if (ret < 0) {
479 /*
480 * can safely abort because DAT file is updated atomically
481 * using a copy-on-write technique.
482 */
483 msg = "cannot delete virtual blocks from DAT file";
484 goto failed;
485 }
486 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]);
487 if (ret < 0) {
488 /*
489 * can safely abort because the operation is nondestructive.
490 */
491 msg = "cannot mark copying blocks dirty";
492 goto failed;
493 }
494 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
495 if (ret < 0) {
496 /*
497 * can safely abort because this operation is atomic.
498 */
499 msg = "cannot set segments to be freed";
500 goto failed;
501 }
502 return 0;
503
504 failed:
505 nilfs_remove_all_gcinode(nilfs);
506 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
507 msg, ret);
508 return ret;
509}
510
511static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
512 unsigned int cmd, void __user *argp)
513{
514 struct nilfs_argv argv[5];
515	static const size_t argsz[5] = {
516 sizeof(struct nilfs_vdesc),
517 sizeof(struct nilfs_period),
518 sizeof(__u64),
519 sizeof(struct nilfs_bdesc),
520 sizeof(__u64),
521 };
522 void __user *base;
523 void *kbufs[5];
524 struct the_nilfs *nilfs;
525 size_t len, nsegs;
526 int n, ret;
527
528 if (!capable(CAP_SYS_ADMIN))
529 return -EPERM;
530
531 if (copy_from_user(argv, argp, sizeof(argv)))
532 return -EFAULT;
533
534 nsegs = argv[4].v_nmembs;
535 if (argv[4].v_size != argsz[4])
536 return -EINVAL;
537 /*
538	 * argv[4] points to the segment numbers this ioctl cleans.  We
539	 * use kmalloc() for its buffer (via memdup_user()) because the
540	 * memory used for the segment numbers is small enough.
541 */
542 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
543 nsegs * sizeof(__u64));
544 if (IS_ERR(kbufs[4]))
545 return PTR_ERR(kbufs[4]);
546
547 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
548
549 for (n = 0; n < 4; n++) {
550 ret = -EINVAL;
551 if (argv[n].v_size != argsz[n])
552 goto out_free;
553
554 if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
555 goto out_free;
556
557 len = argv[n].v_size * argv[n].v_nmembs;
558 base = (void __user *)(unsigned long)argv[n].v_base;
559 if (len == 0) {
560 kbufs[n] = NULL;
561 continue;
562 }
563
564 kbufs[n] = vmalloc(len);
565 if (!kbufs[n]) {
566 ret = -ENOMEM;
567 goto out_free;
568 }
569 if (copy_from_user(kbufs[n], base, len)) {
570 ret = -EFAULT;
571 vfree(kbufs[n]);
572 goto out_free;
573 }
574 }
575
576 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
577
578 out_free:
579 while (--n >= 0)
580 vfree(kbufs[n]);
581 kfree(kbufs[4]);
582 return ret;
583}
584
585static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
586 unsigned int cmd, void __user *argp)
587{
588 __u64 cno;
589 int ret;
590
591 ret = nilfs_construct_segment(inode->i_sb);
592 if (ret < 0)
593 return ret;
594
595 if (argp != NULL) {
596 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
597 if (copy_to_user(argp, &cno, sizeof(cno)))
598 return -EFAULT;
599 }
600 return 0;
601}
602
603static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
604 unsigned int cmd, void __user *argp,
605 size_t membsz,
606 ssize_t (*dofunc)(struct the_nilfs *,
607 __u64 *, int,
608 void *, size_t, size_t))
609
610{
611 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
612 struct nilfs_argv argv;
613 int ret;
614
615 if (copy_from_user(&argv, argp, sizeof(argv)))
616 return -EFAULT;
617
618 if (argv.v_size != membsz)
619 return -EINVAL;
620
621 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
622 if (ret < 0)
623 return ret;
624
625 if (copy_to_user(argp, &argv, sizeof(argv)))
626 ret = -EFAULT;
627 return ret;
628}
629
630long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
631{
632 struct inode *inode = filp->f_dentry->d_inode;
633	void __user *argp = (void __user *)arg;
634
635 switch (cmd) {
636 case NILFS_IOCTL_CHANGE_CPMODE:
637 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
638 case NILFS_IOCTL_DELETE_CHECKPOINT:
639 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_CPINFO:
641 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
642 sizeof(struct nilfs_cpinfo),
643 nilfs_ioctl_do_get_cpinfo);
644 case NILFS_IOCTL_GET_CPSTAT:
645 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
646 case NILFS_IOCTL_GET_SUINFO:
647 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
648 sizeof(struct nilfs_suinfo),
649 nilfs_ioctl_do_get_suinfo);
650 case NILFS_IOCTL_GET_SUSTAT:
651 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
652 case NILFS_IOCTL_GET_VINFO:
653 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
654 sizeof(struct nilfs_vinfo),
655 nilfs_ioctl_do_get_vinfo);
656 case NILFS_IOCTL_GET_BDESCS:
657 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
658 case NILFS_IOCTL_CLEAN_SEGMENTS:
659 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
660 case NILFS_IOCTL_SYNC:
661 return nilfs_ioctl_sync(inode, filp, cmd, argp);
662 default:
663 return -ENOTTY;
664 }
665}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..bb78745a0e30
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,564 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49	/* Caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct super_block *sb = inode->i_sb;
81 struct nilfs_transaction_info ti;
82 struct buffer_head *bh;
83 int err;
84
85 if (!sb) {
86 /*
87 * Make sure this function is not called from any
88 * read-only context.
89 */
90 if (!nilfs->ns_writer) {
91 WARN_ON(1);
92 err = -EROFS;
93 goto out;
94 }
95 sb = nilfs->ns_writer->s_super;
96 }
97
98 nilfs_transaction_begin(sb, &ti, 0);
99
100 err = -ENOMEM;
101 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
102 if (unlikely(!bh))
103 goto failed_unlock;
104
105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh))
107 goto failed_bh;
108#if 0
109 /* The uptodate flag is not protected by the page lock, but
110	   the mapped flag is.  Thus, we don't have to wait for the buffer. */
111 wait_on_buffer(bh);
112 if (buffer_uptodate(bh))
113 goto failed_bh;
114#endif
115
116 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
118 if (likely(!err)) {
119 get_bh(bh);
120 *out_bh = bh;
121 }
122
123 failed_bh:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 brelse(bh);
127
128 failed_unlock:
129 if (likely(!err))
130 err = nilfs_transaction_commit(sb);
131 else
132 nilfs_transaction_abort(sb);
133 out:
134 return err;
135}
136
137static int
138nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh)
140{
141 struct buffer_head *bh;
142 unsigned long blknum = 0;
143 int ret = -ENOMEM;
144
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
146 if (unlikely(!bh))
147 goto failed;
148
149 ret = -EEXIST; /* internal code */
150 if (buffer_uptodate(bh))
151 goto out;
152
153 if (mode == READA) {
154 if (!trylock_buffer(bh)) {
155 ret = -EBUSY;
156 goto failed_bh;
157 }
158 } else /* mode == READ */
159 lock_buffer(bh);
160
161 if (buffer_uptodate(bh)) {
162 unlock_buffer(bh);
163 goto out;
164 }
165 if (!buffer_mapped(bh)) { /* unused buffer */
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
167 &blknum);
168 if (unlikely(ret)) {
169 unlock_buffer(bh);
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 }
176
177 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh);
179 submit_bh(mode, bh);
180 ret = 0;
181 out:
182 get_bh(bh);
183 *out_bh = bh;
184
185 failed_bh:
186 unlock_page(bh->b_page);
187 page_cache_release(bh->b_page);
188 brelse(bh);
189 failed:
190 return ret;
191}
192
193static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
194 struct buffer_head **out_bh)
195{
196 struct buffer_head *first_bh, *bh;
197 unsigned long blkoff;
198 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
199 int err;
200
201 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
202 if (err == -EEXIST) /* internal code */
203 goto out;
204
205 if (unlikely(err))
206 goto failed;
207
208 blkoff = block + 1;
209 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
210 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
211 if (likely(!err || err == -EEXIST))
212 brelse(bh);
213 else if (err != -EBUSY)
214 break; /* abort readahead if bmap lookup failed */
215
216 if (!buffer_locked(first_bh))
217 goto out_no_wait;
218 }
219
220 wait_on_buffer(first_bh);
221
222 out_no_wait:
223 err = -EIO;
224 if (!buffer_uptodate(first_bh))
225 goto failed_bh;
226 out:
227 *out_bh = first_bh;
228 return 0;
229
230 failed_bh:
231 brelse(first_bh);
232 failed:
233 return err;
234}
235
236/**
237 * nilfs_mdt_get_block - read or create a buffer on a meta data file.
238 * @inode: inode of the meta data file
239 * @blkoff: block offset
240 * @create: create flag
241 * @init_block: initializer used for newly allocated block
242 * @out_bh: output of a pointer to the buffer_head
243 *
244 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
245 * a new buffer if @create is not zero.  On success, the returned buffer
246 * is guaranteed to be either existing or newly formatted, under a buffer lock.
247 * @out_bh is substituted only when zero is returned.
248 *
249 * Return Value: On success, it returns 0.  On error, one of the following
250 * negative error codes is returned.
251 *
252 * %-ENOMEM - Insufficient memory available.
253 *
254 * %-EIO - I/O error
255 *
256 * %-ENOENT - the specified block does not exist (hole block)
257 *
258 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
259 *
260 * %-EROFS - Read only filesystem (for create mode)
261 */
262int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
263 void (*init_block)(struct inode *,
264 struct buffer_head *, void *),
265 struct buffer_head **out_bh)
266{
267 int ret;
268
269	/* Should be rewritten by merging in nilfs_mdt_read_block() */
270 retry:
271 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
272 if (!create || ret != -ENOENT)
273 return ret;
274
275 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
276 if (unlikely(ret == -EEXIST)) {
277 /* create = 0; */ /* limit read-create loop retries */
278 goto retry;
279 }
280 return ret;
281}
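
/*
 * Illustrative sketch, not part of the patch: a typical caller creates
 * or reads back a metadata block and drops its buffer reference when
 * done.  The initializer and the callers' names below are hypothetical.
 */
static void example_init_block(struct inode *inode, struct buffer_head *bh,
			       void *kaddr)
{
	/* format the freshly zeroed block; kaddr maps the block's data */
}

static int example_touch_block(struct inode *mdt_inode, unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_get_block(mdt_inode, blkoff, 1 /* create */,
				  example_init_block, &bh);
	if (err)
		return err;	/* e.g. -ENOMEM, -EIO, or -EROFS */

	nilfs_mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(mdt_inode);
	brelse(bh);		/* release the reference taken above */
	return 0;
}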
282
283/**
284 * nilfs_mdt_delete_block - make a hole in the meta data file.
285 * @inode: inode of the meta data file
286 * @block: block offset
287 *
288 * Return Value: On success, zero is returned.
289 * On error, one of the following negative error codes is returned.
290 *
291 * %-ENOMEM - Insufficient memory available.
292 *
293 * %-EIO - I/O error
294 *
295 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
296 */
297int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
298{
299 struct nilfs_inode_info *ii = NILFS_I(inode);
300 int err;
301
302 err = nilfs_bmap_delete(ii->i_bmap, block);
303 if (!err || err == -ENOENT) {
304 nilfs_mdt_mark_dirty(inode);
305 nilfs_mdt_forget_block(inode, block);
306 }
307 return err;
308}
309
310/**
311 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
312 * @inode: inode of the meta data file
313 * @block: block offset
314 *
315 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer and
316 * tries to release the page containing the buffer from the page cache.
317 *
318 * Return Value: On success, 0 is returned. On error, one of the following
319 * negative error codes is returned.
320 *
321 * %-EBUSY - page has an active buffer.
322 *
323 * %-ENOENT - page cache has no page addressed by the offset.
324 */
325int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
326{
327 pgoff_t index = (pgoff_t)block >>
328 (PAGE_CACHE_SHIFT - inode->i_blkbits);
329 struct page *page;
330 unsigned long first_block;
331 int ret = 0;
332 int still_dirty;
333
334 page = find_lock_page(inode->i_mapping, index);
335 if (!page)
336 return -ENOENT;
337
338 wait_on_page_writeback(page);
339
340 first_block = (unsigned long)index <<
341 (PAGE_CACHE_SHIFT - inode->i_blkbits);
342 if (page_has_buffers(page)) {
343 struct buffer_head *bh;
344
345 bh = nilfs_page_get_nth_block(page, block - first_block);
346 nilfs_forget_buffer(bh);
347 }
348 still_dirty = PageDirty(page);
349 unlock_page(page);
350 page_cache_release(page);
351
352 if (still_dirty ||
353 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
354 ret = -EBUSY;
355 return ret;
356}
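
/*
 * Illustrative sketch, not part of the patch: a caller that merely wants
 * to evict a block from the cache can treat both documented error codes
 * as benign, much as nilfs_mdt_delete_block() above ignores the return
 * value.  The function name is hypothetical.
 */
static void example_evict_block(struct inode *mdt_inode, unsigned long blkoff)
{
	int err = nilfs_mdt_forget_block(mdt_inode, blkoff);

	/*
	 * -ENOENT: nothing was cached at that offset.
	 * -EBUSY:  the page is still dirty or in use; retry later.
	 */
	WARN_ON(err && err != -ENOENT && err != -EBUSY);
}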
357
358/**
359 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
360 * @inode: inode of the meta data file
361 * @block: block offset
362 *
363 * Return Value: On success, it returns 0.  On error, one of the following
364 * negative error codes is returned.
365 *
366 * %-ENOMEM - Insufficient memory available.
367 *
368 * %-EIO - I/O error
369 *
370 * %-ENOENT - the specified block does not exist (hole block)
371 *
372 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
373 */
374int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
375{
376 struct buffer_head *bh;
377 int err;
378
379 err = nilfs_mdt_read_block(inode, block, &bh);
380 if (unlikely(err))
381 return err;
382 nilfs_mark_buffer_dirty(bh);
383 nilfs_mdt_mark_dirty(inode);
384 brelse(bh);
385 return 0;
386}
387
388int nilfs_mdt_fetch_dirty(struct inode *inode)
389{
390 struct nilfs_inode_info *ii = NILFS_I(inode);
391
392 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
393 set_bit(NILFS_I_DIRTY, &ii->i_state);
394 return 1;
395 }
396 return test_bit(NILFS_I_DIRTY, &ii->i_state);
397}
398
399static int
400nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
401{
402 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data);
404 struct super_block *sb = inode->i_sb;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0;
407
408 redirty_page_for_writepage(wbc, page);
409 unlock_page(page);
410
411 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
415 if (!writer)
416 return -EROFS;
417 sb = writer->s_super;
418 }
419
420 if (wbc->sync_mode == WB_SYNC_ALL)
421 err = nilfs_construct_segment(sb);
422 else if (wbc->for_reclaim)
423 nilfs_flush_segment(sb, inode->i_ino);
424
425 if (writer)
426 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
427 return err;
428}
429
430
431static struct address_space_operations def_mdt_aops = {
432 .writepage = nilfs_mdt_write_page,
433};
434
435static struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops;
437
438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
440 * ifile, or gcinodes. This allows the B-tree code and segment constructor
441 * to treat them like regular files, and this helps to simplify the
442 * implementation.
443 * On the other hand, some of these pseudo inodes are irregular:
444 * they don't have a valid inode->i_sb pointer because their lifetimes are
445 * longer than those of the super block structs; they may persist across
446 * several consecutive mounts/umounts.  This would need further discussion.
447 */
448struct inode *
449nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
450 ino_t ino, gfp_t gfp_mask)
451{
452 struct inode *inode = nilfs_alloc_inode(sb);
453
454 if (!inode)
455 return NULL;
456 else {
457 struct address_space * const mapping = &inode->i_data;
458 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
459
460 if (!mi) {
461 nilfs_destroy_inode(inode);
462 return NULL;
463 }
464 mi->mi_nilfs = nilfs;
465 init_rwsem(&mi->mi_sem);
466
467 inode->i_sb = sb; /* sb may be NULL for some meta data files */
468 inode->i_blkbits = nilfs->ns_blocksize_bits;
469 inode->i_flags = 0;
470 atomic_set(&inode->i_count, 1);
471 inode->i_nlink = 1;
472 inode->i_ino = ino;
473 inode->i_mode = S_IFREG;
474 inode->i_private = mi;
475
476#ifdef INIT_UNUSED_INODE_FIELDS
477 atomic_set(&inode->i_writecount, 0);
478 inode->i_size = 0;
479 inode->i_blocks = 0;
480 inode->i_bytes = 0;
481 inode->i_generation = 0;
482#ifdef CONFIG_QUOTA
483 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
484#endif
485 inode->i_pipe = NULL;
486 inode->i_bdev = NULL;
487 inode->i_cdev = NULL;
488 inode->i_rdev = 0;
489#ifdef CONFIG_SECURITY
490 inode->i_security = NULL;
491#endif
492 inode->dirtied_when = 0;
493
494 INIT_LIST_HEAD(&inode->i_list);
495 INIT_LIST_HEAD(&inode->i_sb_list);
496 inode->i_state = 0;
497#endif
498
499 spin_lock_init(&inode->i_lock);
500 mutex_init(&inode->i_mutex);
501 init_rwsem(&inode->i_alloc_sem);
502
503 mapping->host = NULL; /* instead of inode */
504 mapping->flags = 0;
505 mapping_set_gfp_mask(mapping, gfp_mask);
506 mapping->assoc_mapping = NULL;
507 mapping->backing_dev_info = nilfs->ns_bdi;
508
509 inode->i_mapping = mapping;
510 }
511
512 return inode;
513}
514
515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
516 ino_t ino, gfp_t gfp_mask)
517{
518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
519
520 if (!inode)
521 return NULL;
522
523 inode->i_op = &def_mdt_iops;
524 inode->i_fop = &def_mdt_fops;
525 inode->i_mapping->a_ops = &def_mdt_aops;
526 return inode;
527}
528
529void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
530 unsigned header_size)
531{
532 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
533
534 mi->mi_entry_size = entry_size;
535 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
536 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
537}
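
/*
 * Illustrative sketch, not part of the patch: with a 4096-byte block,
 * 64-byte entries, and a 24-byte header, the function above computes
 * mi_entries_per_block = 4096 / 64 = 64 and
 * mi_first_entry_offset = DIV_ROUND_UP(24, 64) = 1, so the first entry
 * slot is reserved for the header.  The sizes here are made up, not
 * those of any real NILFS metadata file.
 */
static void example_size_mdt_file(struct inode *mdt_inode)
{
	nilfs_mdt_set_entry_size(mdt_inode, 64 /* entry */, 24 /* header */);
}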
538
539void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
540{
541 shadow->i_mapping->assoc_mapping = orig->i_mapping;
542 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
543 &NILFS_I(orig)->i_btnode_cache;
544}
545
546void nilfs_mdt_clear(struct inode *inode)
547{
548 struct nilfs_inode_info *ii = NILFS_I(inode);
549
550 invalidate_mapping_pages(inode->i_mapping, 0, -1);
551 truncate_inode_pages(inode->i_mapping, 0);
552
553 nilfs_bmap_clear(ii->i_bmap);
554 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
555}
556
557void nilfs_mdt_destroy(struct inode *inode)
558{
559 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
560
561 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
562 kfree(mdi);
563 nilfs_destroy_inode(inode);
564}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..da6fc0bba2e5
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,314 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * nilfs inode data in memory
39 */
40struct nilfs_inode_info {
41 __u32 i_flags;
42 unsigned long i_state; /* Dynamic state flags */
43 struct nilfs_bmap *i_bmap;
44 union nilfs_bmap_union i_bmap_union;
45 __u64 i_xattr; /* sector_t ??? */
46 __u32 i_dir_start_lookup;
47 __u64 i_cno; /* check point number for GC inode */
48 struct address_space i_btnode_cache;
49 struct list_head i_dirty; /* List for connecting dirty files */
50
51#ifdef CONFIG_NILFS_XATTR
52 /*
53 * Extended attributes can be read independently of the main file
54 * data. Taking i_sem even when reading would cause contention
55 * between readers of EAs and writers of regular file data, so
56 * instead we synchronize on xattr_sem when reading or changing
57 * EAs.
58 */
59 struct rw_semaphore xattr_sem;
60#endif
61#ifdef CONFIG_NILFS_POSIX_ACL
62 struct posix_acl *i_acl;
63 struct posix_acl *i_default_acl;
64#endif
65 struct buffer_head *i_bh; /* i_bh contains a new or dirty
66 disk inode */
67 struct inode vfs_inode;
68};
69
70static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
71{
72 return container_of(inode, struct nilfs_inode_info, vfs_inode);
73}
74
75static inline struct nilfs_inode_info *
76NILFS_BMAP_I(const struct nilfs_bmap *bmap)
77{
78 return container_of((union nilfs_bmap_union *)bmap,
79 struct nilfs_inode_info,
80 i_bmap_union);
81}
82
83static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
84{
85 struct nilfs_inode_info *ii =
86 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
87 return &ii->vfs_inode;
88}
89
90static inline struct inode *NILFS_AS_I(struct address_space *mapping)
91{
92 return (mapping->host) ? :
93 container_of(mapping, struct inode, i_data);
94}
95
96/*
97 * Dynamic state flags of NILFS on-memory inode (i_state)
98 */
99enum {
100 NILFS_I_NEW = 0, /* Inode is newly created */
101 NILFS_I_DIRTY, /* The file is dirty */
102 NILFS_I_QUEUED, /* inode is in dirty_files list */
103 NILFS_I_BUSY, /* inode is grabbed by a segment
104 constructor */
105 NILFS_I_COLLECTED, /* All dirty blocks are collected */
106 NILFS_I_UPDATED, /* The file has been written back */
107 NILFS_I_INODE_DIRTY, /* write_inode is requested */
108 NILFS_I_BMAP, /* has bmap and btnode_cache */
109 NILFS_I_GCINODE, /* inode for GC, on memory only */
110 NILFS_I_GCDAT, /* shadow DAT, on memory only */
111};
112
113/*
114 * Macros to check inode numbers
115 */
116#define NILFS_MDT_INO_BITS \
117 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
118 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
119 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
120
121#define NILFS_SYS_INO_BITS \
122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
123
124#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
125
126#define NILFS_MDT_INODE(sb, ino) \
127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
128#define NILFS_VALID_INODE(sb, ino) \
129 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
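
/*
 * Illustrative sketch, not part of the patch: reserved metadata inode
 * numbers such as NILFS_DAT_INO satisfy NILFS_MDT_INODE(), while an
 * ordinary inode (ino >= NILFS_FIRST_INO(sb)) only satisfies
 * NILFS_VALID_INODE().  The helper name is hypothetical.
 */
static inline int example_is_mdt_ino(struct super_block *sb, ino_t ino)
{
	return NILFS_MDT_INODE(sb, ino);
}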
130
131/**
132 * struct nilfs_transaction_info: context information for synchronization
133 * @ti_magic: Magic number
134 * @ti_save: Backup of journal_info field of task_struct
135 * @ti_flags: Flags
136 * @ti_count: Nest level
137 * @ti_garbage: List of inodes to be put when releasing the semaphore
138 */
139struct nilfs_transaction_info {
140 u32 ti_magic;
141 void *ti_save;
142	/* This should never be used.  If it is,
143	   one of the other filesystems has a bug. */
144 unsigned short ti_flags;
145 unsigned short ti_count;
146 struct list_head ti_garbage;
147};
148
149/* ti_magic */
150#define NILFS_TI_MAGIC 0xd9e392fb
151
152/* ti_flags */
153#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
154#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
155 end of transaction. */
156#define NILFS_TI_GC 0x0004 /* GC context */
157#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
158#define NILFS_TI_WRITER 0x0010 /* Constructor context */
159
160
161int nilfs_transaction_begin(struct super_block *,
162 struct nilfs_transaction_info *, int);
163int nilfs_transaction_commit(struct super_block *);
164void nilfs_transaction_abort(struct super_block *);
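
/*
 * Illustrative sketch, not part of the patch: the canonical calling
 * pattern for the transaction API, as used throughout the filesystem.
 * do_example_update() is a hypothetical modification step; judging by
 * the call sites, the third argument of nilfs_transaction_begin()
 * requests a free-space (vacancy) check for space-consuming operations.
 */
static int do_example_update(struct super_block *sb);	/* hypothetical */

static inline int example_transactional_update(struct super_block *sb)
{
	struct nilfs_transaction_info ti;
	int err;

	err = nilfs_transaction_begin(sb, &ti, 1);
	if (err)
		return err;

	err = do_example_update(sb);
	if (!err)
		err = nilfs_transaction_commit(sb);
	else
		nilfs_transaction_abort(sb);
	return err;
}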
165
166static inline void nilfs_set_transaction_flag(unsigned int flag)
167{
168 struct nilfs_transaction_info *ti = current->journal_info;
169
170 ti->ti_flags |= flag;
171}
172
173static inline int nilfs_test_transaction_flag(unsigned int flag)
174{
175 struct nilfs_transaction_info *ti = current->journal_info;
176
177 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
178 return 0;
179 return !!(ti->ti_flags & flag);
180}
181
182static inline int nilfs_doing_gc(void)
183{
184 return nilfs_test_transaction_flag(NILFS_TI_GC);
185}
186
187static inline int nilfs_doing_construction(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
190}
191
192static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
193{
194 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
195}
196
197/*
198 * function prototype
199 */
200#ifdef CONFIG_NILFS_POSIX_ACL
201#error "NILFS: not yet supported POSIX ACL"
202extern int nilfs_permission(struct inode *, int, struct nameidata *);
203extern int nilfs_acl_chmod(struct inode *);
204extern int nilfs_init_acl(struct inode *, struct inode *);
205#else
206#define nilfs_permission NULL
207
208static inline int nilfs_acl_chmod(struct inode *inode)
209{
210 return 0;
211}
212
213static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
214{
215 inode->i_mode &= ~current_umask();
216 return 0;
217}
218#endif
219
220#define NILFS_ATIME_DISABLE
221
222/* dir.c */
223extern int nilfs_add_link(struct dentry *, struct inode *);
224extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
225extern int nilfs_make_empty(struct inode *, struct inode *);
226extern struct nilfs_dir_entry *
227nilfs_find_entry(struct inode *, struct dentry *, struct page **);
228extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
229extern int nilfs_empty_dir(struct inode *);
230extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
231extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
232 struct page *, struct inode *);
233
234/* file.c */
235extern int nilfs_sync_file(struct file *, struct dentry *, int);
236
237/* ioctl.c */
238long nilfs_ioctl(struct file *, unsigned int, unsigned long);
239int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
240 void **);
241
242/* inode.c */
243extern struct inode *nilfs_new_inode(struct inode *, int);
244extern void nilfs_free_inode(struct inode *);
245extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
246extern void nilfs_set_inode_flags(struct inode *);
247extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
248extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
249extern struct inode *nilfs_iget(struct super_block *, unsigned long);
250extern void nilfs_update_inode(struct inode *, struct buffer_head *);
251extern void nilfs_truncate(struct inode *);
252extern void nilfs_delete_inode(struct inode *);
253extern int nilfs_setattr(struct dentry *, struct iattr *);
254extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
255 struct buffer_head **);
256extern int nilfs_inode_dirty(struct inode *);
257extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
258 unsigned);
259extern int nilfs_mark_inode_dirty(struct inode *);
260extern void nilfs_dirty_inode(struct inode *);
261
262/* namei.c */
263extern struct dentry *nilfs_get_parent(struct dentry *);
264
265/* super.c */
266extern struct inode *nilfs_alloc_inode(struct super_block *);
267extern void nilfs_destroy_inode(struct inode *);
268extern void nilfs_error(struct super_block *, const char *, const char *, ...)
269 __attribute__ ((format (printf, 3, 4)));
270extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
271 __attribute__ ((format (printf, 3, 4)));
272extern struct nilfs_super_block *
273nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
274extern int nilfs_store_magic_and_option(struct super_block *,
275 struct nilfs_super_block *, char *);
276extern int nilfs_commit_super(struct nilfs_sb_info *, int);
277extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
278extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
279
280/* gcinode.c */
281int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
282 struct buffer_head **);
283int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
284 struct buffer_head **);
285int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
286int nilfs_init_gccache(struct the_nilfs *);
287void nilfs_destroy_gccache(struct the_nilfs *);
288void nilfs_clear_gcinode(struct inode *);
289struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
290void nilfs_remove_all_gcinode(struct the_nilfs *);
291
292/* gcdat.c */
293int nilfs_init_gcdat_inode(struct the_nilfs *);
294void nilfs_commit_gcdat_inode(struct the_nilfs *);
295void nilfs_clear_gcdat_inode(struct the_nilfs *);
296
297/*
298 * Inodes and files operations
299 */
300extern struct file_operations nilfs_dir_operations;
301extern struct inode_operations nilfs_file_inode_operations;
302extern struct file_operations nilfs_file_operations;
303extern struct address_space_operations nilfs_aops;
304extern struct inode_operations nilfs_dir_inode_operations;
305extern struct inode_operations nilfs_special_inode_operations;
306extern struct inode_operations nilfs_symlink_inode_operations;
307
308/*
309 * filesystem type
310 */
311extern struct file_system_type nilfs_fs_type;
312
313
314#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..a2692bbc7b50
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,541 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since the page cache of B-tree node pages and the data page caches of
62 * pseudo inodes do not have a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, an old-style mark_buffer_dirty(), which does not
 * dirty the inode, is open-coded here instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
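/*
 * Editorial sketch (not part of the original patch): a typical caller
 * pattern for a btnode buffer, whose page belongs to a mapping with a
 * NULL ->host, so a plain mark_buffer_dirty() would oops here:
 *
 *	memcpy(bh->b_data, data, bh->b_size);
 *	nilfs_mark_buffer_dirty(bh);
 *	brelse(bh);
 */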
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95 * A shadow page cache uses assoc_mapping to point to its original
96 * page cache. The following code falls back to the original cache
97 * when the given cache is a shadow and the lookup did not hit.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
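/*
 * Editorial sketch (not part of the original patch): the expected
 * calling convention.  The returned buffer carries a reference and its
 * page is left locked, so a caller does roughly:
 *
 *	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
 *	if (unlikely(!bh))
 *		return -ENOMEM;
 *	...read or fill bh->b_data...
 *	unlock_page(bh->b_page);
 *	page_cache_release(bh->b_page);
 *	brelse(bh);
 */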
120/**
121 * nilfs_forget_buffer - discard dirty state
123 * @bh: buffer head of the buffer to be discarded
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 clear_buffer_dirty(bh);
132 if (nilfs_page_buffers_clean(page))
133 __nilfs_clear_page_dirty(page);
134
135 clear_buffer_uptodate(bh);
136 clear_buffer_mapped(bh);
137 bh->b_blocknr = -1;
138 ClearPageUptodate(page);
139 ClearPageMappedToDisk(page);
140 unlock_buffer(bh);
141 brelse(bh);
142}
143
144/**
145 * nilfs_copy_buffer -- copy buffer data and flags
146 * @dbh: destination buffer
147 * @sbh: source buffer
148 */
149void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
150{
151 void *kaddr0, *kaddr1;
152 unsigned long bits;
153 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
154 struct buffer_head *bh;
155
156 kaddr0 = kmap_atomic(spage, KM_USER0);
157 kaddr1 = kmap_atomic(dpage, KM_USER1);
158 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
159 kunmap_atomic(kaddr1, KM_USER1);
160 kunmap_atomic(kaddr0, KM_USER0);
161
162 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
163 dbh->b_blocknr = sbh->b_blocknr;
164 dbh->b_bdev = sbh->b_bdev;
165
166 bh = dbh;
167 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
168 while ((bh = bh->b_this_page) != dbh) {
169 lock_buffer(bh);
170 bits &= bh->b_state;
171 unlock_buffer(bh);
172 }
173 if (bits & (1UL << BH_Uptodate))
174 SetPageUptodate(dpage);
175 else
176 ClearPageUptodate(dpage);
177 if (bits & (1UL << BH_Mapped))
178 SetPageMappedToDisk(dpage);
179 else
180 ClearPageMappedToDisk(dpage);
181}
182
183/**
184 * nilfs_page_buffers_clean - check whether a page has dirty buffers
185 * @page: page to be checked
186 *
187 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
188 * Otherwise, it returns a non-zero value.
189 */
190int nilfs_page_buffers_clean(struct page *page)
191{
192 struct buffer_head *bh, *head;
193
194 bh = head = page_buffers(page);
195 do {
196 if (buffer_dirty(bh))
197 return 0;
198 bh = bh->b_this_page;
199 } while (bh != head);
200 return 1;
201}
202
203void nilfs_page_bug(struct page *page)
204{
205 struct address_space *m;
206 unsigned long ino = 0;
207
208 if (unlikely(!page)) {
209 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
210 return;
211 }
212
213 m = page->mapping;
214 if (m) {
215 struct inode *inode = NILFS_AS_I(m);
216 if (inode != NULL)
217 ino = inode->i_ino;
218 }
219 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
220 "mapping=%p ino=%lu\n",
221 page, atomic_read(&page->_count),
222 (unsigned long long)page->index, page->flags, m, ino);
223
224 if (page_has_buffers(page)) {
225 struct buffer_head *bh, *head;
226 int i = 0;
227
228 bh = head = page_buffers(page);
229 do {
230 printk(KERN_CRIT
231 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
232 i++, bh, atomic_read(&bh->b_count),
233 (unsigned long long)bh->b_blocknr, bh->b_state);
234 bh = bh->b_this_page;
235 } while (bh != head);
236 }
237}
238
239/**
240 * nilfs_alloc_private_page - allocate a private page with buffer heads
241 * @bdev: block device to which the buffer heads will be tied
 * @size: size of each buffer in bytes
 * @state: initial state bits for each buffer head
 *
242 * Return Value: On success, a pointer to the allocated page is returned.
243 * On error, NULL is returned.
244 */
245struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
246 unsigned long state)
247{
248 struct buffer_head *bh, *head, *tail;
249 struct page *page;
250
251 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
252 if (unlikely(!page))
253 return NULL;
254
255 lock_page(page);
256 head = alloc_page_buffers(page, size, 0);
257 if (unlikely(!head)) {
258 unlock_page(page);
259 __free_page(page);
260 return NULL;
261 }
262
263 bh = head;
264 do {
265 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
266 tail = bh;
267 bh->b_bdev = bdev;
268 bh = bh->b_this_page;
269 } while (bh);
270
271 tail->b_this_page = head;
272 attach_page_buffers(page, head);
273
274 return page;
275}
276
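/*
 * Editorial sketch (not part of the original patch): the page comes
 * back locked with a page count of 1 and is released with the helper
 * below once it is no longer needed; bdev and blocksize stand for the
 * caller's block device and block size:
 *
 *	page = nilfs_alloc_private_page(bdev, blocksize, 0);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	...use the buffers on the page...
 *	nilfs_free_private_page(page);
 */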
277void nilfs_free_private_page(struct page *page)
278{
279 BUG_ON(!PageLocked(page));
280 BUG_ON(page->mapping);
281
282 if (page_has_buffers(page) && !try_to_free_buffers(page))
283 NILFS_PAGE_BUG(page, "failed to free page");
284
285 unlock_page(page);
286 __free_page(page);
287}
288
289/**
290 * nilfs_copy_page -- copy the page with buffers
291 * @dst: destination page
292 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 *
295 * This function is for both data pages and btnode pages. The dirty flag
296 * must be handled by the caller. The page must not be under I/O.
297 * Both the source and destination pages must be locked.
298 */
299static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
300{
301 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
302 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
303
304 BUG_ON(PageWriteback(dst));
305
306 sbh = sbufs = page_buffers(src);
307 if (!page_has_buffers(dst))
308 create_empty_buffers(dst, sbh->b_size, 0);
309
310 if (copy_dirty)
311 mask |= (1UL << BH_Dirty);
312
313 dbh = dbufs = page_buffers(dst);
314 do {
315 lock_buffer(sbh);
316 lock_buffer(dbh);
317 dbh->b_state = sbh->b_state & mask;
318 dbh->b_blocknr = sbh->b_blocknr;
319 dbh->b_bdev = sbh->b_bdev;
320 sbh = sbh->b_this_page;
321 dbh = dbh->b_this_page;
322 } while (dbh != dbufs);
323
324 copy_highpage(dst, src);
325
326 if (PageUptodate(src) && !PageUptodate(dst))
327 SetPageUptodate(dst);
328 else if (!PageUptodate(src) && PageUptodate(dst))
329 ClearPageUptodate(dst);
330 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
331 SetPageMappedToDisk(dst);
332 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
333 ClearPageMappedToDisk(dst);
334
335 do {
336 unlock_buffer(sbh);
337 unlock_buffer(dbh);
338 sbh = sbh->b_this_page;
339 dbh = dbh->b_this_page;
340 } while (dbh != dbufs);
341}
342
343int nilfs_copy_dirty_pages(struct address_space *dmap,
344 struct address_space *smap)
345{
346 struct pagevec pvec;
347 unsigned int i;
348 pgoff_t index = 0;
349 int err = 0;
350
351 pagevec_init(&pvec, 0);
352repeat:
353 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
354 PAGEVEC_SIZE))
355 return 0;
356
357 for (i = 0; i < pagevec_count(&pvec); i++) {
358 struct page *page = pvec.pages[i], *dpage;
359
360 lock_page(page);
361 if (unlikely(!PageDirty(page)))
362 NILFS_PAGE_BUG(page, "inconsistent dirty state");
363
364 dpage = grab_cache_page(dmap, page->index);
365 if (unlikely(!dpage)) {
366 /* No empty page is added to the page cache */
367 err = -ENOMEM;
368 unlock_page(page);
369 break;
370 }
371 if (unlikely(!page_has_buffers(page)))
372 NILFS_PAGE_BUG(page,
373 "found empty page in dat page cache");
374
375 nilfs_copy_page(dpage, page, 1);
376 __set_page_dirty_nobuffers(dpage);
377
378 unlock_page(dpage);
379 page_cache_release(dpage);
380 unlock_page(page);
381 }
382 pagevec_release(&pvec);
383 cond_resched();
384
385 if (likely(!err))
386 goto repeat;
387 return err;
388}
389
390/**
391 * nilfs_copy_back_pages -- copy back pages to the original cache from the shadow cache
392 * @dmap: destination page cache
393 * @smap: source page cache
394 *
395 * No pages must be added to the caches while this process is running.
396 * This must be ensured by the caller.
397 */
398void nilfs_copy_back_pages(struct address_space *dmap,
399 struct address_space *smap)
400{
401 struct pagevec pvec;
402 unsigned int i, n;
403 pgoff_t index = 0;
404 int err;
405
406 pagevec_init(&pvec, 0);
407repeat:
408 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
409 if (!n)
410 return;
411 index = pvec.pages[n - 1]->index + 1;
412
413 for (i = 0; i < pagevec_count(&pvec); i++) {
414 struct page *page = pvec.pages[i], *dpage;
415 pgoff_t offset = page->index;
416
417 lock_page(page);
418 dpage = find_lock_page(dmap, offset);
419 if (dpage) {
420 /* override existing page on the destination cache */
421 WARN_ON(PageDirty(dpage));
422 nilfs_copy_page(dpage, page, 0);
423 unlock_page(dpage);
424 page_cache_release(dpage);
425 } else {
426 struct page *page2;
427
428 /* move the page to the destination cache */
429 spin_lock_irq(&smap->tree_lock);
430 page2 = radix_tree_delete(&smap->page_tree, offset);
431 WARN_ON(page2 != page);
432
433 smap->nrpages--;
434 spin_unlock_irq(&smap->tree_lock);
435
436 spin_lock_irq(&dmap->tree_lock);
437 err = radix_tree_insert(&dmap->page_tree, offset, page);
438 if (unlikely(err < 0)) {
439 WARN_ON(err == -EEXIST);
440 page->mapping = NULL;
441 page_cache_release(page); /* for cache */
442 } else {
443 page->mapping = dmap;
444 dmap->nrpages++;
445 if (PageDirty(page))
446 radix_tree_tag_set(&dmap->page_tree,
447 offset,
448 PAGECACHE_TAG_DIRTY);
449 }
450 spin_unlock_irq(&dmap->tree_lock);
451 }
452 unlock_page(page);
453 }
454 pagevec_release(&pvec);
455 cond_resched();
456
457 goto repeat;
458}
459
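/*
 * Editorial sketch (not part of the original patch): the intended
 * round trip between an original cache and its shadow, as used for
 * the DAT during GC (the dat/gcdat inode names here are assumptions
 * based on gcdat.c):
 *
 *	err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
 *	...GC operates on the shadow (gcdat)...
 *	nilfs_copy_back_pages(dat->i_mapping, gcdat->i_mapping);
 */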
460void nilfs_clear_dirty_pages(struct address_space *mapping)
461{
462 struct pagevec pvec;
463 unsigned int i;
464 pgoff_t index = 0;
465
466 pagevec_init(&pvec, 0);
467
468 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
469 PAGEVEC_SIZE)) {
470 for (i = 0; i < pagevec_count(&pvec); i++) {
471 struct page *page = pvec.pages[i];
472 struct buffer_head *bh, *head;
473
474 lock_page(page);
475 ClearPageUptodate(page);
476 ClearPageMappedToDisk(page);
477 bh = head = page_buffers(page);
478 do {
479 lock_buffer(bh);
480 clear_buffer_dirty(bh);
481 clear_buffer_nilfs_volatile(bh);
482 clear_buffer_uptodate(bh);
483 clear_buffer_mapped(bh);
484 unlock_buffer(bh);
485 bh = bh->b_this_page;
486 } while (bh != head);
487
488 __nilfs_clear_page_dirty(page);
489 unlock_page(page);
490 }
491 pagevec_release(&pvec);
492 cond_resched();
493 }
494}
495
496unsigned nilfs_page_count_clean_buffers(struct page *page,
497 unsigned from, unsigned to)
498{
499 unsigned block_start, block_end;
500 struct buffer_head *bh, *head;
501 unsigned nc = 0;
502
503 for (bh = head = page_buffers(page), block_start = 0;
504 bh != head || !block_start;
505 block_start = block_end, bh = bh->b_this_page) {
506 block_end = block_start + bh->b_size;
507 if (block_end > from && block_start < to && !buffer_dirty(bh))
508 nc++;
509 }
510 return nc;
511}
512
513/*
514 * NILFS2 needs __nilfs_clear_page_dirty() in the following two cases:
515 *
516 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
517 * page dirty flags when it copies back pages from the shadow cache
518 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
519 * (dat->{i_mapping,i_btnode_cache}).
520 *
521 * 2) Some B-tree operations like insertion or deletion may dispose buffers
522 * in dirty state, and this needs to cancel the dirty state of their pages.
523 */
524int __nilfs_clear_page_dirty(struct page *page)
525{
526 struct address_space *mapping = page->mapping;
527
528 if (mapping) {
529 spin_lock_irq(&mapping->tree_lock);
530 if (test_bit(PG_dirty, &page->flags)) {
531 radix_tree_tag_clear(&mapping->page_tree,
532 page_index(page),
533 PAGECACHE_TAG_DIRTY);
534 spin_unlock_irq(&mapping->tree_lock);
535 return clear_page_dirty_for_io(page);
536 }
537 spin_unlock_irq(&mapping->tree_lock);
538 return 0;
539 }
540 return TestClearPageDirty(page);
541}
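/*
 * Editorial sketch (not part of the original patch): case 2 above as
 * it appears in nilfs_forget_buffer() earlier in this file:
 *
 *	clear_buffer_dirty(bh);
 *	if (nilfs_page_buffers_clean(page))
 *		__nilfs_clear_page_dirty(page);
 */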
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
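/*
 * Editorial note: each BUFFER_FNS() line above expands to the usual
 * buffer-flag accessor trio, e.g. for BH_NILFS_Node:
 *
 *	set_buffer_nilfs_node(bh);
 *	clear_buffer_nilfs_node(bh);
 *	if (buffer_nilfs_node(bh))
 *		...
 */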
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
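/*
 * Editorial note: the message format (m) and arguments (a...) below
 * annotate the call sites but are not printed by this definition;
 * only the nilfs_page_bug() dump and the BUG() itself take effect.
 */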
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
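/*
 * Editorial note: the buffer head returned below carries an extra
 * reference taken with get_bh(); the caller must drop it with
 * brelse() when done.
 */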
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
75
76#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..57afa9d24061
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,919 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block
 (in block units) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - calculate the CRC of consecutive blocks
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of the start block
119 * @sum: place to store the result
120 * @offset: offset bytes in the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: disk block number (DBN) of the start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
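/*
 * Editorial sketch (not part of the original patch): verifying the
 * super root located by nilfs_search_super_root(), with ri being a
 * populated nilfs_recovery_info:
 *
 *	struct buffer_head *bh_sr;
 *
 *	err = nilfs_read_super_root_block(sb, ri->ri_super_root, &bh_sr, 1);
 *	if (!err) {
 *		...use (struct nilfs_super_root *)bh_sr->b_data...
 *		brelse(bh_sr);
 *	}
 */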
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: only checks segment summary CRC, 1: data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
317
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_sb_info *sbi,
411 struct nilfs_recovery_info *ri)
412{
413 struct list_head *head = &ri->ri_used_segments;
414 struct nilfs_segment_entry *ent, *n;
415 struct inode *sufile = nilfs->ns_sufile;
416 __u64 segnum[4];
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 nilfs_attach_writer(nilfs, sbi);
426 /*
427 * Releasing the next segment of the latest super root.
428 * The next segment is invalidated by this recovery.
429 */
430 err = nilfs_sufile_free(sufile, segnum[1]);
431 if (unlikely(err))
432 goto failed;
433
434 err = -ENOMEM;
435 for (i = 1; i < 4; i++) {
436 ent = nilfs_alloc_segment_entry(segnum[i]);
437 if (unlikely(!ent))
438 goto failed;
439 list_add_tail(&ent->list, head);
440 }
441
442 /*
443 * Collecting segments written after the latest super root.
444 * These are marked dirty to avoid being reallocated in the next write.
445 */
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum != segnum[0]) {
448 err = nilfs_sufile_scrap(sufile, ent->segnum);
449 if (unlikely(err))
450 goto failed;
451 }
452 list_del(&ent->list);
453 nilfs_free_segment_entry(ent);
454 }
455
456 /* Allocate new segments for recovery */
457 err = nilfs_sufile_alloc(sufile, &segnum[0]);
458 if (unlikely(err))
459 goto failed;
460
461 nilfs->ns_pseg_offset = 0;
462 nilfs->ns_seg_seq = ri->ri_seq + 2;
463 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
464
465 failed:
466 /* No need to recover sufile because it will be destroyed on error */
467 nilfs_detach_writer(nilfs, sbi);
468 return err;
469}
470
471static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
472 struct nilfs_recovery_block *rb,
473 struct page *page)
474{
475 struct buffer_head *bh_org;
476 void *kaddr;
477
478 bh_org = sb_bread(sbi->s_super, rb->blocknr);
479 if (unlikely(!bh_org))
480 return -EIO;
481
482 kaddr = kmap_atomic(page, KM_USER0);
483 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
484 kunmap_atomic(kaddr, KM_USER0);
485 brelse(bh_org);
486 return 0;
487}
488
489static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
490 struct list_head *head,
491 unsigned long *nr_salvaged_blocks)
492{
493 struct inode *inode;
494 struct nilfs_recovery_block *rb, *n;
495 unsigned blocksize = sbi->s_super->s_blocksize;
496 struct page *page;
497 loff_t pos;
498 int err = 0, err2 = 0;
499
500 list_for_each_entry_safe(rb, n, head, list) {
501 inode = nilfs_iget(sbi->s_super, rb->ino);
502 if (IS_ERR(inode)) {
503 err = PTR_ERR(inode);
504 inode = NULL;
505 goto failed_inode;
506 }
507
508 pos = rb->blkoff << inode->i_blkbits;
509 page = NULL;
510 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
511 0, &page, NULL, nilfs_get_block);
512 if (unlikely(err))
513 goto failed_inode;
514
515 err = nilfs_recovery_copy_block(sbi, rb, page);
516 if (unlikely(err))
517 goto failed_page;
518
519 err = nilfs_set_file_dirty(sbi, inode, 1);
520 if (unlikely(err))
521 goto failed_page;
522
523 block_write_end(NULL, inode->i_mapping, pos, blocksize,
524 blocksize, page, NULL);
525
526 unlock_page(page);
527 page_cache_release(page);
528
529 (*nr_salvaged_blocks)++;
530 goto next;
531
532 failed_page:
533 unlock_page(page);
534 page_cache_release(page);
535
536 failed_inode:
537 printk(KERN_WARNING
538 "NILFS warning: error recovering data block "
539 "(err=%d, ino=%lu, block-offset=%llu)\n",
540 err, rb->ino, (unsigned long long)rb->blkoff);
541 if (!err2)
542 err2 = err;
543 next:
544 iput(inode); /* iput(NULL) is just ignored */
545 list_del_init(&rb->list);
546 kfree(rb);
547 }
548 return err2;
549}
550
551/**
552 * nilfs_do_roll_forward - salvage logical segments newer than the latest
553 * checkpoint
554 * @sbi: nilfs_sb_info
555 * @nilfs: the_nilfs
556 * @ri: pointer to a nilfs_recovery_info
557 */
558static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
559 struct nilfs_sb_info *sbi,
560 struct nilfs_recovery_info *ri)
561{
562 struct nilfs_segsum_info ssi;
563 sector_t pseg_start;
564 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
565 unsigned long nsalvaged_blocks = 0;
566 u64 seg_seq;
567 __u64 segnum, nextnum = 0;
568 int empty_seg = 0;
569 int err = 0, ret;
570 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
571 enum {
572 RF_INIT_ST,
573 RF_DSYNC_ST, /* scanning data-sync segments */
574 };
575 int state = RF_INIT_ST;
576
577 nilfs_attach_writer(nilfs, sbi);
578 pseg_start = ri->ri_lsegs_start;
579 seg_seq = ri->ri_lsegs_start_seq;
580 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
581 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
582
583 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
584
585 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
586 if (ret) {
587 if (ret == NILFS_SEG_FAIL_IO) {
588 err = -EIO;
589 goto failed;
590 }
591 goto strayed;
592 }
593 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
594 goto confused;
595
596 /* Found a valid partial segment; do recovery actions */
597 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
598 empty_seg = 0;
599 nilfs->ns_ctime = ssi.ctime;
600 if (!(ssi.flags & NILFS_SS_GC))
601 nilfs->ns_nongc_ctime = ssi.ctime;
602
603 switch (state) {
604 case RF_INIT_ST:
605 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
606 goto try_next_pseg;
607 state = RF_DSYNC_ST;
608 /* Fall through */
609 case RF_DSYNC_ST:
610 if (!NILFS_SEG_DSYNC(&ssi))
611 goto confused;
612
613 err = collect_blocks_from_segsum(
614 sbi, pseg_start, &ssi, &dsync_blocks);
615 if (unlikely(err))
616 goto failed;
617 if (NILFS_SEG_LOGEND(&ssi)) {
618 err = recover_dsync_blocks(
619 sbi, &dsync_blocks, &nsalvaged_blocks);
620 if (unlikely(err))
621 goto failed;
622 state = RF_INIT_ST;
623 }
624 break; /* Fall through to try_next_pseg */
625 }
626
627 try_next_pseg:
628 if (pseg_start == ri->ri_lsegs_end)
629 break;
630 pseg_start += ssi.nblocks;
631 if (pseg_start < seg_end)
632 continue;
633 goto feed_segment;
634
635 strayed:
636 if (pseg_start == ri->ri_lsegs_end)
637 break;
638
639 feed_segment:
640 /* Looking to the next full segment */
641 if (empty_seg++)
642 break;
643 seg_seq++;
644 segnum = nextnum;
645 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
646 pseg_start = seg_start;
647 }
648
649 if (nsalvaged_blocks) {
650 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
651 sbi->s_super->s_id, nsalvaged_blocks);
652 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
653 }
654 out:
655 dispose_recovery_list(&dsync_blocks);
656 nilfs_detach_writer(sbi->s_nilfs, sbi);
657 return err;
658
659 confused:
660 err = -EINVAL;
661 failed:
662 printk(KERN_ERR
663 "NILFS (device %s): Error roll-forwarding "
664 "(err=%d, pseg block=%llu). ",
665 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
666 goto out;
667}
668
669static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
670 struct nilfs_sb_info *sbi,
671 struct nilfs_recovery_info *ri)
672{
673 struct buffer_head *bh;
674 int err;
675
676 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
677 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
678 return;
679
680 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
681 BUG_ON(!bh);
682 memset(bh->b_data, 0, bh->b_size);
683 set_buffer_dirty(bh);
684 err = sync_dirty_buffer(bh);
685 if (unlikely(err))
686 printk(KERN_WARNING
687 "NILFS warning: buffer sync write failed during "
688 "post-cleaning of recovery.\n");
689 brelse(bh);
690}
691
692/**
693 * nilfs_recover_logical_segments - salvage logical segments written after
694 * the latest super root
695 * @nilfs: the_nilfs
696 * @sbi: nilfs_sb_info
697 * @ri: pointer to a nilfs_recovery_info struct holding the results of nilfs_search_super_root().
698 *
699 * Return Value: On success, 0 is returned. On error, one of the following
700 * negative error codes is returned.
701 *
702 * %-EINVAL - Inconsistent filesystem state.
703 *
704 * %-EIO - I/O error
705 *
706 * %-ENOSPC - No space left on device (only in a panic state).
707 *
708 * %-ERESTARTSYS - Interrupted.
709 *
710 * %-ENOMEM - Insufficient memory available.
711 */
712int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
713 struct nilfs_sb_info *sbi,
714 struct nilfs_recovery_info *ri)
715{
716 int err;
717
718 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
719 return 0;
720
721 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
722 if (unlikely(err)) {
723 printk(KERN_ERR
724 "NILFS: error loading the latest checkpoint.\n");
725 return err;
726 }
727
728 err = nilfs_do_roll_forward(nilfs, sbi, ri);
729 if (unlikely(err))
730 goto failed;
731
732 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
733 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri);
734 if (unlikely(err)) {
735 printk(KERN_ERR "NILFS: Error preparing segments for "
736 "recovery.\n");
737 goto failed;
738 }
739
740 err = nilfs_attach_segment_constructor(sbi);
741 if (unlikely(err))
742 goto failed;
743
744 set_nilfs_discontinued(nilfs);
745 err = nilfs_construct_segment(sbi->s_super);
746 nilfs_detach_segment_constructor(sbi);
747
748 if (unlikely(err)) {
749 printk(KERN_ERR "NILFS: Oops! recovery failed. "
750 "(err=%d)\n", err);
751 goto failed;
752 }
753
754 nilfs_finish_roll_forward(nilfs, sbi, ri);
755 }
756
757 nilfs_detach_checkpoint(sbi);
758 return 0;
759
760 failed:
761 nilfs_detach_checkpoint(sbi);
762 nilfs_mdt_clear(nilfs->ns_cpfile);
763 nilfs_mdt_clear(nilfs->ns_sufile);
764 nilfs_mdt_clear(nilfs->ns_dat);
765 return err;
766}
767
768/**
769 * nilfs_search_super_root - search the latest valid super root
770 * @nilfs: the_nilfs
771 * @sbi: nilfs_sb_info
772 * @ri: pointer to a nilfs_recovery_info struct to store search results.
773 *
774 * nilfs_search_super_root() looks for the latest super root from a partial
775 * segment pointed to by the superblock. It sets up struct the_nilfs through
776 * this search. It fills in the nilfs_recovery_info (ri) required for recovery.
777 *
778 * Return Value: On success, 0 is returned. On error, one of the following
779 * negative error codes is returned.
780 *
781 * %-EINVAL - No valid segment found
782 *
783 * %-EIO - I/O error
784 */
785int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
786 struct nilfs_recovery_info *ri)
787{
788 struct nilfs_segsum_info ssi;
789 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
790 sector_t seg_start, seg_end; /* range of full segment (block number) */
791 u64 seg_seq;
792 __u64 segnum, nextnum = 0;
793 __u64 cno;
794 struct nilfs_segment_entry *ent;
795 LIST_HEAD(segments);
796 int empty_seg = 0, scan_newer = 0;
797 int ret;
798
799 pseg_start = nilfs->ns_last_pseg;
800 seg_seq = nilfs->ns_last_seq;
801 cno = nilfs->ns_last_cno;
802 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
803
804 /* Calculate range of segment */
805 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
806
807 for (;;) {
808 /* Load segment summary */
809 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
810 if (ret) {
811 if (ret == NILFS_SEG_FAIL_IO)
812 goto failed;
813 goto strayed;
814 }
815 pseg_end = pseg_start + ssi.nblocks - 1;
816 if (unlikely(pseg_end > seg_end)) {
817 ret = NILFS_SEG_FAIL_CONSISTENCY;
818 goto strayed;
819 }
820
821 /* A valid partial segment */
822 ri->ri_pseg_start = pseg_start;
823 ri->ri_seq = seg_seq;
824 ri->ri_segnum = segnum;
825 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
826 ri->ri_nextnum = nextnum;
827 empty_seg = 0;
828
829 if (!NILFS_SEG_HAS_SR(&ssi)) {
830 if (!scan_newer) {
831 /* This will never happen because a superblock
832 (last_segment) always points to a pseg
833 having a super root. */
834 ret = NILFS_SEG_FAIL_CONSISTENCY;
835 goto failed;
836 }
837 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
838 ri->ri_lsegs_start = pseg_start;
839 ri->ri_lsegs_start_seq = seg_seq;
840 }
841 if (NILFS_SEG_LOGEND(&ssi))
842 ri->ri_lsegs_end = pseg_start;
843 goto try_next_pseg;
844 }
845
846 /* A valid super root was found. */
847 ri->ri_cno = cno++;
848 ri->ri_super_root = pseg_end;
849 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
850
851 nilfs_dispose_segment_list(&segments);
852 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
853 + ssi.nblocks - seg_start;
854 nilfs->ns_seg_seq = seg_seq;
855 nilfs->ns_segnum = segnum;
856 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
857 nilfs->ns_ctime = ssi.ctime;
858 nilfs->ns_nextnum = nextnum;
859
860 if (scan_newer)
861 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
862 else {
863 if (nilfs->ns_mount_state & NILFS_VALID_FS)
864 goto super_root_found;
865 scan_newer = 1;
866 }
867
868 /* reset region for roll-forward */
869 pseg_start += ssi.nblocks;
870 if (pseg_start < seg_end)
871 continue;
872 goto feed_segment;
873
874 try_next_pseg:
875 /* Still on course, or an inconsistent state was encountered */
876 pseg_start += ssi.nblocks;
877 if (pseg_start < seg_end)
878 continue;
879 goto feed_segment;
880
881 strayed:
882 /* Off the trail */
883 if (!scan_newer)
884 /*
885 * This can happen if a checkpoint was written without
886 * barriers, or as a result of an I/O failure.
887 */
888 goto failed;
889
890 feed_segment:
891 /* Looking to the next full segment */
892 if (empty_seg++)
893 goto super_root_found; /* found a valid super root */
894
895 ent = nilfs_alloc_segment_entry(segnum);
896 if (unlikely(!ent)) {
897 ret = -ENOMEM;
898 goto failed;
899 }
900 list_add_tail(&ent->list, &segments);
901
902 seg_seq++;
903 segnum = nextnum;
904 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
905 pseg_start = seg_start;
906 }
907
908 super_root_found:
909 /* Updating pointers relating to the latest checkpoint */
910 list_splice(&segments, ri->ri_used_segments.prev);
911 nilfs->ns_last_pseg = sr_pseg_start;
912 nilfs->ns_last_seq = nilfs->ns_seg_seq;
913 nilfs->ns_last_cno = ri->ri_cno;
914 return 0;
915
916 failed:
917 nilfs_dispose_segment_list(&segments);
918 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
919}
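/*
 * Editorial sketch (not part of the original patch): the expected
 * mount-time calling order of the two entry points in this file.
 * ri must be zero-filled and its ri_used_segments list initialized
 * by the caller beforehand:
 *
 *	err = nilfs_search_super_root(nilfs, sbi, &ri);
 *	if (!err)
 *		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
 *	nilfs_dispose_segment_list(&ri.ri_used_segments);
 */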
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It covers s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
101
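/*
 * Editorial usage sketch (not part of the original patch), assuming a
 * NILFS_MOUNT_BARRIER bit exists among the NILFS_MOUNT_* flags:
 *
 *	if (nilfs_test_opt(sbi, BARRIER))
 *		nilfs_clear_opt(sbi, BARRIER);
 *	else
 *		nilfs_set_opt(sbi, BARRIER);
 */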
102#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
142 * Set up the segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 *
 * ss_sumsum protects the segment summary area excluding the ss_datasum
 * and ss_sumsum fields themselves, while ss_datasum protects the whole
 * partial segment excluding only the ss_datasum field.
 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
256 /* to be detected by nilfs_submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment
306 * @sb: super block
307 * @start: beginning disk block number of this BIO
308 * @nr_vecs: requested size of the page vector
309 *
310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
312 * Return Value: On success, a pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @segbuf: segment buffer
415 * @wi: nilfs_write_info
416 *
417 * Return Value: On success, 0 is returned. On error, the following
418 * negative error code is returned:
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
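
The wait loop above pairs one wait_for_completion() with each submitted bio, then inspects the accumulated error count. Roughly the same shape can be modeled in user space with a POSIX semaphore taking the place of the kernel completion; a sketch under that assumption, with fake_end_io() standing in for nilfs_end_bio_write():

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>

struct write_info {
	sem_t bio_event;	/* stands in for the kernel completion */
	int nbio;		/* number of submitted "bios" */
	atomic_int err;		/* accumulated I/O error count */
};

static void *fake_end_io(void *arg)
{
	struct write_info *wi = arg;
	/* a real end_io callback would check BIO_UPTODATE here */
	sem_post(&wi->bio_event);
	return NULL;
}

int main(void)
{
	struct write_info wi = { .nbio = 4 };
	pthread_t tid[4];

	sem_init(&wi.bio_event, 0, 0);
	atomic_init(&wi.err, 0);

	for (int i = 0; i < wi.nbio; i++)
		pthread_create(&tid[i], NULL, fake_end_io, &wi);

	/* the wait loop from nilfs_segbuf_wait(): one wait per bio */
	do {
		sem_wait(&wi.bio_event);
	} while (--wi.nbio > 0);

	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);

	printf("all writes completed, err=%d\n", atomic_load(&wi.err));
	sem_destroy(&wi.bio_event);
	return atomic_load(&wi.err) ? 1 : 0;
}
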
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
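
The NILFS_*_SEGBUF() macros above are thin wrappers around list_entry(), i.e. container_of(): they recover the enclosing segment buffer from its embedded sb_list link. A minimal stand-alone sketch of that mechanism; the struct and macro names below are simplified, not the kernel's:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct segbuf {
	int segnum;
	struct list_head sb_list;
};

#define LIST_SEGBUF(head) container_of((head), struct segbuf, sb_list)
#define NEXT_SEGBUF(s)    LIST_SEGBUF((s)->sb_list.next)
#define IS_LAST(s, head)  ((s)->sb_list.next == (head))

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct segbuf bufs[3] = {
		{ .segnum = 10 }, { .segnum = 11 }, { .segnum = 12 },
	};
	struct segbuf *s;

	for (int i = 0; i < 3; i++)
		list_add_tail(&bufs[i].sb_list, &head);

	/* walk the ring with the container_of-style accessors */
	for (s = LIST_SEGBUF(head.next); ; s = NEXT_SEGBUF(s)) {
		printf("segnum %d\n", s->segnum);
		if (IS_LAST(s, &head))
			break;
	}
	return 0;
}
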
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
2 * seglist.h - expedient structures and routines to handle lists of segments
3 *             (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED		0x0001 /* The segment was freed provisionally.
36					  It must be cancelled if
37					  construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
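
nilfs_open_segment_entry()/nilfs_close_segment_entry() form a get/put pair, and close is written to be safe against a second call because it tests and clears bh_su. A toy model of that discipline, with malloc()/free() standing in for the sufile get/put helpers (nothing here is kernel API):

#include <stdio.h>
#include <stdlib.h>

struct segment_entry {
	unsigned long segnum;
	void *bh_su;		/* stands in for the buffer_head reference */
};

static void open_entry(struct segment_entry *ent)
{
	if (!ent->bh_su)
		ent->bh_su = malloc(1);	/* "get segment usage" */
}

static void close_entry(struct segment_entry *ent)
{
	if (!ent->bh_su)
		return;			/* already closed: no double put */
	free(ent->bh_su);		/* "put segment usage" */
	ent->bh_su = NULL;
}

int main(void)
{
	struct segment_entry ent = { .segnum = 7 };

	open_entry(&ent);
	close_entry(&ent);
	close_entry(&ent);		/* harmless second close */
	printf("entry %lu closed\n", ent.segnum);
	return 0;
}
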
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..22c7f65c2403
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2978 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
52 appended in collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
59 SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
60 creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
117
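
These macros compare 32-bit sequence counters through a signed subtraction so that the ordering survives wrap-around, the same trick as the kernel's time_after(). A runnable demonstration; cnt32_gt() here is a local restatement of the macro, not the kernel definition:

#include <stdio.h>
#include <stdint.h>

/* wrap-safe comparison of 32-bit sequence counters via signed
 * subtraction, as in nilfs_cnt32_gt() */
static int cnt32_gt(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t before = 0xfffffffeu;	/* about to wrap around */
	uint32_t after  = before + 5;	/* wraps to 3 */

	printf("after  > before: %d\n", cnt32_gt(after, before));	/* 1 */
	printf("before > after : %d\n", cnt32_gt(before, after));	/* 0 */
	return 0;
}
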
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
164			 * If the journal_info field is occupied by another
165			 * FS, it is saved and will be restored on
166			 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
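
nilfs_prepare_segment_lock() implements the nesting rule behind nilfs_transaction_begin()/commit(): inner pairs only bump and drop ti_count, and only the outermost pair touches the semaphore. A deliberately simplified single-threaded model of that rule, which omits the foreign-journal_info and slab-allocation cases handled above:

#include <stdio.h>

static int ti_count;

/* only the outermost begin/commit pair acquires and releases the
 * segment semaphore; nested pairs just count */
static void tx_begin(void)
{
	if (ti_count++ == 0)
		printf("outermost begin: acquire segment semaphore\n");
}

static void tx_commit(void)
{
	if (--ti_count == 0)
		printf("outermost commit: release segment semaphore\n");
}

int main(void)
{
	tx_begin();	/* takes the lock */
	tx_begin();	/* nested: counter only */
	tx_commit();	/* nested: counter only */
	tx_commit();	/* releases the lock */
	return 0;
}
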
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
196 * the segment semaphore, to make segment construction and write tasks
197 * exclusive. The function is used in a pair with nilfs_transaction_commit().
198 * The region enclosed by these two functions can be nested. To avoid a
199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; otherwise a new struct is allocated from a slab cache.
205 *
206 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
211 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
214 * %-ENOSPC - No space left on device
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
253 * in the outermost call of this function. If a commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
389
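
The mapping helper above is a bump-pointer allocator over a chain of fixed-size summary blocks: when an entry would straddle a block boundary, the cursor jumps to the start of the next block. A stand-alone sketch of the arithmetic; BLOCKSIZE and the ssp struct below are illustrative stand-ins:

#include <stdio.h>

#define BLOCKSIZE 64

struct ssp { int block; unsigned offset; };

/* entries are packed into fixed-size summary blocks; an entry that
 * would cross a boundary moves the cursor to the next block, as in
 * nilfs_segctor_map_segsum_entry() */
static unsigned map_entry(struct ssp *ssp, unsigned bytes)
{
	if (ssp->offset + bytes > BLOCKSIZE) {
		ssp->block++;	/* NILFS_SEGBUF_NEXT_BH() in the original */
		ssp->offset = 0;
	}
	unsigned pos = ssp->offset;
	ssp->offset += bytes;
	return pos;
}

int main(void)
{
	struct ssp ssp = { 0, 0 };
	unsigned sizes[] = { 24, 24, 24, 40 };

	for (int i = 0; i < 4; i++) {
		unsigned off = map_entry(&ssp, sizes[i]);
		printf("entry %d: block %d offset %u\n", i, ssp.block, off);
	}
	return 0;
}
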
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
451	/* finfo and binfo are small enough relative to the blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
524 /* Substitution to vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
666
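
The three ops tables above select per-file-type behavior through function pointers, with NULL entries marking stages the dsync mode skips. A compact model of that dispatch; sc_operations is cut down here to two hooks for illustration:

#include <stdio.h>

struct sc_operations {
	int (*collect_data)(int blk);
	int (*collect_node)(int blk);	/* NULL => stage skipped */
};

static int file_data(int blk) { printf("data %d\n", blk); return 0; }
static int file_node(int blk) { printf("node %d\n", blk); return 0; }

static const struct sc_operations file_ops = {
	.collect_data = file_data,
	.collect_node = file_node,
};

static const struct sc_operations dsync_ops = {
	.collect_data = file_data,
	.collect_node = NULL,		/* dsync skips node blocks */
};

static void scan(const struct sc_operations *ops, int blk)
{
	ops->collect_data(blk);
	if (ops->collect_node)		/* mirror the NULL checks */
		ops->collect_node(blk);
}

int main(void)
{
	scan(&file_ops, 1);
	scan(&dsync_ops, 2);
	return 0;
}
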
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680		 * A valid range is given for syncing data pages. The
681		 * range is rounded to page boundaries; extra dirty
682		 * buffers may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
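
nilfs_lookup_dirty_data_buffers() scans tagged pages in pagevec-sized batches and stops as soon as nlimit buffers are collected; since callers pass rest + 1, a return value greater than rest signals overflow. A user-space sketch of that batched, limit-bounded scan, where BATCH and the dirty[] array are stand-ins rather than kernel structures:

#include <stdio.h>

#define BATCH 16	/* stands in for PAGEVEC_SIZE */

/* scan up to BATCH "pages" at a time, stopping early once nlimit
 * buffers have been collected */
static size_t collect_dirty(const int *dirty, size_t npages, size_t nlimit)
{
	size_t ndirties = 0;

	for (size_t i = 0; i < npages; i += BATCH) {
		size_t end = i + BATCH < npages ? i + BATCH : npages;

		for (size_t j = i; j < end; j++) {
			if (!dirty[j])
				continue;
			if (++ndirties >= nlimit)
				return ndirties; /* caller sees overflow */
		}
		/* the real scan would cond_resched() between batches */
	}
	return ndirties;
}

int main(void)
{
	int dirty[40];

	for (int i = 0; i < 40; i++)
		dirty[i] = (i % 3 == 0);
	printf("collected %zu buffers\n", collect_dirty(dirty, 40, 10));
	return 0;
}
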
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
855		/* The following code duplicates part of cpfile, but it is
856		   needed to collect the checkpoint even if it was not
857		   newly created. */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
908
909{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
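
The checksum above is seeded with a per-filesystem value and deliberately skips the sr_sum field, so the stored checksum never covers its own bytes. A sketch using a bitwise crc32_le (assumed to use the same reflected polynomial as the kernel's table-driven crc32_le; the seed and buffer values below are made up):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC-32, polynomial 0xedb88320, no implicit pre/post
 * inversion: the seed is supplied by the caller */
static uint32_t crc32_le(uint32_t crc, const unsigned char *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
	}
	return crc;
}

int main(void)
{
	unsigned char raw_sr[32];
	uint32_t seed = 0x11dead11;	/* hypothetical per-fs seed */

	memset(raw_sr, 0xab, sizeof(raw_sr));
	/* skip the leading 4-byte checksum field, exactly as
	 * nilfs_fill_in_super_root_crc() skips sr_sum */
	printf("sr_sum = 0x%08x\n",
	       crc32_le(seed, raw_sr + 4, sizeof(raw_sr) - 4));
	return 0;
}
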
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1010
1011{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
1020		WARN_ON(err); /* does not happen */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
1101			BUG_ON(!err); /* always receives -E2BIG or a real error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
1153		/* always receives -E2BIG or a real error if n > rest */
1154 }
1155 return err;
1156}
1157
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
1306
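
nilfs_segctor_collect_blocks() is a resumable state machine: sci->sc_stage.scnt records where collection stopped, each case falls through into the next stage, and a failed stage is retried from the saved point on the next call. The skeleton of that control flow, reduced to three hypothetical stages:

#include <stdio.h>

enum { ST_INIT, ST_FILE, ST_IFILE, ST_DONE };

/* each call resumes at *scnt and falls through stage by stage;
 * on failure it returns with *scnt parked at the failing stage */
static int run_stages(int *scnt, int fail_at)
{
	switch (*scnt) {
	case ST_INIT:
		printf("init\n");
		(*scnt)++;		/* fall through */
	case ST_FILE:
		if (fail_at == ST_FILE)
			return -1;	/* resume here on the next call */
		printf("file\n");
		(*scnt)++;		/* fall through */
	case ST_IFILE:
		printf("ifile\n");
		*scnt = ST_DONE;	/* fall through */
	case ST_DONE:
		return 0;
	}
	return 0;
}

int main(void)
{
	int scnt = ST_INIT;

	if (run_stages(&scnt, ST_FILE) < 0)
		printf("interrupted at stage %d\n", scnt);
	run_stages(&scnt, -1);		/* resumes from ST_FILE */
	return 0;
}
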
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified with nextnum might be allocated during
1382 * the previous construction, the buffer including its segusage may
1383 * not be dirty. The following call ensures that the buffer is dirty
1384 * and will pin the buffer on memory until the sufile is written.
1385	 * and will pin the buffer in memory until the sufile is written.
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
1506		WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
1527	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
1535		WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
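/*
 * Illustrative sketch, not part of the kernel source: when a collection pass
 * overflows the current segment (-E2BIG), nilfs_segctor_collect() retries
 * after extending the segment list, doubling the extension count each time
 * and capping it at SC_MAX_SEGDELTA.  The growth schedule in isolation (the
 * cap of 64 below is only an example value):
 */
static int next_extension(int nadd, int cap)
{
	int doubled = nadd << 1;

	return doubled < cap ? doubled : cap;	/* min(nadd * 2, cap) */
}
/* Starting from 1: 2, 4, 8, 16, 32, 64, 64, ... so repeated overflows grow
 * the extension geometrically but never without bound. */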
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
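/*
 * Illustrative sketch, not part of the kernel source: the payload walk above
 * consumes, for each finfo, fi_ndatablk data blocks followed by node blocks,
 * fi_nblocks blocks in total, and moves to the next finfo when the block
 * count reaches zero.  The counter discipline alone, with hypothetical
 * names (the caller clears *done before each finfo):
 */
struct finfo_ctr { unsigned long nblocks, ndatablk; };

/* Classify the current block (1 = data, 0 = node) and advance the
 * counters; *done is set once the finfo is exhausted. */
static int consume_block(struct finfo_ctr *f, int *done)
{
	int is_data = f->ndatablk > 0;

	if (--f->nblocks == 0)
		*done = 1;
	else if (is_data)
		f->ndatablk--;
	return is_data;
}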
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756		/* For split b-tree node pages, this function may be called
1757		   twice; this check ignores the second and later calls. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
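/*
 * Illustrative sketch, not part of the kernel source: buffer heads of a page
 * form a circular singly linked ring through b_this_page, so every walk in
 * this file uses the same do/while shape as above: start anywhere and stop
 * when the cursor returns to the starting element.  With a hypothetical
 * ring node:
 */
struct ring { int flag; struct ring *next; };

static int ring_any_flagged(struct ring *head)
{
	struct ring *r = head;

	do {
		if (r->flag)
			return 1;
		r = r->next;
	} while (r != head);	/* full circle: no element was flagged */
	return 0;
}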
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894		/* For b-tree node pages, this function may be called twice
1895		   or more because they might be split within a segment.
1896		   This check ensures that cleanup runs only after all
1897		   buffers of a split btnode page have been cleared. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007		 * We assume that buffers belonging to the same page are
2008		 * contiguous in the buffer list.
2009		 * Under this assumption, the last BH of each page is
2010		 * identifiable by a discontinuity in bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code of B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045	 * Since pages may span multiple segment buffers, the end of
2046	 * the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
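/*
 * Illustrative sketch, not part of the kernel source: the loops above detect
 * a page boundary by a change of bh->b_page between consecutive list
 * entries, and finish the previous page only when the owner pointer
 * differs.  Grouping a stream of (owner, item) records with the same trick:
 */
#include <stdio.h>

struct record { const void *owner; int val; };

static void walk_groups(const struct record *rec, int n)
{
	const void *cur = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (rec[i].owner != cur) {
			if (cur)
				printf("end of group %p\n", cur);
			cur = rec[i].owner;	/* a new group starts */
		}
	}
	if (cur)	/* like the code above, the last group must be
			   finished outside of the loop */
		printf("end of group %p\n", cur);
}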
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
2255
2256/**
2257 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
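/*
 * Illustrative sketch, not part of the kernel source: sc_flush_request is a
 * bitmap in which bit 0 stands for ordinary data files and bit
 * NILFS_DAT_INO for the DAT metadata file; nilfs_segctor_do_flush() sets a
 * bit and wakes the daemon only on the empty -> non-empty transition.  The
 * bitmap discipline by itself:
 */
static void request_flush(unsigned long *reqmap, int bit, void (*wake)(void))
{
	if (!(*reqmap & (1UL << bit))) {
		unsigned long prev = *reqmap;

		*reqmap |= 1UL << bit;
		if (!prev)	/* the first pending request wakes the daemon;
				   later ones piggyback on it */
			wake();
	}
}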
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
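/*
 * Illustrative sketch, not part of the kernel source: each waiter records a
 * 32-bit request sequence number and is released once sc_seq_done has
 * passed it.  nilfs_cnt32_ge() is assumed to be a wraparound-safe
 * comparison of the usual signed-difference form:
 */
#include <stdint.h>

static int cnt32_ge(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;	/* a is at or past b, modulo 2^32 */
}
/* For example, cnt32_ge(0x00000002, 0xfffffffe) is true: the counter has
 * wrapped, but sequence 0xfffffffe was issued before sequence 2. */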
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412 * Return Value: On success, 0 is returned. On errors, one of the following
2413 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449 * Return Value: On success, 0 is returned. On errors, one of the following
2450 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2593 void **kbufs)
2594{
2595 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2596 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2597 struct the_nilfs *nilfs = sbi->s_nilfs;
2598 struct nilfs_transaction_info ti;
2599 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2600 int err;
2601
2602 if (unlikely(!sci))
2603 return -EROFS;
2604
2605 nilfs_transaction_lock(sbi, &ti, 1);
2606
2607 err = nilfs_init_gcdat_inode(nilfs);
2608 if (unlikely(err))
2609 goto out_unlock;
2610 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2611 if (unlikely(err))
2612 goto out_unlock;
2613
2614 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2615
2616 for (;;) {
2617 nilfs_segctor_accept(sci, &req);
2618 err = nilfs_segctor_construct(sci, &req);
2619 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2620 nilfs_segctor_notify(sci, &req);
2621
2622 if (likely(!err))
2623 break;
2624
2625 nilfs_warning(sb, __func__,
2626 "segment construction failed. (err=%d)", err);
2627 set_current_state(TASK_INTERRUPTIBLE);
2628 schedule_timeout(sci->sc_interval);
2629 }
2630
2631 out_unlock:
2632 nilfs_clear_gcdat_inode(nilfs);
2633 nilfs_transaction_unlock(sbi);
2634 return err;
2635}
2636
2637static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2638{
2639 struct nilfs_sb_info *sbi = sci->sc_sbi;
2640 struct nilfs_transaction_info ti;
2641 struct nilfs_segctor_req req = { .mode = mode };
2642
2643 nilfs_transaction_lock(sbi, &ti, 0);
2644
2645 nilfs_segctor_accept(sci, &req);
2646 nilfs_segctor_construct(sci, &req);
2647 nilfs_segctor_notify(sci, &req);
2648
2649 /*
2650	 * An unclosed segment should be retried, which is done via sc_timer:
2651	 * when the timer expires, it triggers a complete construction that
2652	 * closes the current logical segment.
2653 */
2654 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2655 nilfs_segctor_start_timer(sci);
2656
2657 nilfs_transaction_unlock(sbi);
2658}
2659
2660static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2661{
2662 int mode = 0;
2663 int err;
2664
2665 spin_lock(&sci->sc_state_lock);
2666 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2667 SC_FLUSH_DAT : SC_FLUSH_FILE;
2668 spin_unlock(&sci->sc_state_lock);
2669
2670 if (mode) {
2671 err = nilfs_segctor_do_construct(sci, mode);
2672
2673 spin_lock(&sci->sc_state_lock);
2674 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2675 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2676 spin_unlock(&sci->sc_state_lock);
2677 }
2678 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2679}
2680
2681static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2682{
2683 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2684 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2685 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2686 return SC_FLUSH_FILE;
2687 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2688 return SC_FLUSH_DAT;
2689 }
2690 return SC_LSEG_SR;
2691}
2692
2693/**
2694 * nilfs_segctor_thread - main loop of the segment constructor thread.
2695 * @arg: pointer to a struct nilfs_sc_info.
2696 *
2697 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2698 * to execute segment constructions.
2699 */
2700static int nilfs_segctor_thread(void *arg)
2701{
2702 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2703 struct timer_list timer;
2704 int timeout = 0;
2705
2706 init_timer(&timer);
2707 timer.data = (unsigned long)current;
2708 timer.function = nilfs_construction_timeout;
2709 sci->sc_timer = &timer;
2710
2711 /* start sync. */
2712 sci->sc_task = current;
2713 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2714 printk(KERN_INFO
2715 "segctord starting. Construction interval = %lu seconds, "
2716 "CP frequency < %lu seconds\n",
2717 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2718
2719 spin_lock(&sci->sc_state_lock);
2720 loop:
2721 for (;;) {
2722 int mode;
2723
2724 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2725 goto end_thread;
2726
2727 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2728 mode = SC_LSEG_SR;
2729 else if (!sci->sc_flush_request)
2730 break;
2731 else
2732 mode = nilfs_segctor_flush_mode(sci);
2733
2734 spin_unlock(&sci->sc_state_lock);
2735 nilfs_segctor_thread_construct(sci, mode);
2736 spin_lock(&sci->sc_state_lock);
2737 timeout = 0;
2738 }
2739
2740
2741 if (freezing(current)) {
2742 spin_unlock(&sci->sc_state_lock);
2743 refrigerator();
2744 spin_lock(&sci->sc_state_lock);
2745 } else {
2746 DEFINE_WAIT(wait);
2747 int should_sleep = 1;
2748
2749 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2750 TASK_INTERRUPTIBLE);
2751
2752 if (sci->sc_seq_request != sci->sc_seq_done)
2753 should_sleep = 0;
2754 else if (sci->sc_flush_request)
2755 should_sleep = 0;
2756 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2757 should_sleep = time_before(jiffies,
2758 sci->sc_timer->expires);
2759
2760 if (should_sleep) {
2761 spin_unlock(&sci->sc_state_lock);
2762 schedule();
2763 spin_lock(&sci->sc_state_lock);
2764 }
2765 finish_wait(&sci->sc_wait_daemon, &wait);
2766 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2767 time_after_eq(jiffies, sci->sc_timer->expires));
2768 }
2769 goto loop;
2770
2771 end_thread:
2772 spin_unlock(&sci->sc_state_lock);
2773 del_timer_sync(sci->sc_timer);
2774 sci->sc_timer = NULL;
2775
2776 /* end sync. */
2777 sci->sc_task = NULL;
2778 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2779 return 0;
2780}
2781
2782static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2783{
2784 struct task_struct *t;
2785
2786 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2787 if (IS_ERR(t)) {
2788 int err = PTR_ERR(t);
2789
2790 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2791 err);
2792 return err;
2793 }
2794 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2795 return 0;
2796}
2797
2798static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2799{
2800 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2801
2802 while (sci->sc_task) {
2803 wake_up(&sci->sc_wait_daemon);
2804 spin_unlock(&sci->sc_state_lock);
2805 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2806 spin_lock(&sci->sc_state_lock);
2807 }
2808}
2809
2810static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2811{
2812 sci->sc_seq_done = sci->sc_seq_request;
2813
2814 return nilfs_segctor_start_thread(sci);
2815}
2816
2817/*
2818 * Setup & clean-up functions
2819 */
2820static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2821{
2822 struct nilfs_sc_info *sci;
2823
2824 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2825 if (!sci)
2826 return NULL;
2827
2828 sci->sc_sbi = sbi;
2829 sci->sc_super = sbi->s_super;
2830
2831 init_waitqueue_head(&sci->sc_wait_request);
2832 init_waitqueue_head(&sci->sc_wait_daemon);
2833 init_waitqueue_head(&sci->sc_wait_task);
2834 spin_lock_init(&sci->sc_state_lock);
2835 INIT_LIST_HEAD(&sci->sc_dirty_files);
2836 INIT_LIST_HEAD(&sci->sc_segbufs);
2837 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2838 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2839 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2840
2841 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2842 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2843 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2844
2845 if (sbi->s_interval)
2846 sci->sc_interval = sbi->s_interval;
2847 if (sbi->s_watermark)
2848 sci->sc_watermark = sbi->s_watermark;
2849 return sci;
2850}
2851
2852static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2853{
2854 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2855
2856	/* The segctord thread was stopped and its timer removed,
2857	   but pending work may remain. */
2858 do {
2859 struct nilfs_sb_info *sbi = sci->sc_sbi;
2860 struct nilfs_transaction_info ti;
2861 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2862
2863 nilfs_transaction_lock(sbi, &ti, 0);
2864 nilfs_segctor_accept(sci, &req);
2865 ret = nilfs_segctor_construct(sci, &req);
2866 nilfs_segctor_notify(sci, &req);
2867 nilfs_transaction_unlock(sbi);
2868
2869 } while (ret && retrycount-- > 0);
2870}
2871
2872/**
2873 * nilfs_segctor_destroy - destroy the segment constructor.
2874 * @sci: nilfs_sc_info
2875 *
2876 * nilfs_segctor_destroy() kills the segctord thread and frees
2877 * the nilfs_sc_info struct.
2878 * Caller must hold the segment semaphore.
2879 */
2880static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2881{
2882 struct nilfs_sb_info *sbi = sci->sc_sbi;
2883 int flag;
2884
2885 up_write(&sbi->s_nilfs->ns_segctor_sem);
2886
2887 spin_lock(&sci->sc_state_lock);
2888 nilfs_segctor_kill_thread(sci);
2889 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2890 || sci->sc_seq_request != sci->sc_seq_done);
2891 spin_unlock(&sci->sc_state_lock);
2892
2893 if (flag || nilfs_segctor_confirm(sci))
2894 nilfs_segctor_write_out(sci);
2895
2896 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2897
2898 if (!list_empty(&sci->sc_dirty_files)) {
2899 nilfs_warning(sbi->s_super, __func__,
2900 "dirty file(s) after the final construction\n");
2901 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2902 }
2903
2904 if (!list_empty(&sci->sc_cleaning_segments))
2905 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2906
2907 WARN_ON(!list_empty(&sci->sc_segbufs));
2908
2909 down_write(&sbi->s_nilfs->ns_segctor_sem);
2910
2911 kfree(sci);
2912}
2913
2914/**
2915 * nilfs_attach_segment_constructor - attach a segment constructor
2916 * @sbi: nilfs_sb_info
2917 *
2918 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2919 * initializes it, and starts the segment constructor.
2920 *
2921 * Return Value: On success, 0 is returned. On error, one of the following
2922 * negative error codes is returned.
2923 *
2924 * %-ENOMEM - Insufficient memory available.
2925 */
2926int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2927{
2928 struct the_nilfs *nilfs = sbi->s_nilfs;
2929 int err;
2930
2931 /* Each field of nilfs_segctor is cleared through the initialization
2932 of super-block info */
2933 sbi->s_sc_info = nilfs_segctor_new(sbi);
2934 if (!sbi->s_sc_info)
2935 return -ENOMEM;
2936
2937 nilfs_attach_writer(nilfs, sbi);
2938 err = nilfs_segctor_init(NILFS_SC(sbi));
2939 if (err) {
2940 nilfs_detach_writer(nilfs, sbi);
2941 kfree(sbi->s_sc_info);
2942 sbi->s_sc_info = NULL;
2943 }
2944 return err;
2945}
2946
2947/**
2948 * nilfs_detach_segment_constructor - destroy the segment constructor
2949 * @sbi: nilfs_sb_info
2950 *
2951 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2952 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2953 */
2954void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2955{
2956 struct the_nilfs *nilfs = sbi->s_nilfs;
2957 LIST_HEAD(garbage_list);
2958
2959 down_write(&nilfs->ns_segctor_sem);
2960 if (NILFS_SC(sbi)) {
2961 nilfs_segctor_destroy(NILFS_SC(sbi));
2962 sbi->s_sc_info = NULL;
2963 }
2964
2965 /* Force to free the list of dirty files */
2966 spin_lock(&sbi->s_inode_lock);
2967 if (!list_empty(&sbi->s_dirty_files)) {
2968 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2969 nilfs_warning(sbi->s_super, __func__,
2970			      "Non-empty dirty list after the last "
2971 "segment construction\n");
2972 }
2973 spin_unlock(&sbi->s_inode_lock);
2974 up_write(&nilfs->ns_segctor_sem);
2975
2976 nilfs_dispose_list(sbi, &garbage_list, 1);
2977 nilfs_detach_writer(nilfs, sbi);
2978}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..476bdd5df5be
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,244 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer on the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
226 void **);
227
228extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
229 __u64 *, size_t);
230extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
231
232extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
233extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
234
235/* recovery.c */
236extern int nilfs_read_super_root_block(struct super_block *, sector_t,
237 struct buffer_head **, int);
238extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
239 struct nilfs_recovery_info *);
240extern int nilfs_recover_logical_segments(struct the_nilfs *,
241 struct nilfs_sb_info *,
242 struct nilfs_recovery_info *);
243
244#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..98e68677f045
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,558 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
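/*
 * Illustrative sketch, not part of the kernel source: the two helpers above
 * map a segment number to the sufile block holding its usage entry and to
 * the entry's slot inside that block by division and remainder on
 * (segnum + first_entry_offset).  In plain C:
 */
static void segnum_to_pos(unsigned long long segnum,
			  unsigned long first_entry_offset,
			  unsigned long entries_per_block,
			  unsigned long *blkoff, unsigned long *offset)
{
	unsigned long long t = segnum + first_entry_offset;

	*blkoff = (unsigned long)(t / entries_per_block);
	*offset = (unsigned long)(t % entries_per_block);
}
/* The kernel uses do_div() instead of plain 64-bit '/' and '%' because
 * native 64-bit division is not available on all 32-bit targets; it divides
 * in place and returns the remainder. */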
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
97 u64 ncleanadd, u64 ndirtyadd)
98{
99 struct nilfs_sufile_header *header;
100 void *kaddr;
101
102 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
103 header = kaddr + bh_offset(header_bh);
104 le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
105 le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
106 kunmap_atomic(kaddr, KM_USER0);
107
108 nilfs_mdt_mark_buffer_dirty(header_bh);
109}
110
111int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
112 void (*dofunc)(struct inode *, __u64,
113 struct buffer_head *,
114 struct buffer_head *))
115{
116 struct buffer_head *header_bh, *bh;
117 int ret;
118
119 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
120 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
121 __func__, (unsigned long long)segnum);
122 return -EINVAL;
123 }
124 down_write(&NILFS_MDT(sufile)->mi_sem);
125
126 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
127 if (ret < 0)
128 goto out_sem;
129
130 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
131 if (!ret) {
132 dofunc(sufile, segnum, header_bh, bh);
133 brelse(bh);
134 }
135 brelse(header_bh);
136
137 out_sem:
138 up_write(&NILFS_MDT(sufile)->mi_sem);
139 return ret;
140}
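/*
 * Illustrative sketch, not part of the kernel source:
 * nilfs_sufile_update() factors the common "validate, take the semaphore,
 * load the header and entry blocks, apply an operation, release" sequence
 * out of its callers, which differ only in the dofunc callback.  The shape
 * of the pattern, with hypothetical types:
 */
struct resource;	/* stands in for the header/entry buffers */

static int with_resource(struct resource *r,
			 int (*acquire)(struct resource *),
			 void (*release)(struct resource *),
			 void (*dofunc)(struct resource *))
{
	int err = acquire(r);

	if (err)
		return err;
	dofunc(r);		/* caller-specific mutation */
	release(r);
	return 0;
}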
141
142/**
143 * nilfs_sufile_alloc - allocate a segment
144 * @sufile: inode of segment usage file
145 * @segnump: pointer to segment number
146 *
147 * Description: nilfs_sufile_alloc() allocates a clean segment.
148 *
149 * Return Value: On success, 0 is returned and the segment number of the
150 * allocated segment is stored in the location pointed to by @segnump. On error, one
151 * of the following negative error codes is returned.
152 *
153 * %-EIO - I/O error.
154 *
155 * %-ENOMEM - Insufficient amount of memory available.
156 *
157 * %-ENOSPC - No clean segment left.
158 */
159int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
160{
161 struct buffer_head *header_bh, *su_bh;
162 struct nilfs_sufile_header *header;
163 struct nilfs_segment_usage *su;
164 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
165 __u64 segnum, maxsegnum, last_alloc;
166 void *kaddr;
167 unsigned long nsegments, ncleansegs, nsus;
168 int ret, i, j;
169
170 down_write(&NILFS_MDT(sufile)->mi_sem);
171
172 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
173 if (ret < 0)
174 goto out_sem;
175 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
176 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
177 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
178 last_alloc = le64_to_cpu(header->sh_last_alloc);
179 kunmap_atomic(kaddr, KM_USER0);
180
181 nsegments = nilfs_sufile_get_nsegments(sufile);
182 segnum = last_alloc + 1;
183 maxsegnum = nsegments - 1;
184 for (i = 0; i < nsegments; i += nsus) {
185 if (segnum >= nsegments) {
186 /* wrap around */
187 segnum = 0;
188 maxsegnum = last_alloc;
189 }
190 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
191 &su_bh);
192 if (ret < 0)
193 goto out_header;
194 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
195 su = nilfs_sufile_block_get_segment_usage(
196 sufile, segnum, su_bh, kaddr);
197
198 nsus = nilfs_sufile_segment_usages_in_block(
199 sufile, segnum, maxsegnum);
200 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
201 if (!nilfs_segment_usage_clean(su))
202 continue;
203 /* found a clean segment */
204 nilfs_segment_usage_set_dirty(su);
205 kunmap_atomic(kaddr, KM_USER0);
206
207 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
208 header = nilfs_sufile_block_get_header(
209 sufile, header_bh, kaddr);
210 le64_add_cpu(&header->sh_ncleansegs, -1);
211 le64_add_cpu(&header->sh_ndirtysegs, 1);
212 header->sh_last_alloc = cpu_to_le64(segnum);
213 kunmap_atomic(kaddr, KM_USER0);
214
215 nilfs_mdt_mark_buffer_dirty(header_bh);
216 nilfs_mdt_mark_buffer_dirty(su_bh);
217 nilfs_mdt_mark_dirty(sufile);
218 brelse(su_bh);
219 *segnump = segnum;
220 goto out_header;
221 }
222
223 kunmap_atomic(kaddr, KM_USER0);
224 brelse(su_bh);
225 }
226
227 /* no segments left */
228 ret = -ENOSPC;
229
230 out_header:
231 brelse(header_bh);
232
233 out_sem:
234 up_write(&NILFS_MDT(sufile)->mi_sem);
235 return ret;
236}
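As a usage sketch (the wrapper function is hypothetical, not part of this patch), a caller passes the sufile inode and receives the number of the newly dirtied segment on success:

/* Hypothetical call site; "sufile" is the segment usage file inode. */
static int example_alloc_segment(struct inode *sufile)
{
	__u64 segnum;
	int err;

	err = nilfs_sufile_alloc(sufile, &segnum);
	if (err == -ENOSPC)
		printk(KERN_WARNING "no clean segments left\n");
	if (err)
		return err;
	/* segnum now names a segment that has been marked dirty for us */
	return 0;
}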
237
238void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
239 struct buffer_head *header_bh,
240 struct buffer_head *su_bh)
241{
242 struct nilfs_segment_usage *su;
243 void *kaddr;
244
245 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
246 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
247 if (unlikely(!nilfs_segment_usage_clean(su))) {
248 printk(KERN_WARNING "%s: segment %llu must be clean\n",
249 __func__, (unsigned long long)segnum);
250 kunmap_atomic(kaddr, KM_USER0);
251 return;
252 }
253 nilfs_segment_usage_set_dirty(su);
254 kunmap_atomic(kaddr, KM_USER0);
255
256 nilfs_sufile_mod_counter(header_bh, -1, 1);
257 nilfs_mdt_mark_buffer_dirty(su_bh);
258 nilfs_mdt_mark_dirty(sufile);
259}
260
261void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
262 struct buffer_head *header_bh,
263 struct buffer_head *su_bh)
264{
265 struct nilfs_segment_usage *su;
266 void *kaddr;
267 int clean, dirty;
268
269 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
270 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
271 if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
272 su->su_nblocks == cpu_to_le32(0)) {
273 kunmap_atomic(kaddr, KM_USER0);
274 return;
275 }
276 clean = nilfs_segment_usage_clean(su);
277 dirty = nilfs_segment_usage_dirty(su);
278
279 /* make the segment garbage */
280 su->su_lastmod = cpu_to_le64(0);
281 su->su_nblocks = cpu_to_le32(0);
282 su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
283 kunmap_atomic(kaddr, KM_USER0);
284
285 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
286 nilfs_mdt_mark_buffer_dirty(su_bh);
287 nilfs_mdt_mark_dirty(sufile);
288}
289
290void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
291 struct buffer_head *header_bh,
292 struct buffer_head *su_bh)
293{
294 struct nilfs_segment_usage *su;
295 void *kaddr;
296 int sudirty;
297
298 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
299 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
300 if (nilfs_segment_usage_clean(su)) {
301 printk(KERN_WARNING "%s: segment %llu is already clean\n",
302 __func__, (unsigned long long)segnum);
303 kunmap_atomic(kaddr, KM_USER0);
304 return;
305 }
306 WARN_ON(nilfs_segment_usage_error(su));
307 WARN_ON(!nilfs_segment_usage_dirty(su));
308
309 sudirty = nilfs_segment_usage_dirty(su);
310 nilfs_segment_usage_set_clean(su);
311 kunmap_atomic(kaddr, KM_USER0);
312 nilfs_mdt_mark_buffer_dirty(su_bh);
313
314 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
315 nilfs_mdt_mark_dirty(sufile);
316}
317
318/**
319 * nilfs_sufile_get_segment_usage - get a segment usage
320 * @sufile: inode of segment usage file
321 * @segnum: segment number
322 * @sup: pointer to segment usage
323 * @bhp: pointer to buffer head
324 *
325 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
326 * specified by @segnum.
327 *
328 * Return Value: On success, 0 is returned, and the segment usage and the
329 * buffer head of the buffer on which the segment usage is located are stored
330 * in the place pointed by @sup and @bhp, respectively. On error, one of the
331 * following negative error codes is returned.
332 *
333 * %-EIO - I/O error.
334 *
335 * %-ENOMEM - Insufficient amount of memory available.
336 *
337 * %-EINVAL - Invalid segment usage number.
338 */
339int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
340 struct nilfs_segment_usage **sup,
341 struct buffer_head **bhp)
342{
343 struct buffer_head *bh;
344 struct nilfs_segment_usage *su;
345 void *kaddr;
346 int ret;
347
348	/* segnum is zero-based */
349 if (segnum >= nilfs_sufile_get_nsegments(sufile))
350 return -EINVAL;
351 down_write(&NILFS_MDT(sufile)->mi_sem);
352 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
353 if (ret < 0)
354 goto out_sem;
355 kaddr = kmap(bh->b_page);
356 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
357 if (nilfs_segment_usage_error(su)) {
358 kunmap(bh->b_page);
359 brelse(bh);
360 ret = -EINVAL;
361 goto out_sem;
362 }
363
364 if (sup != NULL)
365 *sup = su;
366 *bhp = bh;
367
368 out_sem:
369 up_write(&NILFS_MDT(sufile)->mi_sem);
370 return ret;
371}
372
373/**
374 * nilfs_sufile_put_segment_usage - put a segment usage
375 * @sufile: inode of segment usage file
376 * @segnum: segment number
377 * @bh: buffer head
378 *
379 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
380 * specified by @segnum. @bh must be the buffer head that was returned
381 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
382 */
383void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
384 struct buffer_head *bh)
385{
386 kunmap(bh->b_page);
387 brelse(bh);
388}
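nilfs_sufile_get_segment_usage() and nilfs_sufile_put_segment_usage() above are meant to bracket access to a single entry: the get kmap()s the page and takes a buffer reference, the put releases both. A sketch of the intended pattern (the wrapper function is illustrative):

static int example_inspect_usage(struct inode *sufile, __u64 segnum)
{
	struct nilfs_segment_usage *su;
	struct buffer_head *bh;
	int err;

	err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
	if (err)
		return err;

	/* *su stays valid while the page is mapped and bh is held */
	if (nilfs_segment_usage_dirty(su))
		printk(KERN_DEBUG "segment %llu is dirty\n",
		       (unsigned long long)segnum);

	nilfs_sufile_put_segment_usage(sufile, segnum, bh);
	return 0;
}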
389
390/**
391 * nilfs_sufile_get_stat - get segment usage statistics
392 * @sufile: inode of segment usage file
393 * @stat: pointer to a structure of segment usage statistics
394 *
395 * Description: nilfs_sufile_get_stat() returns information about segment
396 * usage.
397 *
398 * Return Value: On success, 0 is returned, and segment usage information is
399 * stored in the place pointed by @stat. On error, one of the following
400 * negative error codes is returned.
401 *
402 * %-EIO - I/O error.
403 *
404 * %-ENOMEM - Insufficient amount of memory available.
405 */
406int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
407{
408 struct buffer_head *header_bh;
409 struct nilfs_sufile_header *header;
410 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
411 void *kaddr;
412 int ret;
413
414 down_read(&NILFS_MDT(sufile)->mi_sem);
415
416 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
417 if (ret < 0)
418 goto out_sem;
419
420 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
421 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
422 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
423 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
424 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
425 sustat->ss_ctime = nilfs->ns_ctime;
426 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
427 spin_lock(&nilfs->ns_last_segment_lock);
428 sustat->ss_prot_seq = nilfs->ns_prot_seq;
429 spin_unlock(&nilfs->ns_last_segment_lock);
430 kunmap_atomic(kaddr, KM_USER0);
431 brelse(header_bh);
432
433 out_sem:
434 up_read(&NILFS_MDT(sufile)->mi_sem);
435 return ret;
436}
437
438/**
439 * nilfs_sufile_get_ncleansegs - get the number of clean segments
440 * @sufile: inode of segment usage file
441 * @nsegsp: pointer to the number of clean segments
442 *
443 * Description: nilfs_sufile_get_ncleansegs() retrieves the number of clean
444 * segments.
445 *
446 * Return Value: On success, 0 is returned and the number of clean segments is
447 * stored in the place pointed by @nsegsp. On error, one of the following
448 * negative error codes is returned.
449 *
450 * %-EIO - I/O error.
451 *
452 * %-ENOMEM - Insufficient amount of memory available.
453 */
454int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
455{
456 struct nilfs_sustat sustat;
457 int ret;
458
459 ret = nilfs_sufile_get_stat(sufile, &sustat);
460 if (ret == 0)
461 *nsegsp = sustat.ss_ncleansegs;
462 return ret;
463}
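A caller needing more than one counter should use nilfs_sufile_get_stat() directly rather than the wrapper above. A sketch (the reporting function is hypothetical):

static void example_report_stat(struct inode *sufile)
{
	struct nilfs_sustat sustat;

	if (nilfs_sufile_get_stat(sufile, &sustat) == 0)
		printk(KERN_DEBUG "segments: %llu total, %llu clean, %llu dirty\n",
		       (unsigned long long)sustat.ss_nsegs,
		       (unsigned long long)sustat.ss_ncleansegs,
		       (unsigned long long)sustat.ss_ndirtysegs);
}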
464
465void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
466 struct buffer_head *header_bh,
467 struct buffer_head *su_bh)
468{
469 struct nilfs_segment_usage *su;
470 void *kaddr;
471 int suclean;
472
473 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
474 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
475 if (nilfs_segment_usage_error(su)) {
476 kunmap_atomic(kaddr, KM_USER0);
477 return;
478 }
479 suclean = nilfs_segment_usage_clean(su);
480 nilfs_segment_usage_set_error(su);
481 kunmap_atomic(kaddr, KM_USER0);
482
483 if (suclean)
484 nilfs_sufile_mod_counter(header_bh, -1, 0);
485 nilfs_mdt_mark_buffer_dirty(su_bh);
486 nilfs_mdt_mark_dirty(sufile);
487}
488
489/**
490 * nilfs_sufile_get_suinfo - get segment usage information
491 * @sufile: inode of segment usage file
492 * @segnum: segment number to start looking
493 * @si: array of suinfo
494 * @nsi: size of suinfo array
495 *
496 * Description: nilfs_sufile_get_suinfo() reads usage information of up to
497 * @nsi segments starting at @segnum and stores it in the @si array.
498 * Return Value: The number of entries stored in @si on success. On error,
499 * one of the following negative error codes is returned.
500 *
501 * %-EIO - I/O error.
502 *
503 * %-ENOMEM - Insufficient amount of memory available.
504 */
505ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
506 struct nilfs_suinfo *si, size_t nsi)
507{
508 struct buffer_head *su_bh;
509 struct nilfs_segment_usage *su;
510 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
511 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
512 void *kaddr;
513 unsigned long nsegs, segusages_per_block;
514 ssize_t n;
515 int ret, i, j;
516
517 down_read(&NILFS_MDT(sufile)->mi_sem);
518
519 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
520 nsegs = min_t(unsigned long,
521 nilfs_sufile_get_nsegments(sufile) - segnum,
522 nsi);
523 for (i = 0; i < nsegs; i += n, segnum += n) {
524 n = min_t(unsigned long,
525 segusages_per_block -
526 nilfs_sufile_get_offset(sufile, segnum),
527 nsegs - i);
528 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
529 &su_bh);
530 if (ret < 0) {
531 if (ret != -ENOENT)
532 goto out;
533 /* hole */
534 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
535 continue;
536 }
537
538 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
539 su = nilfs_sufile_block_get_segment_usage(
540 sufile, segnum, su_bh, kaddr);
541 for (j = 0; j < n; j++, su = (void *)su + susz) {
542 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
546 if (nilfs_segment_is_active(nilfs, segnum + j))
547 si[i + j].sui_flags |=
548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
549 }
550 kunmap_atomic(kaddr, KM_USER0);
551 brelse(su_bh);
552 }
553 ret = nsegs;
554
555 out:
556 up_read(&NILFS_MDT(sufile)->mi_sem);
557 return ret;
558}
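Because the function returns the number of entries filled, it is naturally driven in a loop when walking the whole usage table. A sketch, with an arbitrary batch size:

static void example_dump_suinfo(struct inode *sufile)
{
	struct nilfs_suinfo si[8];	/* small batch; size is arbitrary */
	__u64 segnum = 0;
	ssize_t n, i;

	while ((n = nilfs_sufile_get_suinfo(sufile, segnum, si,
					    ARRAY_SIZE(si))) > 0) {
		for (i = 0; i < n; i++)
			printk(KERN_DEBUG "seg %llu: %u blocks, flags 0x%x\n",
			       (unsigned long long)(segnum + i),
			       si[i].sui_nblocks, si[i].sui_flags);
		segnum += n;	/* returns 0 once the table is exhausted */
	}
}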
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..a2e2efd4ade1
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,125 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_get_segment_usage(struct inode *, __u64,
40 struct nilfs_segment_usage **,
41 struct buffer_head **);
42void nilfs_sufile_put_segment_usage(struct inode *, __u64,
43 struct buffer_head *);
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
47 size_t);
48
49int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *);
61
62/**
63 * nilfs_sufile_cancel_free - cancel freeing of a segment
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description: nilfs_sufile_cancel_free() brings the segment specified by
68 * @segnum back from the clean state to the dirty state, cancelling a free.
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - make a segment garbage
84 * @sufile: inode of segment usage file
85 * @segnum: segment number to be freed
86 */
87static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
88{
89 return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
90}
91
92/**
93 * nilfs_sufile_free - free segment
94 * @sufile: inode of segment usage file
95 * @segnum: segment number to be freed
96 */
97static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
98{
99 return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
100}
101
102/**
103 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file
105 * @segnum: segment number
106 *
107 * Description: nilfs_sufile_set_error() marks the segment specified by
108 * @segnum as erroneous. The error segment will never be used again.
109 *
110 * Return Value: On success, 0 is returned. On error, one of the following
111 * negative error codes is returned.
112 *
113 * %-EIO - I/O error.
114 *
115 * %-ENOMEM - Insufficient amount of memory available.
116 *
117 * %-EINVAL - Invalid segment usage number.
118 */
119static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
120{
121 return nilfs_sufile_update(sufile, segnum, 0,
122 nilfs_sufile_do_set_error);
123}
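All four inline wrappers above funnel into nilfs_sufile_update(), which supplies the bounds check, locking, and buffer lookup; only the per-segment transformation differs. A new state change would follow the same shape (purely illustrative; nothing below is part of the patch):

/* Hypothetical dofunc: the signature is what nilfs_sufile_update() expects. */
static void nilfs_sufile_do_example(struct inode *sufile, __u64 segnum,
				    struct buffer_head *header_bh,
				    struct buffer_head *su_bh)
{
	/* map su_bh, edit the nilfs_segment_usage entry, adjust the header
	   counters with nilfs_sufile_mod_counter(), mark buffers dirty */
}

static inline int nilfs_sufile_example(struct inode *sufile, __u64 segnum)
{
	return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_example);
}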
124
125#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..6989b03e97ab
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1326 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_LICENSE("GPL");
67
68static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71
72/**
73 * nilfs_error() - report failure condition on a filesystem
74 *
75 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
76 * reporting an error message. It should be called when NILFS detects
77 * inconsistencies or corruption of on-disk metadata. For recoverable
78 * conditions such as a single-shot I/O error, nilfs_warning() or the
79 * plain printk() function should be used instead.
80 *
81 * The segment constructor must not call this function because it can
82 * kill itself.
83 */
84void nilfs_error(struct super_block *sb, const char *function,
85 const char *fmt, ...)
86{
87 struct nilfs_sb_info *sbi = NILFS_SB(sb);
88 va_list args;
89
90 va_start(args, fmt);
91 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
92 vprintk(fmt, args);
93 printk("\n");
94 va_end(args);
95
96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs;
98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS;
105 nilfs->ns_sbp[0]->s_state |=
106 cpu_to_le16(NILFS_ERROR_FS);
107 nilfs_commit_super(sbi, 1);
108 }
109 up_write(&nilfs->ns_sem);
110
111 if (nilfs_test_opt(sbi, ERRORS_RO)) {
112 printk(KERN_CRIT "Remounting filesystem read-only\n");
113 sb->s_flags |= MS_RDONLY;
114 }
115 }
116
117 if (nilfs_test_opt(sbi, ERRORS_PANIC))
118 panic("NILFS (device %s): panic forced after error\n",
119 sb->s_id);
120}
121
122void nilfs_warning(struct super_block *sb, const char *function,
123 const char *fmt, ...)
124{
125 va_list args;
126
127 va_start(args, fmt);
128 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
129 sb->s_id, function);
130 vprintk(fmt, args);
131 printk("\n");
132 va_end(args);
133}
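Both report helpers take the calling function's name explicitly, so call sites pass __func__. A hypothetical call site (the condition and messages are made up):

static void example_report(struct super_block *sb, struct inode *inode,
			   int corrupt)
{
	if (corrupt)
		nilfs_error(sb, __func__, "broken bmap (ino=%lu)",
			    (unsigned long)inode->i_ino);
	else
		nilfs_warning(sb, __func__, "recoverable I/O error (ino=%lu)",
			      (unsigned long)inode->i_ino);
}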
134
135static struct kmem_cache *nilfs_inode_cachep;
136
137struct inode *nilfs_alloc_inode(struct super_block *sb)
138{
139 struct nilfs_inode_info *ii;
140
141 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
142 if (!ii)
143 return NULL;
144 ii->i_bh = NULL;
145 ii->i_state = 0;
146 ii->vfs_inode.i_version = 1;
147 nilfs_btnode_cache_init(&ii->i_btnode_cache);
148 return &ii->vfs_inode;
149}
150
151void nilfs_destroy_inode(struct inode *inode)
152{
153 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
154}
155
156static void init_once(void *obj)
157{
158 struct nilfs_inode_info *ii = obj;
159
160 INIT_LIST_HEAD(&ii->i_dirty);
161#ifdef CONFIG_NILFS_XATTR
162 init_rwsem(&ii->xattr_sem);
163#endif
164 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
165 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
166 inode_init_once(&ii->vfs_inode);
167}
168
169static int nilfs_init_inode_cache(void)
170{
171 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
172 sizeof(struct nilfs_inode_info),
173 0, SLAB_RECLAIM_ACCOUNT,
174 init_once);
175
176 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
177}
178
179static inline void nilfs_destroy_inode_cache(void)
180{
181 kmem_cache_destroy(nilfs_inode_cachep);
182}
183
184static void nilfs_clear_inode(struct inode *inode)
185{
186 struct nilfs_inode_info *ii = NILFS_I(inode);
187
188#ifdef CONFIG_NILFS_POSIX_ACL
189 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
190 posix_acl_release(ii->i_acl);
191 ii->i_acl = NILFS_ACL_NOT_CACHED;
192 }
193 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
194 posix_acl_release(ii->i_default_acl);
195 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
196 }
197#endif
198 /*
199 * Free resources allocated in nilfs_read_inode(), here.
200 */
201 BUG_ON(!list_empty(&ii->i_dirty));
202 brelse(ii->i_bh);
203 ii->i_bh = NULL;
204
205 if (test_bit(NILFS_I_BMAP, &ii->i_state))
206 nilfs_bmap_clear(ii->i_bmap);
207
208 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
209}
210
211static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
212{
213 struct the_nilfs *nilfs = sbi->s_nilfs;
214 int err;
215 int barrier_done = 0;
216
217 if (nilfs_test_opt(sbi, BARRIER)) {
218 set_buffer_ordered(nilfs->ns_sbh[0]);
219 barrier_done = 1;
220 }
221 retry:
222 set_buffer_dirty(nilfs->ns_sbh[0]);
223 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
224 if (err == -EOPNOTSUPP && barrier_done) {
225 nilfs_warning(sbi->s_super, __func__,
226 "barrier-based sync failed. "
227 "disabling barriers\n");
228 nilfs_clear_opt(sbi, BARRIER);
229 barrier_done = 0;
230 clear_buffer_ordered(nilfs->ns_sbh[0]);
231 goto retry;
232 }
233 if (unlikely(err)) {
234 printk(KERN_ERR
235 "NILFS: unable to write superblock (err=%d)\n", err);
236 if (err == -EIO && nilfs->ns_sbh[1]) {
237 nilfs_fall_back_super_block(nilfs);
238 goto retry;
239 }
240 } else {
241 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
242
243 /*
244			 * The latest segment becomes traceable from the
245			 * position written in the superblock.
246 */
247 clear_nilfs_discontinued(nilfs);
248
249 /* update GC protection for recent segments */
250 if (nilfs->ns_sbh[1]) {
251 sbp = NULL;
252 if (dupsb) {
253 set_buffer_dirty(nilfs->ns_sbh[1]);
254 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
255 sbp = nilfs->ns_sbp[1];
256 }
257 }
258 if (sbp) {
259 spin_lock(&nilfs->ns_last_segment_lock);
260 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
261 spin_unlock(&nilfs->ns_last_segment_lock);
262 }
263 }
264
265 return err;
266}
267
268int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
269{
270 struct the_nilfs *nilfs = sbi->s_nilfs;
271 struct nilfs_super_block **sbp = nilfs->ns_sbp;
272 sector_t nfreeblocks;
273 time_t t;
274 int err;
275
276 /* nilfs->sem must be locked by the caller. */
277 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
278 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
279 nilfs_swap_super_block(nilfs);
280 else {
281			printk(KERN_CRIT "NILFS: superblock broken on dev %s\n",
282 sbi->s_super->s_id);
283 return -EIO;
284 }
285 }
286 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
287 if (unlikely(err)) {
288 printk(KERN_ERR "NILFS: failed to count free blocks\n");
289 return err;
290 }
291 spin_lock(&nilfs->ns_last_segment_lock);
292 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
293 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
294 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
295 spin_unlock(&nilfs->ns_last_segment_lock);
296
297 t = get_seconds();
298 nilfs->ns_sbwtime[0] = t;
299 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
300 sbp[0]->s_wtime = cpu_to_le64(t);
301 sbp[0]->s_sum = 0;
302 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
303 (unsigned char *)sbp[0],
304 nilfs->ns_sbsize));
305 if (dupsb && sbp[1]) {
306 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
307 nilfs->ns_sbwtime[1] = t;
308 }
309 sbi->s_super->s_dirt = 0;
310 return nilfs_sync_super(sbi, dupsb);
311}
312
313static void nilfs_put_super(struct super_block *sb)
314{
315 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs;
317
318 nilfs_detach_segment_constructor(sbi);
319
320 if (!(sb->s_flags & MS_RDONLY)) {
321 down_write(&nilfs->ns_sem);
322 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
323 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem);
325 }
326
327 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL;
330 sb->s_fs_info = NULL;
331 kfree(sbi);
332}
333
334/**
335 * nilfs_write_super - write super block(s) of NILFS
336 * @sb: super_block
337 *
338 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
339 * clears s_dirt. This function is called in the section protected by
340 * lock_super().
341 *
342 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
343 * of the struct the_nilfs. Lock order must be as follows:
344 *
345 * 1. lock_super()
346 * 2. down_write(&nilfs->ns_sem)
347 *
348 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
349 * of the super block (nilfs->ns_sbp[]).
350 *
351 * In most cases, VFS functions call lock_super() before calling these
352 * methods. So we must be careful not to bring on deadlocks when using
353 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
354 *
355 * Note that the order of lock_kernel() and lock_super() depends on
356 * the VFS context. Also note that lock_kernel() calls may nest, and
357 * only the outermost one has an effect.
358 */
359static void nilfs_write_super(struct super_block *sb)
360{
361 struct nilfs_sb_info *sbi = NILFS_SB(sb);
362 struct the_nilfs *nilfs = sbi->s_nilfs;
363
364 down_write(&nilfs->ns_sem);
365 if (!(sb->s_flags & MS_RDONLY)) {
366 struct nilfs_super_block **sbp = nilfs->ns_sbp;
367 u64 t = get_seconds();
368 int dupsb;
369
370 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
371 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
372 up_write(&nilfs->ns_sem);
373 return;
374 }
375 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
376 nilfs_commit_super(sbi, dupsb);
377 }
378 sb->s_dirt = 0;
379 up_write(&nilfs->ns_sem);
380}
381
382static int nilfs_sync_fs(struct super_block *sb, int wait)
383{
384 int err = 0;
385
386 /* This function is called when super block should be written back */
387 if (wait)
388 err = nilfs_construct_segment(sb);
389 return err;
390}
391
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
393{
394 struct the_nilfs *nilfs = sbi->s_nilfs;
395 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp;
397 int err;
398
399 down_write(&nilfs->ns_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem);
402
403 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
405 if (!sbi->s_ifile)
406 return -ENOMEM;
407
408 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
409 if (unlikely(err))
410 goto failed;
411
412 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
413 &bh_cp);
414 if (unlikely(err)) {
415 if (err == -ENOENT || err == -EINVAL) {
416 printk(KERN_ERR
417 "NILFS: Invalid checkpoint "
418 "(checkpoint number=%llu)\n",
419 (unsigned long long)cno);
420 err = -EINVAL;
421 }
422 goto failed;
423 }
424 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
425 if (unlikely(err))
426 goto failed_bh;
427 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
428 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
429
430 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
431 return 0;
432
433 failed_bh:
434 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
435 failed:
436 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL;
438
439 down_write(&nilfs->ns_sem);
440 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem);
442
443 return err;
444}
445
446void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
447{
448 struct the_nilfs *nilfs = sbi->s_nilfs;
449
450 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem);
454 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem);
456}
457
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
459{
460 struct the_nilfs *nilfs = sbi->s_nilfs;
461 int err = 0;
462
463 down_write(&nilfs->ns_sem);
464 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
465 nilfs->ns_mount_state |= NILFS_VALID_FS;
466 err = nilfs_commit_super(sbi, 1);
467 if (likely(!err))
468 printk(KERN_INFO "NILFS: recovery complete.\n");
469 }
470 up_write(&nilfs->ns_sem);
471 return err;
472}
473
474static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
475{
476 struct super_block *sb = dentry->d_sb;
477 struct nilfs_sb_info *sbi = NILFS_SB(sb);
478 struct the_nilfs *nilfs = sbi->s_nilfs;
479 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
480 unsigned long long blocks;
481 unsigned long overhead;
482 unsigned long nrsvblocks;
483 sector_t nfreeblocks;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489	 * The blocks before the first segment and after the last segment
490	 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499	 * When metadata blocks are placed outside the segment structure,
500	 * we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 buf->f_fsid.val[0] = (u32)id;
518 buf->f_fsid.val[1] = (u32)(id >> 32);
519
520 return 0;
521}
522
523static struct super_operations nilfs_sops = {
524 .alloc_inode = nilfs_alloc_inode,
525 .destroy_inode = nilfs_destroy_inode,
526 .dirty_inode = nilfs_dirty_inode,
527 /* .write_inode = nilfs_write_inode, */
528 /* .put_inode = nilfs_put_inode, */
529 /* .drop_inode = nilfs_drop_inode, */
530 .delete_inode = nilfs_delete_inode,
531 .put_super = nilfs_put_super,
532 .write_super = nilfs_write_super,
533 .sync_fs = nilfs_sync_fs,
534 /* .write_super_lockfs */
535 /* .unlockfs */
536 .statfs = nilfs_statfs,
537 .remount_fs = nilfs_remount,
538 .clear_inode = nilfs_clear_inode,
539 /* .umount_begin */
540 /* .show_options */
541};
542
543static struct inode *
544nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
545{
546 struct inode *inode;
547
548 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
549 ino != NILFS_SKETCH_INO)
550 return ERR_PTR(-ESTALE);
551
552 inode = nilfs_iget(sb, ino);
553 if (IS_ERR(inode))
554 return ERR_CAST(inode);
555 if (generation && inode->i_generation != generation) {
556 iput(inode);
557 return ERR_PTR(-ESTALE);
558 }
559
560 return inode;
561}
562
563static struct dentry *
564nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
565 int fh_type)
566{
567 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
568 nilfs_nfs_get_inode);
569}
570
571static struct dentry *
572nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
573 int fh_type)
574{
575 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
576 nilfs_nfs_get_inode);
577}
578
579static struct export_operations nilfs_export_ops = {
580 .fh_to_dentry = nilfs_fh_to_dentry,
581 .fh_to_parent = nilfs_fh_to_parent,
582 .get_parent = nilfs_get_parent,
583};
584
585enum {
586 Opt_err_cont, Opt_err_panic, Opt_err_ro,
587 Opt_barrier, Opt_snapshot, Opt_order,
588 Opt_err,
589};
590
591static match_table_t tokens = {
592 {Opt_err_cont, "errors=continue"},
593 {Opt_err_panic, "errors=panic"},
594 {Opt_err_ro, "errors=remount-ro"},
595 {Opt_barrier, "barrier=%s"},
596 {Opt_snapshot, "cp=%u"},
597 {Opt_order, "order=%s"},
598 {Opt_err, NULL}
599};
600
601static int match_bool(substring_t *s, int *result)
602{
603 int len = s->to - s->from;
604
605 if (strncmp(s->from, "on", len) == 0)
606 *result = 1;
607 else if (strncmp(s->from, "off", len) == 0)
608 *result = 0;
609 else
610 return 1;
611 return 0;
612}
613
614static int parse_options(char *options, struct super_block *sb)
615{
616 struct nilfs_sb_info *sbi = NILFS_SB(sb);
617 char *p;
618 substring_t args[MAX_OPT_ARGS];
619 int option;
620
621 if (!options)
622 return 1;
623
624 while ((p = strsep(&options, ",")) != NULL) {
625 int token;
626 if (!*p)
627 continue;
628
629 token = match_token(p, tokens, args);
630 switch (token) {
631 case Opt_barrier:
632 if (match_bool(&args[0], &option))
633 return 0;
634 if (option)
635 nilfs_set_opt(sbi, BARRIER);
636 else
637 nilfs_clear_opt(sbi, BARRIER);
638 break;
639 case Opt_order:
640 if (strcmp(args[0].from, "relaxed") == 0)
641 /* Ordered data semantics */
642 nilfs_clear_opt(sbi, STRICT_ORDER);
643 else if (strcmp(args[0].from, "strict") == 0)
644 /* Strict in-order semantics */
645 nilfs_set_opt(sbi, STRICT_ORDER);
646 else
647 return 0;
648 break;
649 case Opt_err_panic:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
651 break;
652 case Opt_err_ro:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
654 break;
655 case Opt_err_cont:
656 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
657 break;
658 case Opt_snapshot:
659 if (match_int(&args[0], &option) || option <= 0)
660 return 0;
661 if (!(sb->s_flags & MS_RDONLY))
662 return 0;
663 sbi->s_snapshot_cno = option;
664 nilfs_set_opt(sbi, SNAPSHOT);
665 break;
666 default:
667 printk(KERN_ERR
668 "NILFS: Unrecognized mount option \"%s\"\n", p);
669 return 0;
670 }
671 }
672 return 1;
673}
674
675static inline void
676nilfs_set_default_options(struct nilfs_sb_info *sbi,
677 struct nilfs_super_block *sbp)
678{
679 sbi->s_mount_opt =
680 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
681}
682
683static int nilfs_setup_super(struct nilfs_sb_info *sbi)
684{
685 struct the_nilfs *nilfs = sbi->s_nilfs;
686 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
687 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
688 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
689
690 /* nilfs->sem must be locked by the caller. */
691 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
692 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
693 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
694 printk(KERN_WARNING
695 "NILFS warning: mounting fs with errors\n");
696#if 0
697 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
698 printk(KERN_WARNING
699 "NILFS warning: maximal mount count reached\n");
700#endif
701 }
702 if (!max_mnt_count)
703 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
704
705 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
706 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
707 sbp->s_mtime = cpu_to_le64(get_seconds());
708 return nilfs_commit_super(sbi, 1);
709}
710
711struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
712 u64 pos, int blocksize,
713 struct buffer_head **pbh)
714{
715 unsigned long long sb_index = pos;
716 unsigned long offset;
717
718 offset = do_div(sb_index, blocksize);
719 *pbh = sb_bread(sb, sb_index);
720 if (!*pbh)
721 return NULL;
722 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
723}
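A sketch of a call site; it assumes NILFS_SB_OFFSET_BYTES is the primary super block's byte offset as defined in the on-disk format header, which is not shown in this hunk:

static int example_read_primary_sb(struct super_block *sb)
{
	struct buffer_head *bh;
	struct nilfs_super_block *sbp;

	sbp = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES,
				     sb->s_blocksize, &bh);
	if (!sbp)
		return -EIO;
	if (le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
		printk(KERN_WARNING "not a NILFS super block\n");
	brelse(bh);
	return 0;
}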
724
725int nilfs_store_magic_and_option(struct super_block *sb,
726 struct nilfs_super_block *sbp,
727 char *data)
728{
729 struct nilfs_sb_info *sbi = NILFS_SB(sb);
730
731 sb->s_magic = le16_to_cpu(sbp->s_magic);
732
733 /* FS independent flags */
734#ifdef NILFS_ATIME_DISABLE
735 sb->s_flags |= MS_NOATIME;
736#endif
737
738 nilfs_set_default_options(sbi, sbp);
739
740 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
741 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
742 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
743 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
744
745 return !parse_options(data, sb) ? -EINVAL : 0 ;
746}
747
748/**
749 * nilfs_fill_super() - initialize a super block instance
750 * @sb: super_block
751 * @data: mount options
752 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct
754 *
755 * This function is called exclusively under bd_mount_sem, so the
756 * recovery process is protected from other simultaneous mounts.
757 */
758static int
759nilfs_fill_super(struct super_block *sb, void *data, int silent,
760 struct the_nilfs *nilfs)
761{
762 struct nilfs_sb_info *sbi;
763 struct inode *root;
764 __u64 cno;
765 int err;
766
767 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
768 if (!sbi)
769 return -ENOMEM;
770
771 sb->s_fs_info = sbi;
772
773 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb;
776
777 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err)
779 goto failed_sbi;
780
781 spin_lock_init(&sbi->s_inode_lock);
782 INIT_LIST_HEAD(&sbi->s_dirty_files);
783 INIT_LIST_HEAD(&sbi->s_list);
784
785 /*
786	 * The following initialization is redundant because the
787	 * nilfs_sb_info structure was already cleared when allocated.
788	 * We keep it to make the intent explicit and to stay ready
789	 * for future changes.
790 */
791 get_random_bytes(&sbi->s_next_generation,
792 sizeof(sbi->s_next_generation));
793 spin_lock_init(&sbi->s_next_gen_lock);
794
795 sb->s_op = &nilfs_sops;
796 sb->s_export_op = &nilfs_export_ops;
797 sb->s_root = NULL;
798 sb->s_time_gran = 1;
799
800 if (!nilfs_loaded(nilfs)) {
801 err = load_nilfs(nilfs, sbi);
802 if (err)
803 goto failed_sbi;
804 }
805 cno = nilfs_last_cno(nilfs);
806
807 if (sb->s_flags & MS_RDONLY) {
808 if (nilfs_test_opt(sbi, SNAPSHOT)) {
809 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
810 sbi->s_snapshot_cno);
811 if (err < 0)
812 goto failed_sbi;
813 if (!err) {
814 printk(KERN_ERR
815 "NILFS: The specified checkpoint is "
816 "not a snapshot "
817 "(checkpoint number=%llu).\n",
818 (unsigned long long)sbi->s_snapshot_cno);
819 err = -EINVAL;
820 goto failed_sbi;
821 }
822 cno = sbi->s_snapshot_cno;
823 } else
824 /* Read-only mount */
825 sbi->s_snapshot_cno = cno;
826 }
827
828 err = nilfs_attach_checkpoint(sbi, cno);
829 if (err) {
830 printk(KERN_ERR "NILFS: error loading a checkpoint"
831 " (checkpoint number=%llu).\n", (unsigned long long)cno);
832 goto failed_sbi;
833 }
834
835 if (!(sb->s_flags & MS_RDONLY)) {
836 err = nilfs_attach_segment_constructor(sbi);
837 if (err)
838 goto failed_checkpoint;
839 }
840
841 root = nilfs_iget(sb, NILFS_ROOT_INO);
842 if (IS_ERR(root)) {
843 printk(KERN_ERR "NILFS: get root inode failed\n");
844 err = PTR_ERR(root);
845 goto failed_segctor;
846 }
847 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
848 iput(root);
849 printk(KERN_ERR "NILFS: corrupt root inode.\n");
850 err = -EINVAL;
851 goto failed_segctor;
852 }
853 sb->s_root = d_alloc_root(root);
854 if (!sb->s_root) {
855 iput(root);
856 printk(KERN_ERR "NILFS: get root dentry failed\n");
857 err = -ENOMEM;
858 goto failed_segctor;
859 }
860
861 if (!(sb->s_flags & MS_RDONLY)) {
862 down_write(&nilfs->ns_sem);
863 nilfs_setup_super(sbi);
864 up_write(&nilfs->ns_sem);
865 }
866
867 err = nilfs_mark_recovery_complete(sbi);
868 if (unlikely(err)) {
869 printk(KERN_ERR "NILFS: recovery failed.\n");
870 goto failed_root;
871 }
872
873 return 0;
874
875 failed_root:
876 dput(sb->s_root);
877 sb->s_root = NULL;
878
879 failed_segctor:
880 nilfs_detach_segment_constructor(sbi);
881
882 failed_checkpoint:
883 nilfs_detach_checkpoint(sbi);
884
885 failed_sbi:
886 put_nilfs(nilfs);
887 sb->s_fs_info = NULL;
888 kfree(sbi);
889 return err;
890}
891
892static int nilfs_remount(struct super_block *sb, int *flags, char *data)
893{
894 struct nilfs_sb_info *sbi = NILFS_SB(sb);
895 struct nilfs_super_block *sbp;
896 struct the_nilfs *nilfs = sbi->s_nilfs;
897 unsigned long old_sb_flags;
898 struct nilfs_mount_options old_opts;
899 int err;
900
901 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno;
904
905 if (!parse_options(data, sb)) {
906 err = -EINVAL;
907 goto restore_opts;
908 }
909 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
910
911 if ((*flags & MS_RDONLY) &&
912 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
913 printk(KERN_WARNING "NILFS (device %s): couldn't "
914 "remount to a different snapshot. \n",
915 sb->s_id);
916 err = -EINVAL;
917 goto restore_opts;
918 }
919
920 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
921 goto out;
922 if (*flags & MS_RDONLY) {
923 /* Shutting down the segment constructor */
924 nilfs_detach_segment_constructor(sbi);
925 sb->s_flags |= MS_RDONLY;
926
927 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
928 /* nilfs_set_opt(sbi, SNAPSHOT); */
929
930 /*
931 * Remounting a valid RW partition RDONLY, so set
932 * the RDONLY flag and then mark the partition as valid again.
933 */
934 down_write(&nilfs->ns_sem);
935 sbp = nilfs->ns_sbp[0];
936 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
937 (nilfs->ns_mount_state & NILFS_VALID_FS))
938 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
939 sbp->s_mtime = cpu_to_le64(get_seconds());
940 nilfs_commit_super(sbi, 1);
941 up_write(&nilfs->ns_sem);
942 } else {
943 /*
944		 * Remounting a read-only partition read-write, so reread
945		 * and store the current valid flag. (It may have been
946		 * changed by fsck since we originally mounted the partition.)
947 */
948 down(&sb->s_bdev->bd_mount_sem);
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n",
953 sb->s_id);
954 err = -EBUSY;
955 goto rw_remount_failed;
956 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't "
959 "remount because the current RO-mount is not "
960 "the latest one.\n",
961 sb->s_id);
962 err = -EINVAL;
963 goto rw_remount_failed;
964 }
965 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT);
967 sbi->s_snapshot_cno = 0;
968
969 err = nilfs_attach_segment_constructor(sbi);
970 if (err)
971 goto rw_remount_failed;
972
973 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem);
976
977 up(&sb->s_bdev->bd_mount_sem);
978 }
979 out:
980 return 0;
981
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts:
985 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno;
988 return err;
989}
990
991struct nilfs_super_data {
992 struct block_device *bdev;
993 __u64 cno;
994 int flags;
995};
996
997/**
998 * nilfs_identify - pre-read mount options needed to identify mount instance
999 * @data: mount options
1000 * @sd: nilfs_super_data
1001 */
1002static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1003{
1004 char *p, *options = data;
1005 substring_t args[MAX_OPT_ARGS];
1006 int option, token;
1007 int ret = 0;
1008
1009 do {
1010 p = strsep(&options, ",");
1011 if (p != NULL && *p) {
1012 token = match_token(p, tokens, args);
1013 if (token == Opt_snapshot) {
1014 if (!(sd->flags & MS_RDONLY))
1015 ret++;
1016 else {
1017 ret = match_int(&args[0], &option);
1018 if (!ret) {
1019 if (option > 0)
1020 sd->cno = option;
1021 else
1022 ret++;
1023 }
1024 }
1025 }
1026 if (ret)
1027 printk(KERN_ERR
1028 "NILFS: invalid mount option: %s\n", p);
1029 }
1030 if (!options)
1031 break;
1032 BUG_ON(options == data);
1033 *(options - 1) = ',';
1034 } while (!ret);
1035 return ret;
1036}
1037
1038static int nilfs_set_bdev_super(struct super_block *s, void *data)
1039{
1040 struct nilfs_super_data *sd = data;
1041
1042 s->s_bdev = sd->bdev;
1043 s->s_dev = s->s_bdev->bd_dev;
1044 return 0;
1045}
1046
1047static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{
1049 struct nilfs_super_data *sd = data;
1050
1051 return s->s_bdev == sd->bdev;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() causes deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078}
1079
1080static int
1081nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt)
1083{
1084 struct nilfs_super_data sd;
1085 struct super_block *s, *s2;
1086 struct the_nilfs *nilfs = NULL;
1087 int err, need_to_close = 1;
1088
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1090 if (IS_ERR(sd.bdev))
1091 return PTR_ERR(sd.bdev);
1092
1093 /*
1094	 * To get a mount instance using the sget() VFS routine, NILFS needs
1095	 * much more information than normal filesystems to identify a mount
1096	 * instance. For snapshot mounts, not only a mount type (ro-mount
1097 * or rw-mount) but also a checkpoint number is required.
1098 * The results are passed in sget() using nilfs_super_data.
1099 */
1100 sd.cno = 0;
1101 sd.flags = flags;
1102 if (nilfs_identify((char *)data, &sd)) {
1103 err = -EINVAL;
1104 goto failed;
1105 }
1106
1107 /*
1108 * once the super is inserted into the list by sget, s_umount
1109 * will protect the lockfs code from trying to start a snapshot
1110 * while we are mounting
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 }
1118
1119 /*
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133
1134 /*
1135		 * s_umount protects the super_block from the unmount process;
1136		 * it covers the nilfs_sb_info and the_nilfs pointers.
1137 */
1138 nilfs = sbi->s_nilfs;
1139 get_nilfs(nilfs);
1140 up_write(&s->s_umount);
1141
1142 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block
1144 */
1145 if (!sd.cno)
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148
1149 s2 = sget(fs_type, nilfs_test_bdev_super2,
1150 nilfs_set_bdev_super, &sd);
1151 deactivate_super(s);
1152 /*
1153		 * deactivate_super() invokes close_bdev_exclusive() only via
1154		 * kill_block_super(). Here, s is an existing mount, so we
1155		 * still need one more close_bdev_exclusive() call.
1156 */
1157 s = s2;
1158 if (IS_ERR(s))
1159 goto error_s;
1160 }
1161
1162 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE];
1164
1165 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev));
1168
1169 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1170 if (err)
1171 goto cancel_new;
1172
1173 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 }
1178
1179 up(&sd.bdev->bd_mount_sem);
1180 put_nilfs(nilfs);
1181 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s);
1184 return 0;
1185
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem);
1195 failed:
1196 close_bdev_exclusive(sd.bdev, flags);
1197
1198 return err;
1199
1200 cancel_new:
1201 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem);
1203 if (nilfs)
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount);
1206 deactivate_super(s);
1207 /*
1208 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all post-cleaning before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device.
1211 */
1212 return err;
1213}
1214
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not a good idea */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * @fs_type: filesystem type
1243 * @bdev: block device
1244 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 *
1246 * This function must be called within a section protected by bd_mount_sem.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE,
1268 .name = "nilfs2",
1269 .get_sb = nilfs_get_sb,
1270 .kill_sb = kill_block_super,
1271 .fs_flags = FS_REQUIRES_DEV,
1272};
1273
1274static int __init init_nilfs_fs(void)
1275{
1276 int err;
1277
1278 err = nilfs_init_inode_cache();
1279 if (err)
1280 goto failed;
1281
1282 err = nilfs_init_transaction_cache();
1283 if (err)
1284 goto failed_inode_cache;
1285
1286 err = nilfs_init_segbuf_cache();
1287 if (err)
1288 goto failed_transaction_cache;
1289
1290 err = nilfs_btree_path_cache_init();
1291 if (err)
1292 goto failed_segbuf_cache;
1293
1294 err = register_filesystem(&nilfs_fs_type);
1295 if (err)
1296 goto failed_btree_path_cache;
1297
1298 return 0;
1299
1300 failed_btree_path_cache:
1301 nilfs_btree_path_cache_destroy();
1302
1303 failed_segbuf_cache:
1304 nilfs_destroy_segbuf_cache();
1305
1306 failed_transaction_cache:
1307 nilfs_destroy_transaction_cache();
1308
1309 failed_inode_cache:
1310 nilfs_destroy_inode_cache();
1311
1312 failed:
1313 return err;
1314}
1315
1316static void __exit exit_nilfs_fs(void)
1317{
1318 nilfs_destroy_segbuf_cache();
1319 nilfs_destroy_transaction_cache();
1320 nilfs_destroy_inode_cache();
1321 nilfs_btree_path_cache_destroy();
1322 unregister_filesystem(&nilfs_fs_type);
1323}
1324
1325module_init(init_nilfs_fs)
1326module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..7f65b3be4aa9
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,641 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92	 * An increment of ns_count never occurs below because any
93	 * caller of get_nilfs() already holds a reference to the_nilfs,
94	 * so no exclusion control is required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 static struct lock_class_key dat_lock_key;
119 struct buffer_head *bh_sr;
120 struct nilfs_super_root *raw_sr;
121 struct nilfs_super_block **sbp = nilfs->ns_sbp;
122 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
123 unsigned inode_size;
124 int err;
125
126 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
127 if (unlikely(err))
128 return err;
129
130 down_read(&nilfs->ns_sem);
131 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
132 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
133 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
134 up_read(&nilfs->ns_sem);
135
136 inode_size = nilfs->ns_inode_size;
137
138 err = -ENOMEM;
139 nilfs->ns_dat = nilfs_mdt_new(
140 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
141 if (unlikely(!nilfs->ns_dat))
142 goto failed;
143
144 nilfs->ns_gc_dat = nilfs_mdt_new(
145 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
146 if (unlikely(!nilfs->ns_gc_dat))
147 goto failed_dat;
148
149 nilfs->ns_cpfile = nilfs_mdt_new(
150 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
151 if (unlikely(!nilfs->ns_cpfile))
152 goto failed_gc_dat;
153
154 nilfs->ns_sufile = nilfs_mdt_new(
155 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
156 if (unlikely(!nilfs->ns_sufile))
157 goto failed_cpfile;
158
159 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
160 if (unlikely(err))
161 goto failed_sufile;
162
163 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
164 if (unlikely(err))
165 goto failed_sufile;
166
167 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
168 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
169
170 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
171 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
172 sizeof(struct nilfs_cpfile_header));
173 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
174 sizeof(struct nilfs_sufile_header));
175
176 err = nilfs_mdt_read_inode_direct(
177 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
178 if (unlikely(err))
179 goto failed_sufile;
180
181 err = nilfs_mdt_read_inode_direct(
182 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
183 if (unlikely(err))
184 goto failed_sufile;
185
186 err = nilfs_mdt_read_inode_direct(
187 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
188 if (unlikely(err))
189 goto failed_sufile;
190
191 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
192 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
193
194 failed:
195 brelse(bh_sr);
196 return err;
197
198 failed_sufile:
199 nilfs_mdt_destroy(nilfs->ns_sufile);
200
201 failed_cpfile:
202 nilfs_mdt_destroy(nilfs->ns_cpfile);
203
204 failed_gc_dat:
205 nilfs_mdt_destroy(nilfs->ns_gc_dat);
206
207 failed_dat:
208 nilfs_mdt_destroy(nilfs->ns_dat);
209 goto failed;
210}
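/*
 * Editorial note: the success path above intentionally falls through to
 * the "failed" label -- bh_sr must be released whether the load succeeded
 * or not, and err is 0 at that point on success.  The failed_* labels
 * unwind the metadata file allocations in reverse order of creation and
 * then jump back up to release bh_sr as well.
 */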
211
212static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
213{
214 memset(ri, 0, sizeof(*ri));
215 INIT_LIST_HEAD(&ri->ri_used_segments);
216}
217
218static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
219{
220 nilfs_dispose_segment_list(&ri->ri_used_segments);
221}
222
223/**
224 * load_nilfs - load and recover the nilfs
 225 * @nilfs: the_nilfs structure to be loaded
 226 * @sbi: nilfs_sb_info used to recover past segments
227 *
 228 * load_nilfs() searches for and loads the latest super root,
 229 * attaches the last segment, and does recovery if needed.
 230 * The caller must serialize calls to guard against simultaneous mounts.
231 */
232int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
233{
234 struct nilfs_recovery_info ri;
235 unsigned int s_flags = sbi->s_super->s_flags;
236 int really_read_only = bdev_read_only(nilfs->ns_bdev);
237 unsigned valid_fs;
238 int err = 0;
239
240 nilfs_init_recovery_info(&ri);
241
242 down_write(&nilfs->ns_sem);
243 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
244 up_write(&nilfs->ns_sem);
245
246 if (!valid_fs && (s_flags & MS_RDONLY)) {
 247 printk(KERN_INFO "NILFS: recovery "
 248 "required for readonly filesystem.\n");
249 if (really_read_only) {
250 printk(KERN_ERR "NILFS: write access "
251 "unavailable, cannot proceed.\n");
252 err = -EROFS;
253 goto failed;
254 }
255 printk(KERN_INFO "NILFS: write access will "
256 "be enabled during recovery.\n");
257 sbi->s_super->s_flags &= ~MS_RDONLY;
258 }
259
260 err = nilfs_search_super_root(nilfs, sbi, &ri);
261 if (unlikely(err)) {
262 printk(KERN_ERR "NILFS: error searching super root.\n");
263 goto failed;
264 }
265
266 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
267 if (unlikely(err)) {
268 printk(KERN_ERR "NILFS: error loading super root.\n");
269 goto failed;
270 }
271
272 if (!valid_fs) {
273 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
274 if (unlikely(err)) {
275 nilfs_mdt_destroy(nilfs->ns_cpfile);
276 nilfs_mdt_destroy(nilfs->ns_sufile);
277 nilfs_mdt_destroy(nilfs->ns_dat);
278 goto failed;
279 }
280 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
281 sbi->s_super->s_dirt = 1;
282 }
283
284 set_nilfs_loaded(nilfs);
285
286 failed:
287 nilfs_clear_recovery_info(&ri);
288 sbi->s_super->s_flags = s_flags;
289 return err;
290}
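/*
 * Editorial note: s_flags is restored unconditionally on the way out, so
 * the temporary clearing of MS_RDONLY above (done only to let recovery
 * write to an otherwise read-only mount) is undone whether load_nilfs()
 * succeeds or fails.
 */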
291
292static unsigned long long nilfs_max_size(unsigned int blkbits)
293{
294 unsigned int max_bits;
295 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
296
297 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
298 if (max_bits < 64)
299 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
300 return res;
301}
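/*
 * Editorial note: a worked example with an assumed key width.  If
 * NILFS_BMAP_KEY_BIT were 37 and the block size 4 KiB (blkbits = 12),
 * max_bits would be 49 and s_maxbytes would be capped at 2^49 - 1 bytes;
 * once blkbits + NILFS_BMAP_KEY_BIT reaches 64, only the page-cache
 * limit MAX_LFS_FILESIZE applies.
 */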
302
303static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
304 struct nilfs_super_block *sbp)
305{
306 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
307 printk(KERN_ERR "NILFS: revision mismatch "
308 "(superblock rev.=%d.%d, current rev.=%d.%d). "
309 "Please check the version of mkfs.nilfs.\n",
310 le32_to_cpu(sbp->s_rev_level),
311 le16_to_cpu(sbp->s_minor_rev_level),
312 NILFS_CURRENT_REV, NILFS_MINOR_REV);
313 return -EINVAL;
314 }
315 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
316 if (nilfs->ns_sbsize > BLOCK_SIZE)
317 return -EINVAL;
318
319 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
320 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
321
322 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
323 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
 324 printk(KERN_ERR "NILFS: segment too short.\n");
325 return -EINVAL;
326 }
327
328 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
329 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
330 nilfs->ns_r_segments_percentage =
331 le32_to_cpu(sbp->s_r_segments_percentage);
332 nilfs->ns_nrsvsegs =
333 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
334 DIV_ROUND_UP(nilfs->ns_nsegments *
335 nilfs->ns_r_segments_percentage, 100));
336 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
337 return 0;
338}
339
340static int nilfs_valid_sb(struct nilfs_super_block *sbp)
341{
342 static unsigned char sum[4];
343 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
344 size_t bytes;
345 u32 crc;
346
347 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
348 return 0;
349 bytes = le16_to_cpu(sbp->s_bytes);
350 if (bytes > BLOCK_SIZE)
351 return 0;
352 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
353 sumoff);
354 crc = crc32_le(crc, sum, 4);
355 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
356 bytes - sumoff - 4);
357 return crc == le32_to_cpu(sbp->s_sum);
358}
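/*
 * Editorial note: the CRC is computed over the first s_bytes bytes of the
 * superblock with the 4-byte s_sum field treated as zero, which is why the
 * crc32_le() chain above splices the static all-zero sum[] array in at
 * sumoff before comparing the result against the stored s_sum.
 */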
359
360static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
361{
362 return offset < ((le64_to_cpu(sbp->s_nsegments) *
363 le32_to_cpu(sbp->s_blocks_per_segment)) <<
364 (le32_to_cpu(sbp->s_log_block_size) + 10));
365}
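/*
 * Editorial note: s_log_block_size + 10 converts blocks to bytes, since
 * the base block size is 1 KiB (BLOCK_SIZE), so the right-hand side is
 * the total segment-area size in bytes described by @sbp.  An sb2 offset
 * that falls inside that area is "bad": it suggests the superblock
 * describes a device of a different size than the one it was read from.
 */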
366
367static void nilfs_release_super_block(struct the_nilfs *nilfs)
368{
369 int i;
370
371 for (i = 0; i < 2; i++) {
372 if (nilfs->ns_sbp[i]) {
373 brelse(nilfs->ns_sbh[i]);
374 nilfs->ns_sbh[i] = NULL;
375 nilfs->ns_sbp[i] = NULL;
376 }
377 }
378}
379
380void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
381{
382 brelse(nilfs->ns_sbh[0]);
383 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
384 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
385 nilfs->ns_sbh[1] = NULL;
386 nilfs->ns_sbp[1] = NULL;
387}
388
389void nilfs_swap_super_block(struct the_nilfs *nilfs)
390{
391 struct buffer_head *tsbh = nilfs->ns_sbh[0];
392 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
393
394 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
395 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
396 nilfs->ns_sbh[1] = tsbh;
397 nilfs->ns_sbp[1] = tsbp;
398}
399
400static int nilfs_load_super_block(struct the_nilfs *nilfs,
401 struct super_block *sb, int blocksize,
402 struct nilfs_super_block **sbpp)
403{
404 struct nilfs_super_block **sbp = nilfs->ns_sbp;
405 struct buffer_head **sbh = nilfs->ns_sbh;
406 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
407 int valid[2], swp = 0;
408
409 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
410 &sbh[0]);
411 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
412
413 if (!sbp[0]) {
414 if (!sbp[1]) {
415 printk(KERN_ERR "NILFS: unable to read superblock\n");
416 return -EIO;
417 }
418 printk(KERN_WARNING
419 "NILFS warning: unable to read primary superblock\n");
420 } else if (!sbp[1])
421 printk(KERN_WARNING
422 "NILFS warning: unable to read secondary superblock\n");
423
424 valid[0] = nilfs_valid_sb(sbp[0]);
425 valid[1] = nilfs_valid_sb(sbp[1]);
426 swp = valid[1] &&
427 (!valid[0] ||
428 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
429
430 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
431 brelse(sbh[1]);
432 sbh[1] = NULL;
433 sbp[1] = NULL;
434 swp = 0;
435 }
436 if (!valid[swp]) {
437 nilfs_release_super_block(nilfs);
438 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
439 sb->s_id);
440 return -EINVAL;
441 }
442
443 if (swp) {
444 printk(KERN_WARNING "NILFS warning: broken superblock. "
445 "using spare superblock.\n");
446 nilfs_swap_super_block(nilfs);
447 }
448
449 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
450 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
451 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
452 *sbpp = sbp[0];
453 return 0;
454}
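/*
 * Editorial note on the selection policy above: both superblock copies
 * are read, the checksum-valid copy with the newer s_wtime wins, and the
 * secondary copy is dropped entirely when the computed sb2 offset falls
 * inside the segment area described by the chosen copy.  A swap to the
 * spare copy is reported so the user knows the primary was broken.
 */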
455
456/**
457 * init_nilfs - initialize a NILFS instance.
458 * @nilfs: the_nilfs structure
 459 * @sbi: nilfs_sb_info (the super block is taken from sbi->s_super)
 460 * @data: mount options
462 *
463 * init_nilfs() performs common initialization per block device (e.g.
464 * reading the super block, getting disk layout information, initializing
 465 * shared fields in the_nilfs). It takes on part of the jobs
 466 * typically done by a fill_super() routine. This division arises
 467 * because multiple NILFS instances may be simultaneously
 468 * mounted on the same device.
 469 * For multiple mounts on the same device, only the first mount
 470 * performs these tasks.
471 *
472 * Return Value: On success, 0 is returned. On error, a negative error
473 * code is returned.
474 */
475int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
476{
477 struct super_block *sb = sbi->s_super;
478 struct nilfs_super_block *sbp;
479 struct backing_dev_info *bdi;
480 int blocksize;
481 int err;
482
483 down_write(&nilfs->ns_sem);
484 if (nilfs_init(nilfs)) {
485 /* Load values from existing the_nilfs */
486 sbp = nilfs->ns_sbp[0];
487 err = nilfs_store_magic_and_option(sb, sbp, data);
488 if (err)
489 goto out;
490
491 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
492 if (sb->s_blocksize != blocksize &&
493 !sb_set_blocksize(sb, blocksize)) {
494 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
495 blocksize);
496 err = -EINVAL;
497 }
498 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
499 goto out;
500 }
501
502 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
503 if (!blocksize) {
504 printk(KERN_ERR "NILFS: unable to set blocksize\n");
505 err = -EINVAL;
506 goto out;
507 }
508 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
509 if (err)
510 goto out;
511
512 err = nilfs_store_magic_and_option(sb, sbp, data);
513 if (err)
514 goto failed_sbh;
515
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
519
520 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR
522 "NILFS: blocksize %d too small for device "
523 "(sector-size = %d).\n",
524 blocksize, hw_blocksize);
525 err = -EINVAL;
526 goto failed_sbh;
527 }
528 nilfs_release_super_block(nilfs);
529 sb_set_blocksize(sb, blocksize);
530
531 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
532 if (err)
533 goto out;
534 /* not failed_sbh; sbh is released automatically
535 when reloading fails. */
536 }
537 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
538
539 err = nilfs_store_disk_layout(nilfs, sbp);
540 if (err)
541 goto failed_sbh;
542
543 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
544
545 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
546
547 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
548 if (!bdi)
549 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
550 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
551
552 /* Finding last segment */
553 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
554 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
555 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
556
557 nilfs->ns_seg_seq = nilfs->ns_last_seq;
558 nilfs->ns_segnum =
559 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
560 nilfs->ns_cno = nilfs->ns_last_cno + 1;
561 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
 562 printk(KERN_ERR "NILFS: invalid last segment number.\n");
563 err = -EINVAL;
564 goto failed_sbh;
565 }
566 /* Dummy values */
567 nilfs->ns_free_segments_count =
568 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
569
570 /* Initialize gcinode cache */
571 err = nilfs_init_gccache(nilfs);
572 if (err)
573 goto failed_sbh;
574
575 set_nilfs_init(nilfs);
576 err = 0;
577 out:
578 up_write(&nilfs->ns_sem);
579 return err;
580
581 failed_sbh:
582 nilfs_release_super_block(nilfs);
583 goto out;
584}
585
586int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
587{
588 struct inode *dat = nilfs_dat_inode(nilfs);
589 unsigned long ncleansegs;
590 int err;
591
592 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
593 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
594 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
595 if (likely(!err))
596 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
597 return err;
598}
599
600int nilfs_near_disk_full(struct the_nilfs *nilfs)
601{
602 struct inode *sufile = nilfs->ns_sufile;
603 unsigned long ncleansegs, nincsegs;
604 int ret;
605
606 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
607 if (likely(!ret)) {
608 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
609 nilfs->ns_blocks_per_segment + 1;
610 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
611 ret++;
612 }
613 return ret;
614}
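/*
 * Editorial note: a worked example with assumed geometry.  With
 * ns_blocks_per_segment = 2048, ns_nrsvsegs = 8 and 5000 dirty blocks,
 * nincsegs = 5000 / 2048 + 1 = 3 segments of pending writeout, so the
 * function reports "near full" once 11 or fewer clean segments remain.
 */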
615
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount)
618{
619 struct nilfs_sb_info *sbi;
620 int ret = 0;
621
622 down_read(&nilfs->ns_sem);
623 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock;
625
626 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
627 if (sbi->s_snapshot_cno == cno &&
628 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
629 /* exclude read-only mounts */
630 ret++;
631 break;
632 }
633 }
634 /* for protecting recent checkpoints */
635 if (cno >= nilfs_last_cno(nilfs))
636 ret++;
637
638 out_unlock:
639 up_read(&nilfs->ns_sem);
640 return ret;
641}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and
 38 the latest checkpoint was loaded */
 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain is broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
 * @ns_mount_state: file system state loaded from the super block
 56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
 114 * The following fields are dedicated to a writable FS instance.
 115 * Except while searching for a checkpoint, code outside the segment
 116 * constructor must hold the segment semaphore while accessing these
 117 * fields.
 118 * Only one writable FS instance exists during the lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
 144 * The following fields are lock-free except during the period
 145 * before the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
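/*
 * Editorial note: as an example, THE_NILFS_FNS(INIT, init) above expands
 * to set_nilfs_init(), clear_nilfs_init() and nilfs_init(), which set,
 * clear and test the THE_NILFS_INIT bit in ns_flags respectively.
 */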
188
189/* Minimum interval of periodical update of superblocks (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
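/*
 * Editorial note: ns_writer_refcount starts at -1 (see alloc_nilfs()), so
 * atomic_inc_and_test() above fires only for the first of the concurrent
 * getters, which then takes ns_writer_mutex, and atomic_add_negative()
 * fires only for the last putter, which releases it:
 *
 *	refcount  -1 --get--> 0 (mutex taken) --get--> 1
 *	           1 --put--> 0 --put--> -1 (mutex released)
 */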
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
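/*
 * Editorial note: a worked example with assumed geometry.  With
 * ns_blocks_per_segment = 2048 and ns_first_data_block = 64, segment 3
 * spans blocks 6144..8191, while segment 0 spans blocks 64..2047 because
 * its head is shortened by the area preceding the first data block.
 */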
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
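/*
 * Editorial note: sector_div() is used rather than the '/' operator
 * because sector_t may be a 64-bit type on 32-bit kernels, where plain
 * 64-bit division is unavailable; it divides segnum in place and returns
 * the remainder, which is ignored here.
 */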
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bed766e435b5..1634319e2404 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -220,7 +220,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
 		rem = 0;
 	}
 
-	kevent->name = kmalloc(len + rem, GFP_KERNEL);
+	kevent->name = kmalloc(len + rem, GFP_NOFS);
 	if (unlikely(!kevent->name)) {
 		kmem_cache_free(event_cachep, kevent);
 		return NULL;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd4..5a9e34475e37 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-		const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'), cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0a..82c5085559c6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 			goto em_put_err_out;
 		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
 				le16_to_cpu(al_entry->length));
-		if (le32_to_cpu(al_entry->type) >
-				const_le32_to_cpu(AT_DATA))
+		if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
 			goto em_put_err_out;
 		if (AT_DATA != al_entry->type)
 			continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328eceb..50931b1ce4b9 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
 
 #include "types.h"
 
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x) __constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x) __constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x) __constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x) __constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x) __constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x) __constant_cpu_to_le64(x)
-
 /* The NTFS oem_id "NTFS " */
-#define magicNTFS const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS cpu_to_le64(0x202020205346544eULL)
 
 /*
  * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
  */
 enum {
 	/* Found in $MFT/$DATA. */
-	magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
 
 	/* Found in $LogFile/$DATA. */
-	magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
-	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
 						transfer was detected. */
 	/*
 	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
 	 * thus not initialized. Page must be initialized before using it.
 	 */
-	magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 
 typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
  * information about the mft record in which they are present.
  */
 enum {
-	MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+	MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 
 typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
  * Note: The _LE versions will return a CPU endian formatted value!
  */
 #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
  * a revealing choice of symbol I do not know what is... (-;
  */
 enum {
-	AT_UNUSED = const_cpu_to_le32( 0),
-	AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10),
-	AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20),
-	AT_FILE_NAME = const_cpu_to_le32( 0x30),
-	AT_OBJECT_ID = const_cpu_to_le32( 0x40),
-	AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50),
-	AT_VOLUME_NAME = const_cpu_to_le32( 0x60),
-	AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70),
-	AT_DATA = const_cpu_to_le32( 0x80),
-	AT_INDEX_ROOT = const_cpu_to_le32( 0x90),
-	AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0),
-	AT_BITMAP = const_cpu_to_le32( 0xb0),
-	AT_REPARSE_POINT = const_cpu_to_le32( 0xc0),
-	AT_EA_INFORMATION = const_cpu_to_le32( 0xd0),
-	AT_EA = const_cpu_to_le32( 0xe0),
-	AT_PROPERTY_SET = const_cpu_to_le32( 0xf0),
-	AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000),
-	AT_END = const_cpu_to_le32(0xffffffff)
+	AT_UNUSED = cpu_to_le32( 0),
+	AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
+	AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
+	AT_FILE_NAME = cpu_to_le32( 0x30),
+	AT_OBJECT_ID = cpu_to_le32( 0x40),
+	AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
+	AT_VOLUME_NAME = cpu_to_le32( 0x60),
+	AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
+	AT_DATA = cpu_to_le32( 0x80),
+	AT_INDEX_ROOT = cpu_to_le32( 0x90),
+	AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
+	AT_BITMAP = cpu_to_le32( 0xb0),
+	AT_REPARSE_POINT = cpu_to_le32( 0xc0),
+	AT_EA_INFORMATION = cpu_to_le32( 0xd0),
+	AT_EA = cpu_to_le32( 0xe0),
+	AT_PROPERTY_SET = cpu_to_le32( 0xf0),
+	AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
+	AT_END = cpu_to_le32(0xffffffff)
 };
 
 typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
  * equal then the second le32 values would be compared, etc.
  */
 enum {
-	COLLATION_BINARY = const_cpu_to_le32(0x00),
-	COLLATION_FILE_NAME = const_cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID = const_cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13),
+	COLLATION_BINARY = cpu_to_le32(0x00),
+	COLLATION_FILE_NAME = cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID = cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
 };
 
 typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
  * NT4.
  */
 enum {
-	ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be
+	ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
 			indexed. */
-	ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type
+	ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
 			can be present multiple times in the
 			mft records of an inode. */
-	ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value
+	ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
 			must contain at least one non-zero
 			byte. */
-	ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be
+	ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
 			indexed and the attribute value must be
 			unique for the attribute type in all of
 			the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be
+	ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
 			named and the name must be unique for
 			the attribute type in all of the mft
 			records of an inode. */
-	ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be
+	ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
 			resident. */
-	ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log
+	ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
 			modifications to this attribute,
 			regardless of whether it is resident or
 			non-resident. Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
  * Attribute flags (16-bit).
  */
 enum {
-	ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+	ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
 			mask. Also, first
 			illegal value. */
-	ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE = const_cpu_to_le16(0x8000),
+	ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 
 typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
  * flags appear in all of the above.
  */
 enum {
-	FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
+	FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
 
-	FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010),
+	FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
 	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
 	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080),
+	FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
 
-	FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800),
+	FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
 
-	FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000),
+	FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
 
-	FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7),
+	FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
 	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
 	   FILE_ATTR_DEVICE and preserves everything else. This mask is used
 	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7),
+	FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
 	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
@@ -846,11 +835,11 @@ enum {
 	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
 	 * attribute of an mft record.
 	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000),
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000),
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
 	   index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
 	/* Specific rights for files and directories are as follows: */
 
 	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA = const_cpu_to_le32(0x00000001),
+	FILE_READ_DATA = cpu_to_le32(0x00000001),
 	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001),
+	FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
 
 	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA = const_cpu_to_le32(0x00000002),
+	FILE_WRITE_DATA = cpu_to_le32(0x00000002),
 	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE = const_cpu_to_le32(0x00000002),
+	FILE_ADD_FILE = cpu_to_le32(0x00000002),
 
 	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA = const_cpu_to_le32(0x00000004),
+	FILE_APPEND_DATA = cpu_to_le32(0x00000004),
 	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004),
+	FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
 
 	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA = const_cpu_to_le32(0x00000008),
+	FILE_READ_EA = cpu_to_le32(0x00000008),
 
 	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA = const_cpu_to_le32(0x00000010),
+	FILE_WRITE_EA = cpu_to_le32(0x00000010),
 
 	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE = const_cpu_to_le32(0x00000020),
+	FILE_EXECUTE = cpu_to_le32(0x00000020),
 	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE = const_cpu_to_le32(0x00000020),
+	FILE_TRAVERSE = cpu_to_le32(0x00000020),
 
 	/*
 	 * Right to delete a directory and all the files it contains (its
 	 * children), even if the files are read-only. (DIRECTORY)
 	 */
-	FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040),
+	FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
 
 	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080),
+	FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
 
 	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100),
+	FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
 
 	/*
 	 * The standard rights (bits 16 to 23). These are independent of the
@@ -1489,27 +1478,27 @@ enum {
 	 */
 
 	/* Right to delete the object. */
-	DELETE = const_cpu_to_le32(0x00010000),
+	DELETE = cpu_to_le32(0x00010000),
 
 	/*
 	 * Right to read the information in the object's security descriptor,
 	 * not including the information in the SACL, i.e. right to read the
 	 * security descriptor and owner.
 	 */
-	READ_CONTROL = const_cpu_to_le32(0x00020000),
+	READ_CONTROL = cpu_to_le32(0x00020000),
 
 	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC = const_cpu_to_le32(0x00040000),
+	WRITE_DAC = cpu_to_le32(0x00040000),
 
 	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER = const_cpu_to_le32(0x00080000),
+	WRITE_OWNER = cpu_to_le32(0x00080000),
 
 	/*
 	 * Right to use the object for synchronization. Enables a process to
 	 * wait until the object is in the signalled state. Some object types
 	 * do not support this access right.
 	 */
-	SYNCHRONIZE = const_cpu_to_le32(0x00100000),
+	SYNCHRONIZE = cpu_to_le32(0x00100000),
 
 	/*
 	 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
 	 */
 
 	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
 
 	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000),
+	STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
 
 	/*
 	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
 	 * SYNCHRONIZE access.
 	 */
-	STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000),
+	STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
 
 	/*
 	 * The access system ACL and maximum allowed access types (bits 24 to
 	 * 25, bits 26 to 27 are reserved).
 	 */
-	ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000),
+	ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
 
 	/*
 	 * The generic rights (bits 28 to 31). These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
 	 */
 
 	/* Read, write, and execute access. */
-	GENERIC_ALL = const_cpu_to_le32(0x10000000),
+	GENERIC_ALL = cpu_to_le32(0x10000000),
 
 	/* Execute access. */
-	GENERIC_EXECUTE = const_cpu_to_le32(0x20000000),
+	GENERIC_EXECUTE = cpu_to_le32(0x20000000),
 
 	/*
 	 * Write access. For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
 	 * For directories, the mapping has the same numerical value. See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_WRITE = const_cpu_to_le32(0x40000000),
+	GENERIC_WRITE = cpu_to_le32(0x40000000),
 
 	/*
 	 * Read access. For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
 	 * For directories, the mapping has the same numerical value. See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_READ = const_cpu_to_le32(0x80000000),
+	GENERIC_READ = cpu_to_le32(0x80000000),
 };
 
 typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
  * The object ACE flags (32-bit).
  */
 enum {
-	ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2),
+	ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
 };
 
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
  * expressed as offsets from the beginning of the security descriptor.
  */
 enum {
-	SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002),
-	SE_DACL_PRESENT = const_cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008),
+	SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
+	SE_DACL_PRESENT = cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
 
-	SE_SACL_PRESENT = const_cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020),
+	SE_SACL_PRESENT = cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
 
-	SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800),
+	SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
 
-	SE_DACL_PROTECTED = const_cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED = const_cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE = const_cpu_to_le16(0x8000)
+	SE_DACL_PROTECTED = cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED = cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE = cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
  * Possible flags for the volume (16-bit).
  */
 enum {
-	VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008),
+	VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
 
-	VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020),
+	VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
 
-	VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000),
+	VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f),
+	VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027),
+	VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
  * The user quota flags. Names explain meaning.
  */
 enum {
-	QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004),
+	QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
 
-	QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007),
+	QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
 	/* This is a bit mask for the user quota flags. */
 
 	/*
 	 * These flags are only present in the quota defaults index entry, i.e.
 	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
 	 */
-	QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080),
+	QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
 
-	QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800),
+	QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
 };
 
 typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
2172 * Predefined owner_id values (32-bit). 2161 * Predefined owner_id values (32-bit).
2173 */ 2162 */
2174enum { 2163enum {
2175 QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), 2164 QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
2176 QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), 2165 QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
2177 QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), 2166 QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
2178}; 2167};
2179 2168
2180/* 2169/*
@@ -2189,14 +2178,14 @@ typedef enum {
2189 * Index entry flags (16-bit). 2178 * Index entry flags (16-bit).
2190 */ 2179 */
2191enum { 2180enum {
2192 INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a 2181 INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
2193 sub-node, i.e. a reference to an index block in form of 2182 sub-node, i.e. a reference to an index block in form of
2194 a virtual cluster number (see below). */ 2183 a virtual cluster number (see below). */
2195 INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last 2184 INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
2196 entry in an index block. The index entry does not 2185 entry in an index block. The index entry does not
2197 represent a file but it can point to a sub-node. */ 2186 represent a file but it can point to a sub-node. */
2198 2187
2199 INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force 2188 INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
2200 enum bit width to 16-bit. */ 2189 enum bit width to 16-bit. */
2201} __attribute__ ((__packed__)); 2190} __attribute__ ((__packed__));
2202 2191
@@ -2334,26 +2323,26 @@ typedef struct {
2334 * These are the predefined reparse point tags: 2323 * These are the predefined reparse point tags:
2335 */ 2324 */
2336enum { 2325enum {
2337 IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), 2326 IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
2338 IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), 2327 IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
2339 IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), 2328 IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
2340 2329
2341 IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), 2330 IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
2342 IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), 2331 IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
2343 IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), 2332 IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
2344 2333
2345 IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), 2334 IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
2346 IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), 2335 IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
2347 IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), 2336 IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
2348 IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), 2337 IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
2349 2338
2350 IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), 2339 IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
2351 2340
2352 IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), 2341 IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
2353 2342
2354 IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), 2343 IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
2355 2344
2356 IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), 2345 IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
2357}; 2346};
2358 2347
2359/* 2348/*
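
The fs/ntfs hunks above are mechanical: the driver's private const_cpu_to_le16()/const_cpu_to_le32() wrappers are dropped because the generic cpu_to_le16()/cpu_to_le32() are safe in constant expressions such as the enum initializers here - the underlying byteswap helpers dispatch on __builtin_constant_p(). A simplified user-space rendition of that dispatch (the real macros live in the kernel's swab headers and differ in detail):

    #include <stdio.h>

    static unsigned short fswab16(unsigned short x)     /* runtime fallback */
    {
            return (unsigned short)((x << 8) | (x >> 8));
    }

    #define constant_swab16(x) ((unsigned short)(               \
            (((unsigned short)(x) & 0x00ffU) << 8) |            \
            (((unsigned short)(x) & 0xff00U) >> 8)))

    /* Constant-folds in constant expressions, calls the helper otherwise. */
    #define swab16(x)                                           \
            (__builtin_constant_p((unsigned short)(x)) ?        \
             constant_swab16(x) :                               \
             fswab16(x))

    /* Usable as an enum initializer - the whole point of the cleanup: */
    enum { DEMO_FLAG = swab16(0x0800) };

    int main(void)
    {
            printf("0x%04x\n", DEMO_FLAG);  /* prints 0x0008 */
            return 0;
    }

On little-endian hosts cpu_to_le16() is an identity cast; on big-endian it expands to a swap like the one above, and the constant branch is what lets it appear in these enums.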
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae3..b5a6f08bd35c 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
104 * in this particular client array. Also inside the client records themselves, 104 * in this particular client array. Also inside the client records themselves,
105 * this means that there are no client records preceding or following this one. 105 * this means that there are no client records preceding or following this one.
106 */ 106 */
107#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) 107#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
108#define LOGFILE_NO_CLIENT_CPU 0xffff 108#define LOGFILE_NO_CLIENT_CPU 0xffff
109 109
110/* 110/*
@@ -112,8 +112,8 @@ typedef struct {
112 * information about the log file in which they are present. 112 * information about the log file in which they are present.
113 */ 113 */
114enum { 114enum {
115 RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), 115 RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
116 RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ 116 RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
117} __attribute__ ((__packed__)); 117} __attribute__ ((__packed__));
118 118
119typedef le16 RESTART_AREA_FLAGS; 119typedef le16 RESTART_AREA_FLAGS;
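
A side note on the __attribute__ ((__packed__)) and the 0xffff space-filler entries that keep reappearing in these enums (RESTART_SPACE_FILLER above, INDEX_ENTRY_SPACE_FILLER earlier): together they pin the enum's storage to exactly 16 bits, so the typedef'd flag fields overlay the on-disk layout. A stand-alone illustration (not kernel code):

    #include <stdio.h>

    enum demo_flags {
            DEMO_VOLUME_IS_CLEAN = 0x0002,
            DEMO_SPACE_FILLER    = 0xffff,  /* forces a 16-bit underlying type */
    } __attribute__ ((__packed__));

    int main(void)
    {
            /* prints 2 with gcc; without the filler, packed could shrink it to 1 */
            printf("sizeof(enum demo_flags) = %zu\n", sizeof(enum demo_flags));
            return 0;
    }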
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc35..23bf68453d7d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2839 */ 2839 */
2840 2840
2841 /* Mark the mft record as not in use. */ 2841 /* Mark the mft record as not in use. */
2842 m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); 2842 m->flags &= ~MFT_RECORD_IN_USE;
2843 2843
2844 /* Increment the sequence number, skipping zero, if it is not zero. */ 2844 /* Increment the sequence number, skipping zero, if it is not zero. */
2845 old_seq_no = m->sequence_number; 2845 old_seq_no = m->sequence_number;
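
The mft.c change is more than tidying of the const_ helpers: it relies on bitwise complement commuting with byte swapping, i.e. ~swab16(x) == swab16(~x), so a little-endian flags word can be masked with ~MFT_RECORD_IN_USE directly, with no round trip through CPU byte order. A quick user-space check of the identity:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t swab16(uint16_t x)
    {
            return (uint16_t)((x << 8) | (x >> 8));
    }

    int main(void)
    {
            uint16_t flag = 0x0001;                /* MFT_RECORD_IN_USE, CPU order */
            uint16_t a = (uint16_t)~swab16(flag);  /* complement the le16 value */
            uint16_t b = swab16((uint16_t)~flag);  /* byte-swap the complement */

            printf("0x%04x 0x%04x %s\n", a, b, a == b ? "equal" : "differ");
            return 0;
    }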
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b5077..f76951dcd4a6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
618 * many BIOSes will refuse to boot from a bootsector if the magic is 618 * many BIOSes will refuse to boot from a bootsector if the magic is
619 * incorrect, so we emit a warning. 619 * incorrect, so we emit a warning.
620 */ 620 */
621 if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) 621 if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
622 ntfs_warning(sb, "Invalid end of sector marker."); 622 ntfs_warning(sb, "Invalid end of sector marker.");
623 return true; 623 return true;
624not_ntfs: 624not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1242 u32 *kaddr, *kend; 1242 u32 *kaddr, *kend;
1243 ntfs_name *name = NULL; 1243 ntfs_name *name = NULL;
1244 int ret = 1; 1244 int ret = 1;
1245 static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), 1245 static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
1246 const_cpu_to_le16('i'), const_cpu_to_le16('b'), 1246 cpu_to_le16('i'), cpu_to_le16('b'),
1247 const_cpu_to_le16('e'), const_cpu_to_le16('r'), 1247 cpu_to_le16('e'), cpu_to_le16('r'),
1248 const_cpu_to_le16('f'), const_cpu_to_le16('i'), 1248 cpu_to_le16('f'), cpu_to_le16('i'),
1249 const_cpu_to_le16('l'), const_cpu_to_le16('.'), 1249 cpu_to_le16('l'), cpu_to_le16('.'),
1250 const_cpu_to_le16('s'), const_cpu_to_le16('y'), 1250 cpu_to_le16('s'), cpu_to_le16('y'),
1251 const_cpu_to_le16('s'), 0 }; 1251 cpu_to_le16('s'), 0 };
1252 1252
1253 ntfs_debug("Entering."); 1253 ntfs_debug("Entering.");
1254 /* 1254 /*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1296 goto iput_out; 1296 goto iput_out;
1297 } 1297 }
1298 kaddr = (u32*)page_address(page); 1298 kaddr = (u32*)page_address(page);
1299 if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { 1299 if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " 1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
1301 "hibernated on the volume. This is the " 1301 "hibernated on the volume. This is the "
1302 "system volume."); 1302 "system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
1337 MFT_REF mref; 1337 MFT_REF mref;
1338 struct inode *tmp_ino; 1338 struct inode *tmp_ino;
1339 ntfs_name *name = NULL; 1339 ntfs_name *name = NULL;
1340 static const ntfschar Quota[7] = { const_cpu_to_le16('$'), 1340 static const ntfschar Quota[7] = { cpu_to_le16('$'),
1341 const_cpu_to_le16('Q'), const_cpu_to_le16('u'), 1341 cpu_to_le16('Q'), cpu_to_le16('u'),
1342 const_cpu_to_le16('o'), const_cpu_to_le16('t'), 1342 cpu_to_le16('o'), cpu_to_le16('t'),
1343 const_cpu_to_le16('a'), 0 }; 1343 cpu_to_le16('a'), 0 };
1344 static ntfschar Q[3] = { const_cpu_to_le16('$'), 1344 static ntfschar Q[3] = { cpu_to_le16('$'),
1345 const_cpu_to_le16('Q'), 0 }; 1345 cpu_to_le16('Q'), 0 };
1346 1346
1347 ntfs_debug("Entering."); 1347 ntfs_debug("Entering.");
1348 /* 1348 /*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
1416 struct page *page; 1416 struct page *page;
1417 ntfs_name *name = NULL; 1417 ntfs_name *name = NULL;
1418 USN_HEADER *uh; 1418 USN_HEADER *uh;
1419 static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), 1419 static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
1420 const_cpu_to_le16('U'), const_cpu_to_le16('s'), 1420 cpu_to_le16('U'), cpu_to_le16('s'),
1421 const_cpu_to_le16('n'), const_cpu_to_le16('J'), 1421 cpu_to_le16('n'), cpu_to_le16('J'),
1422 const_cpu_to_le16('r'), const_cpu_to_le16('n'), 1422 cpu_to_le16('r'), cpu_to_le16('n'),
1423 const_cpu_to_le16('l'), 0 }; 1423 cpu_to_le16('l'), 0 };
1424 static ntfschar Max[5] = { const_cpu_to_le16('$'), 1424 static ntfschar Max[5] = { cpu_to_le16('$'),
1425 const_cpu_to_le16('M'), const_cpu_to_le16('a'), 1425 cpu_to_le16('M'), cpu_to_le16('a'),
1426 const_cpu_to_le16('x'), 0 }; 1426 cpu_to_le16('x'), 0 };
1427 static ntfschar J[3] = { const_cpu_to_le16('$'), 1427 static ntfschar J[3] = { cpu_to_le16('$'),
1428 const_cpu_to_le16('J'), 0 }; 1428 cpu_to_le16('J'), 0 };
1429 1429
1430 ntfs_debug("Entering."); 1430 ntfs_debug("Entering.");
1431 /* 1431 /*
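
The verbose character-by-character constants above (hiberfil, $Quota, $UsnJrnl, ...) exist because ntfschar is le16: NTFS stores names as UTF-16LE, and C has no portable UTF-16LE string literal, so each code unit gets its own cpu_to_le16(). A small demonstration that the per-character form produces the on-disk byte sequence (to_le16() here is a user-space stand-in for the kernel macro):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static uint16_t to_le16(uint16_t x)
    {
            uint8_t b[2] = { (uint8_t)x, (uint8_t)(x >> 8) };
            uint16_t v;

            memcpy(&v, b, sizeof(v));   /* little-endian on any host */
            return v;
    }

    int main(void)
    {
            uint16_t name[3] = { to_le16('$'), to_le16('Q'), 0 };
            const uint8_t ondisk[4] = { '$', 0, 'Q', 0 };

            printf("%s\n", memcmp(name, ondisk, sizeof(ondisk)) ? "mismatch"
                                                                : "match");
            return 0;
    }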
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac327..00d8e6bd7c36 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
116 * documentation: http://www.linux-ntfs.org/ 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
120 USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), 120 USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
121 USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), 121 USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
122 USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), 122 USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
123 USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), 123 USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
124 USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), 124 USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
125 USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), 125 USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
126 USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), 126 USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
127 USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), 127 USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
128 USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), 128 USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
129 USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), 129 USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
130 USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), 130 USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
131 USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), 131 USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
132 USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), 132 USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
133 USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), 133 USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
134 USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), 134 USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
135 USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), 135 USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
136 USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), 136 USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
137 USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), 137 USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
138 USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), 138 USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
139 USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), 139 USN_REASON_CLOSE = cpu_to_le32(0x80000000),
140}; 140};
141 141
142typedef le32 USN_REASON_FLAGS; 142typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
148 * http://www.linux-ntfs.org/ 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
152 USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), 152 USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
153 USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), 153 USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
154}; 154};
155 155
156typedef le32 USN_SOURCE_INFO_FLAGS; 156typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
296 return PTR_ERR(acl); 296 return PTR_ERR(acl);
297 } 297 }
298 if (!acl) 298 if (!acl)
299 inode->i_mode &= ~current->fs->umask; 299 inode->i_mode &= ~current_umask();
300 } 300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 302 struct posix_acl *clone;
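
The acl.c hunk is part of a tree-wide cleanup hiding current->fs->umask behind a current_umask() helper (presumably a trivial accessor). The rule being implemented is the usual POSIX one: when the parent directory carries no default ACL, the process umask masks the requested mode; when a default ACL exists, the ACL takes over that job. In user-space miniature (values made up):

    #include <stdio.h>

    int main(void)
    {
            unsigned int mode = 0666;       /* mode requested at create time */
            unsigned int umask_bits = 022;  /* hypothetical process umask */

            mode &= ~umask_bits;            /* what ~current_umask() does */
            printf("resulting mode: %o\n", mode);   /* prints 644 */
            return 0;
    }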
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295}; 295};
296 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
297static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298 struct inode *inode, 347 struct inode *inode,
299 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
340} 389}
341 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
342static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
344{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1956} 1956}
1957 1957
1958const struct address_space_operations ocfs2_aops = { 1958const struct address_space_operations ocfs2_aops = {
1959 .readpage = ocfs2_readpage, 1959 .readpage = ocfs2_readpage,
1960 .readpages = ocfs2_readpages, 1960 .readpages = ocfs2_readpages,
1961 .writepage = ocfs2_writepage, 1961 .writepage = ocfs2_writepage,
1962 .write_begin = ocfs2_write_begin, 1962 .write_begin = ocfs2_write_begin,
1963 .write_end = ocfs2_write_end, 1963 .write_end = ocfs2_write_end,
1964 .bmap = ocfs2_bmap, 1964 .bmap = ocfs2_bmap,
1965 .sync_page = block_sync_page, 1965 .sync_page = block_sync_page,
1966 .direct_IO = ocfs2_direct_IO, 1966 .direct_IO = ocfs2_direct_IO,
1967 .invalidatepage = ocfs2_invalidatepage, 1967 .invalidatepage = ocfs2_invalidatepage,
1968 .releasepage = ocfs2_releasepage, 1968 .releasepage = ocfs2_releasepage,
1969 .migratepage = buffer_migrate_page, 1969 .migratepage = buffer_migrate_page,
1970 .is_partially_uptodate = block_is_partially_uptodate,
1970}; 1971};
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
33#include <linux/random.h> 33#include <linux/random.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h>
36 37
37#include "heartbeat.h" 38#include "heartbeat.h"
38#include "tcp.h" 39#include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 61static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 62static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62 63
64#define O2HB_DEBUG_DIR "o2hb"
65#define O2HB_DEBUG_LIVENODES "livenodes"
66static struct dentry *o2hb_debug_dir;
67static struct dentry *o2hb_debug_livenodes;
68
63static LIST_HEAD(o2hb_all_regions); 69static LIST_HEAD(o2hb_all_regions);
64 70
65static struct o2hb_callback { 71static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
905 return 0; 911 return 0;
906} 912}
907 913
908void o2hb_init(void) 914#ifdef CONFIG_DEBUG_FS
915static int o2hb_debug_open(struct inode *inode, struct file *file)
916{
917 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
918 char *buf = NULL;
919 int i = -1;
920 int out = 0;
921
922 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
923 if (!buf)
924 goto bail;
925
926 o2hb_fill_node_map(map, sizeof(map));
927
928 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
929 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
930 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
931
932 i_size_write(inode, out);
933
934 file->private_data = buf;
935
936 return 0;
937bail:
938 return -ENOMEM;
939}
940
941static int o2hb_debug_release(struct inode *inode, struct file *file)
942{
943 kfree(file->private_data);
944 return 0;
945}
946
947static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
948 size_t nbytes, loff_t *ppos)
949{
950 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
951 i_size_read(file->f_mapping->host));
952}
953#else
954static int o2hb_debug_open(struct inode *inode, struct file *file)
955{
956 return 0;
957}
958static int o2hb_debug_release(struct inode *inode, struct file *file)
959{
960 return 0;
961}
962static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
963 size_t nbytes, loff_t *ppos)
964{
965 return 0;
966}
967#endif /* CONFIG_DEBUG_FS */
968
969static struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read,
973 .llseek = generic_file_llseek,
974};
975
976void o2hb_exit(void)
977{
978 if (o2hb_debug_livenodes)
979 debugfs_remove(o2hb_debug_livenodes);
980 if (o2hb_debug_dir)
981 debugfs_remove(o2hb_debug_dir);
982}
983
984int o2hb_init(void)
909{ 985{
910 int i; 986 int i;
911 987
@@ -918,6 +994,24 @@ void o2hb_init(void)
918 INIT_LIST_HEAD(&o2hb_node_events); 994 INIT_LIST_HEAD(&o2hb_node_events);
919 995
920 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 996 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
997
998 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
999 if (!o2hb_debug_dir) {
1000 mlog_errno(-ENOMEM);
1001 return -ENOMEM;
1002 }
1003
1004 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1005 S_IFREG|S_IRUSR,
1006 o2hb_debug_dir, NULL,
1007 &o2hb_debug_fops);
1008 if (!o2hb_debug_livenodes) {
1009 mlog_errno(-ENOMEM);
1010 debugfs_remove(o2hb_debug_dir);
1011 return -ENOMEM;
1012 }
1013
1014 return 0;
921} 1015}
922 1016
923/* if we're already in a callback then we're already serialized by the sem */ 1017/* if we're already in a callback then we're already serialized by the sem */
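
o2hb_debug_open() above uses a common debugfs idiom: snapshot the state into a buffer at open time, publish its length with i_size_write(), serve reads from the buffer with simple_read_from_buffer(), and free it at release. The formatting loop itself is just a walk over the live-node bitmap; a user-space miniature of that walk (node numbers made up):

    #include <stdio.h>

    #define MAX_NODES 255

    int main(void)
    {
            unsigned char map[MAX_NODES / 8 + 1] = { 0 };
            int i;

            map[0] |= 1 << 3;       /* pretend node 3 is heartbeating */
            map[1] |= 1 << 1;       /* ... and node 9 */

            for (i = 0; i < MAX_NODES; i++)
                    if (map[i / 8] & (1 << (i % 8)))
                            printf("%d ", i);
            printf("\n");           /* prints: 3 9 */
            return 0;
    }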
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc); 75 struct o2hb_callback_func *hc);
76void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
77 unsigned bytes); 77 unsigned bytes);
78void o2hb_init(void); 78void o2hb_exit(void);
79int o2hb_init(void);
79int o2hb_check_node_heartbeating(u8 node_num); 80int o2hb_check_node_heartbeating(u8 node_num);
80int o2hb_check_node_heartbeating_from_callback(u8 node_num); 81int o2hb_check_node_heartbeating_from_callback(u8 node_num);
81int o2hb_check_local_node_heartbeating(void); 82int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
881 o2cb_sys_shutdown(); 881 o2cb_sys_shutdown();
882 882
883 o2net_exit(); 883 o2net_exit();
884 o2hb_exit();
884} 885}
885 886
886static int __init init_o2nm(void) 887static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
889 890
890 cluster_print_version(); 891 cluster_print_version();
891 892
892 o2hb_init(); 893 ret = o2hb_init();
894 if (ret)
895 goto out;
893 896
894 ret = o2net_init(); 897 ret = o2net_init();
895 if (ret) 898 if (ret)
896 goto out; 899 goto out_o2hb;
897 900
898 ret = o2net_register_hb_callbacks(); 901 ret = o2net_register_hb_callbacks();
899 if (ret) 902 if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
916 o2net_unregister_hb_callbacks(); 919 o2net_unregister_hb_callbacks();
917out_o2net: 920out_o2net:
918 o2net_exit(); 921 o2net_exit();
922out_o2hb:
923 o2hb_exit();
919out: 924out:
920 return ret; 925 return ret;
921} 926}
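
The init_o2nm() changes extend the kernel's standard unwind ladder: each init step that succeeds gains a matching label, so any later failure tears things down in exact reverse order, and the new out_o2hb label keeps o2hb_exit() paired with the now-fallible o2hb_init(). In miniature (stubs stand in for the real initializers):

    #include <stdio.h>

    static int o2hb_init_stub(void)  { return 0; }
    static void o2hb_exit_stub(void) { }
    static int o2net_init_stub(void) { return 0; }

    static int init_sketch(void)
    {
            int ret;

            ret = o2hb_init_stub();
            if (ret)
                    goto out;

            ret = o2net_init_stub();
            if (ret)
                    goto out_o2hb;  /* later failures unwind earlier steps */

            return 0;

    out_o2hb:
            o2hb_exit_stub();
    out:
            return ret;
    }

    int main(void)
    {
            printf("init: %d\n", init_sketch());
            return 0;
    }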
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7d604480557a..b574431a031d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -290,6 +290,21 @@ out_attach:
290 else 290 else
291 mlog_errno(ret); 291 mlog_errno(ret);
292 292
293 /*
294 * In case of error, manually free the allocation and do the iput().
295 * We need to do this because error here means no d_instantiate(),
296 * which means iput() will not be called during dput(dentry).
297 */
298 if (ret < 0 && !alias) {
299 ocfs2_lock_res_free(&dl->dl_lockres);
300 BUG_ON(dl->dl_count != 1);
301 spin_lock(&dentry_attach_lock);
302 dentry->d_fsdata = NULL;
303 spin_unlock(&dentry_attach_lock);
304 kfree(dl);
305 iput(inode);
306 }
307
293 dput(alias); 308 dput(alias);
294 309
295 return ret; 310 return ret;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..c5752305627c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
83 struct ocfs2_alloc_context *data_ac, 80 struct ocfs2_alloc_context *data_ac,
84 struct ocfs2_alloc_context *meta_ac, 81 struct ocfs2_alloc_context *meta_ac,
85 struct buffer_head **new_bh); 82 struct buffer_head **new_bh);
83static int ocfs2_dir_indexed(struct inode *inode);
86 84
87/* 85/*
88 * These are distinct checks because future versions of the file system will 86 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing. 87 * want to have a trailing dirent structure independent of indexing.
90 */ 88 */
91static int ocfs2_dir_has_trailer(struct inode *dir) 89static int ocfs2_supports_dir_trailer(struct inode *dir)
92{ 90{
91 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
92
93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
94 return 0; 94 return 0;
95 95
96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
97} 97}
98 98
99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99/*
100 * "new' here refers to the point at which we're creating a new
101 * directory via "mkdir()", but also when we're expanding an inline
102 * directory. In either case, we don't yet have the indexing bit set
103 * on the directory, so the standard checks will fail in when metaecc
104 * is turned off. Only directory-initialization type functions should
105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
106 */
107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
100{ 108{
101 return ocfs2_meta_ecc(osb); 109 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
110
111 return ocfs2_meta_ecc(osb) ||
112 ocfs2_supports_indexed_dirs(osb);
102} 113}
103 114
104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
130{ 141{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 142 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
132 143
133 if (!ocfs2_dir_has_trailer(dir)) 144 if (!ocfs2_supports_dir_trailer(dir))
134 return 0; 145 return 0;
135 146
136 if (offset != toff) 147 if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
140} 151}
141 152
142static void ocfs2_init_dir_trailer(struct inode *inode, 153static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh) 154 struct buffer_head *bh, u16 rec_len)
144{ 155{
145 struct ocfs2_dir_block_trailer *trailer; 156 struct ocfs2_dir_block_trailer *trailer;
146 157
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 161 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 162 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 163 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
164 trailer->db_free_rec_len = cpu_to_le16(rec_len);
165}
166/*
167 * Link an unindexed block with a dir trailer structure into the index free
168 * list. This function will modify dirdata_bh, but assumes you've already
169 * passed it to the journal.
170 */
171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
172 struct buffer_head *dx_root_bh,
173 struct buffer_head *dirdata_bh)
174{
175 int ret;
176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer;
178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) {
182 mlog_errno(ret);
183 goto out;
184 }
185 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
186 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
187
188 trailer->db_free_next = dx_root->dr_free_blk;
189 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
190
191 ocfs2_journal_dirty(handle, dx_root_bh);
192
193out:
194 return ret;
195}
196
197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
198{
199 return res->dl_prev_leaf_bh == NULL;
200}
201
202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
203{
204 brelse(res->dl_dx_root_bh);
205 brelse(res->dl_leaf_bh);
206 brelse(res->dl_dx_leaf_bh);
207 brelse(res->dl_prev_leaf_bh);
208}
209
210static int ocfs2_dir_indexed(struct inode *inode)
211{
212 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
213 return 1;
214 return 0;
215}
216
217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
218{
219 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
220}
221
222/*
223 * Hashing code adapted from ext3
224 */
225#define DELTA 0x9E3779B9
226
227static void TEA_transform(__u32 buf[4], __u32 const in[])
228{
229 __u32 sum = 0;
230 __u32 b0 = buf[0], b1 = buf[1];
231 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
232 int n = 16;
233
234 do {
235 sum += DELTA;
236 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
237 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
238 } while (--n);
239
240 buf[0] += b0;
241 buf[1] += b1;
242}
243
244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
245{
246 __u32 pad, val;
247 int i;
248
249 pad = (__u32)len | ((__u32)len << 8);
250 pad |= pad << 16;
251
252 val = pad;
253 if (len > num*4)
254 len = num * 4;
255 for (i = 0; i < len; i++) {
256 if ((i % 4) == 0)
257 val = pad;
258 val = msg[i] + (val << 8);
259 if ((i % 4) == 3) {
260 *buf++ = val;
261 val = pad;
262 num--;
263 }
264 }
265 if (--num >= 0)
266 *buf++ = val;
267 while (--num >= 0)
268 *buf++ = pad;
269}
270
271static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
272 struct ocfs2_dx_hinfo *hinfo)
273{
274 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
275 const char *p;
276 __u32 in[8], buf[4];
277
278 /*
279 * XXX: Is this really necessary, if the index is never looked
280 * at by readdir? Is a hash value of '0' a bad idea?
281 */
282 if ((len == 1 && !strncmp(".", name, 1)) ||
283 (len == 2 && !strncmp("..", name, 2))) {
284 buf[0] = buf[1] = 0;
285 goto out;
286 }
287
288#ifdef OCFS2_DEBUG_DX_DIRS
289 /*
290 * This makes it very easy to debug indexing problems. We
291 * should never allow this to be selected without hand editing
292 * this file though.
293 */
294 buf[0] = buf[1] = len;
295 goto out;
296#endif
297
298 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
299
300 p = name;
301 while (len > 0) {
302 str2hashbuf(p, len, in, 4);
303 TEA_transform(buf, in);
304 len -= 16;
305 p += 16;
306 }
307
308out:
309 hinfo->major_hash = buf[0];
310 hinfo->minor_hash = buf[1];
153} 311}
154 312
155/* 313/*
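
The hash above is a cut-down TEA (Tiny Encryption Algorithm) mix borrowed from ext3's htree: str2hashbuf() packs the name into 32-bit words with length-derived padding, and each 16-byte chunk is folded through 16 TEA rounds, seeded per volume from osb_dx_seed. The pipeline runs unchanged in user space; the sketch below lifts the two helpers verbatim from this hunk and substitutes a made-up seed for the superblock's:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define DELTA 0x9E3779B9

    static void TEA_transform(uint32_t buf[4], const uint32_t in[])
    {
            uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
            uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
            int n = 16;

            do {
                    sum += DELTA;
                    b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
                    b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
            } while (--n);

            buf[0] += b0;
            buf[1] += b1;
    }

    static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
    {
            uint32_t pad, val;
            int i;

            pad = (uint32_t)len | ((uint32_t)len << 8);
            pad |= pad << 16;

            val = pad;
            if (len > num * 4)
                    len = num * 4;
            for (i = 0; i < len; i++) {
                    if ((i % 4) == 0)
                            val = pad;
                    val = msg[i] + (val << 8);
                    if ((i % 4) == 3) {
                            *buf++ = val;
                            val = pad;
                            num--;
                    }
            }
            if (--num >= 0)
                    *buf++ = val;
            while (--num >= 0)
                    *buf++ = pad;
    }

    int main(void)
    {
            const char *name = "lost+found";            /* any name but "." and ".." */
            uint32_t in[8], buf[4] = { 1, 2, 3, 4 };    /* fake dx seed */
            int len = (int)strlen(name);
            const char *p = name;

            while (len > 0) {
                    str2hashbuf(p, len, in, 4);
                    TEA_transform(buf, in);
                    len -= 16;
                    p += 16;
            }
            printf("major=%u minor=0x%x\n", buf[0], buf[1]);
            return 0;
    }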
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
312} 470}
313 471
314/* 472/*
473 * Validate a directory trailer.
474 *
475 * We check the trailer here rather than in ocfs2_validate_dir_block()
476 * because that function doesn't have the inode to test.
477 */
478static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
479{
480 int rc = 0;
481 struct ocfs2_dir_block_trailer *trailer;
482
483 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
484 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
485 rc = -EINVAL;
486 ocfs2_error(dir->i_sb,
487 "Invalid dirblock #%llu: "
488 "signature = %.*s\n",
489 (unsigned long long)bh->b_blocknr, 7,
490 trailer->db_signature);
491 goto out;
492 }
493 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
494 rc = -EINVAL;
495 ocfs2_error(dir->i_sb,
496 "Directory block #%llu has an invalid "
497 "db_blkno of %llu",
498 (unsigned long long)bh->b_blocknr,
499 (unsigned long long)le64_to_cpu(trailer->db_blkno));
500 goto out;
501 }
502 if (le64_to_cpu(trailer->db_parent_dinode) !=
503 OCFS2_I(dir)->ip_blkno) {
504 rc = -EINVAL;
505 ocfs2_error(dir->i_sb,
506 "Directory block #%llu on dinode "
507 "#%llu has an invalid parent_dinode "
508 "of %llu",
509 (unsigned long long)bh->b_blocknr,
510 (unsigned long long)OCFS2_I(dir)->ip_blkno,
 511 (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
512 goto out;
513 }
514out:
515 return rc;
516}
517
518/*
315 * This function forces all errors to -EIO for consistency with its 519 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the 520 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with 521 * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
322{ 526{
323 int rc = 0; 527 int rc = 0;
324 struct buffer_head *tmp = *bh; 528 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326 529
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 530 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block); 531 ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
331 goto out; 534 goto out;
332 } 535 }
333 536
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) && 537 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) { 538 ocfs2_supports_dir_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 539 rc = ocfs2_check_dir_trailer(inode, tmp);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 540 if (rc) {
343 rc = -EINVAL; 541 if (!*bh)
344 ocfs2_error(inode->i_sb, 542 brelse(tmp);
345 "Invalid dirblock #%llu: " 543 mlog_errno(rc);
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out; 544 goto out;
371 } 545 }
372 } 546 }
@@ -379,6 +553,141 @@ out:
379 return rc ? -EIO : 0; 553 return rc ? -EIO : 0;
380} 554}
381 555
556/*
557 * Read the block at 'phys' which belongs to this directory
558 * inode. This function does no virtual->physical block translation -
559 * what's passed in is assumed to be a valid directory block.
560 */
561static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
562 struct buffer_head **bh)
563{
564 int ret;
565 struct buffer_head *tmp = *bh;
566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
568 if (ret) {
569 mlog_errno(ret);
570 goto out;
571 }
572
573 if (ocfs2_supports_dir_trailer(dir)) {
574 ret = ocfs2_check_dir_trailer(dir, tmp);
575 if (ret) {
576 if (!*bh)
577 brelse(tmp);
578 mlog_errno(ret);
579 goto out;
580 }
581 }
582
583 if (!ret && !*bh)
584 *bh = tmp;
585out:
586 return ret;
587}
588
589static int ocfs2_validate_dx_root(struct super_block *sb,
590 struct buffer_head *bh)
591{
592 int ret;
593 struct ocfs2_dx_root_block *dx_root;
594
595 BUG_ON(!buffer_uptodate(bh));
596
597 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
598
599 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
600 if (ret) {
601 mlog(ML_ERROR,
602 "Checksum failed for dir index root block %llu\n",
603 (unsigned long long)bh->b_blocknr);
604 return ret;
605 }
606
607 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
608 ocfs2_error(sb,
609 "Dir Index Root # %llu has bad signature %.*s",
610 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
611 7, dx_root->dr_signature);
612 return -EINVAL;
613 }
614
615 return 0;
616}
617
618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
619 struct buffer_head **dx_root_bh)
620{
621 int ret;
622 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh;
624
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
626
627 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh)
629 *dx_root_bh = tmp;
630
631 return ret;
632}
633
634static int ocfs2_validate_dx_leaf(struct super_block *sb,
635 struct buffer_head *bh)
636{
637 int ret;
638 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
639
640 BUG_ON(!buffer_uptodate(bh));
641
642 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
643 if (ret) {
644 mlog(ML_ERROR,
645 "Checksum failed for dir index leaf block %llu\n",
646 (unsigned long long)bh->b_blocknr);
647 return ret;
648 }
649
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
652 7, dx_leaf->dl_signature);
653 return -EROFS;
654 }
655
656 return 0;
657}
658
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
660 struct buffer_head **dx_leaf_bh)
661{
662 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh;
664
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
666
667 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh)
669 *dx_leaf_bh = tmp;
670
671 return ret;
672}
673
674/*
675 * Read a series of dx_leaf blocks. This expects all buffer_head
676 * pointers to be NULL on function entry.
677 */
678static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
679 struct buffer_head **dx_leaf_bhs)
680{
681 int ret;
682
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf);
685 if (ret)
686 mlog_errno(ret);
687
688 return ret;
689}
690
382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 691static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
383 struct inode *dir, 692 struct inode *dir,
384 struct ocfs2_dir_entry **res_dir) 693 struct ocfs2_dir_entry **res_dir)
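
ocfs2_read_dir_block_direct(), ocfs2_read_dx_root() and ocfs2_read_dx_leaf() above share one ownership convention: the caller may pass an existing buffer or a NULL slot; on success a freshly read buffer is handed back through the slot, and on failure the helper releases only buffers it allocated itself - the "if (!*bh) brelse(tmp)" dance. Reduced to a stand-alone sketch:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { char data[4096]; };

    static int fill_and_validate(struct buf *b)
    {
            (void)b;        /* pretend to read and checksum the block */
            return 0;
    }

    static int read_block(struct buf **slot)
    {
            struct buf *tmp = *slot;
            int ours = (tmp == NULL), ret;

            if (ours) {
                    tmp = malloc(sizeof(*tmp));
                    if (!tmp)
                            return -1;
            }

            ret = fill_and_validate(tmp);
            if (ret) {
                    if (ours)
                            free(tmp);      /* release only what we allocated */
                    return ret;
            }

            if (ours)
                    *slot = tmp;    /* hand the new buffer back to the caller */
            return 0;
    }

    int main(void)
    {
            struct buf *bh = NULL;

            printf("read: %d, bh %s\n", read_block(&bh),
                   bh ? "allocated" : "NULL");
            free(bh);
            return 0;
    }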
@@ -480,39 +789,340 @@ cleanup_and_exit:
480 return ret; 789 return ret;
481} 790}
482 791
792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
793 struct ocfs2_extent_list *el,
794 u32 major_hash,
795 u32 *ret_cpos,
796 u64 *ret_phys_blkno,
797 unsigned int *ret_clen)
798{
799 int ret = 0, i, found;
800 struct buffer_head *eb_bh = NULL;
801 struct ocfs2_extent_block *eb;
802 struct ocfs2_extent_rec *rec = NULL;
803
804 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
812 el = &eb->h_list;
813
814 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in "
817 "btree tree block %llu\n", inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out;
821 }
822 }
823
824 found = 0;
825 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
826 rec = &el->l_recs[i];
827
828 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
829 found = 1;
830 break;
831 }
832 }
833
834 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
836 "record (%u, %u, 0) in btree", inode->i_ino,
837 le32_to_cpu(rec->e_cpos),
838 ocfs2_rec_clusters(el, rec));
839 ret = -EROFS;
840 goto out;
841 }
842
843 if (ret_phys_blkno)
844 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
845 if (ret_cpos)
846 *ret_cpos = le32_to_cpu(rec->e_cpos);
847 if (ret_clen)
848 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
849
850out:
851 brelse(eb_bh);
852 return ret;
853}
854
855/*
856 * Returns the block index, from the start of the cluster which this
 857 * hash belongs to.
858 */
859static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
860 u32 minor_hash)
861{
862 return minor_hash & osb->osb_dx_mask;
863}
864
865static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
866 struct ocfs2_dx_hinfo *hinfo)
867{
868 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
869}
870
871static int ocfs2_dx_dir_lookup(struct inode *inode,
872 struct ocfs2_extent_list *el,
873 struct ocfs2_dx_hinfo *hinfo,
874 u32 *ret_cpos,
875 u64 *ret_phys_blkno)
876{
877 int ret = 0;
878 unsigned int cend, uninitialized_var(clen);
879 u32 uninitialized_var(cpos);
880 u64 uninitialized_var(blkno);
881 u32 name_hash = hinfo->major_hash;
882
883 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
884 &clen);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 cend = cpos + clen;
891 if (name_hash >= cend) {
892 /* We want the last cluster */
893 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
894 cpos += clen - 1;
895 } else {
896 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
897 name_hash - cpos);
898 cpos = name_hash;
899 }
900
901 /*
902 * We now have the cluster which should hold our entry. To
903 * find the exact block from the start of the cluster to
904 * search, we take the lower bits of the hash.
905 */
906 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
907
908 if (ret_phys_blkno)
909 *ret_phys_blkno = blkno;
910 if (ret_cpos)
911 *ret_cpos = cpos;
912
913out:
914
915 return ret;
916}
917
918static int ocfs2_dx_dir_search(const char *name, int namelen,
919 struct inode *dir,
920 struct ocfs2_dx_root_block *dx_root,
921 struct ocfs2_dir_lookup_result *res)
922{
923 int ret, i, found;
924 u64 uninitialized_var(phys);
925 struct buffer_head *dx_leaf_bh = NULL;
926 struct ocfs2_dx_leaf *dx_leaf;
927 struct ocfs2_dx_entry *dx_entry = NULL;
928 struct buffer_head *dir_ent_bh = NULL;
929 struct ocfs2_dir_entry *dir_ent = NULL;
930 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
931 struct ocfs2_extent_list *dr_el;
932 struct ocfs2_dx_entry_list *entry_list;
933
934 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
935
936 if (ocfs2_dx_root_inline(dx_root)) {
937 entry_list = &dx_root->dr_entries;
938 goto search;
939 }
940
941 dr_el = &dx_root->dr_list;
942
943 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
944 if (ret) {
945 mlog_errno(ret);
946 goto out;
947 }
948
949 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
950 "returns: %llu\n",
951 (unsigned long long)OCFS2_I(dir)->ip_blkno,
952 namelen, name, hinfo->major_hash, hinfo->minor_hash,
953 (unsigned long long)phys);
954
955 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
956 if (ret) {
957 mlog_errno(ret);
958 goto out;
959 }
960
961 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
962
963 mlog(0, "leaf info: num_used: %d, count: %d\n",
964 le16_to_cpu(dx_leaf->dl_list.de_num_used),
965 le16_to_cpu(dx_leaf->dl_list.de_count));
966
967 entry_list = &dx_leaf->dl_list;
968
969search:
970 /*
971 * Empty leaf is legal, so no need to check for that.
972 */
973 found = 0;
974 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
975 dx_entry = &entry_list->de_entries[i];
976
977 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
978 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
979 continue;
980
981 /*
982 * Search unindexed leaf block now. We're not
983 * guaranteed to find anything.
984 */
985 ret = ocfs2_read_dir_block_direct(dir,
986 le64_to_cpu(dx_entry->dx_dirent_blk),
987 &dir_ent_bh);
988 if (ret) {
989 mlog_errno(ret);
990 goto out;
991 }
992
993 /*
994 * XXX: We should check the unindexed block here,
995 * before using it.
996 */
997
998 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
999 0, dir_ent_bh->b_data,
1000 dir->i_sb->s_blocksize, &dir_ent);
1001 if (found == 1)
1002 break;
1003
1004 if (found == -1) {
1005 /* This means we found a bad directory entry. */
1006 ret = -EIO;
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 brelse(dir_ent_bh);
1012 dir_ent_bh = NULL;
1013 }
1014
1015 if (found <= 0) {
1016 ret = -ENOENT;
1017 goto out;
1018 }
1019
1020 res->dl_leaf_bh = dir_ent_bh;
1021 res->dl_entry = dir_ent;
1022 res->dl_dx_leaf_bh = dx_leaf_bh;
1023 res->dl_dx_entry = dx_entry;
1024
1025 ret = 0;
1026out:
1027 if (ret) {
1028 brelse(dx_leaf_bh);
1029 brelse(dir_ent_bh);
1030 }
1031 return ret;
1032}
1033
1034static int ocfs2_find_entry_dx(const char *name, int namelen,
1035 struct inode *dir,
1036 struct ocfs2_dir_lookup_result *lookup)
1037{
1038 int ret;
1039 struct buffer_head *di_bh = NULL;
1040 struct ocfs2_dinode *di;
1041 struct buffer_head *dx_root_bh = NULL;
1042 struct ocfs2_dx_root_block *dx_root;
1043
1044 ret = ocfs2_read_inode_block(dir, &di_bh);
1045 if (ret) {
1046 mlog_errno(ret);
1047 goto out;
1048 }
1049
1050 di = (struct ocfs2_dinode *)di_bh->b_data;
1051
1052 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1053 if (ret) {
1054 mlog_errno(ret);
1055 goto out;
1056 }
1057 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1058
1059 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1060 if (ret) {
1061 if (ret != -ENOENT)
1062 mlog_errno(ret);
1063 goto out;
1064 }
1065
1066 lookup->dl_dx_root_bh = dx_root_bh;
1067 dx_root_bh = NULL;
1068out:
1069 brelse(di_bh);
1070 brelse(dx_root_bh);
1071 return ret;
1072}
1073
483/* 1074/*
484 * Try to find an entry of the provided name within 'dir'. 1075 * Try to find an entry of the provided name within 'dir'.
485 * 1076 *
486 * If nothing was found, NULL is returned. Otherwise, a buffer_head 1077 * If nothing was found, -ENOENT is returned. Otherwise, zero is
487 * and pointer to the dir entry are passed back. 1078 * returned and the struct 'res' will contain information useful to
1079 * other directory manipulation functions.
488 * 1080 *
489 * Caller can NOT assume anything about the contents of the 1081 * Caller can NOT assume anything about the contents of the
 490 * buffer_head - it is passed back only so that it can be passed 1082 * buffer_heads - they are passed back only so that they can be passed
491 * any one of the manipulation functions (add entry, delete entry, 1083 * into any one of the manipulation functions (add entry, delete
492 * etc). As an example, bh in the extent directory case is a data 1084 * entry, etc). As an example, bh in the extent directory case is a
493 * block, in the inline-data case it actually points to an inode. 1085 * data block, in the inline-data case it actually points to an inode,
1086 * in the indexed directory case, multiple buffers are involved.
494 */ 1087 */
495struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1088int ocfs2_find_entry(const char *name, int namelen,
496 struct inode *dir, 1089 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
497 struct ocfs2_dir_entry **res_dir)
498{ 1090{
499 *res_dir = NULL; 1091 struct buffer_head *bh;
1092 struct ocfs2_dir_entry *res_dir = NULL;
500 1093
1094 if (ocfs2_dir_indexed(dir))
1095 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1096
1097 /*
1098 * The unindexed dir code only uses part of the lookup
1099 * structure, so there's no reason to push it down further
1100 * than this.
1101 */
501 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1102 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
502 return ocfs2_find_entry_id(name, namelen, dir, res_dir); 1103 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1104 else
1105 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1106
1107 if (bh == NULL)
1108 return -ENOENT;
503 1109
504 return ocfs2_find_entry_el(name, namelen, dir, res_dir); 1110 lookup->dl_leaf_bh = bh;
1111 lookup->dl_entry = res_dir;
1112 return 0;
505} 1113}
506 1114
507/* 1115/*
508 * Update inode number and type of a previously found directory entry. 1116 * Update inode number and type of a previously found directory entry.
509 */ 1117 */
510int ocfs2_update_entry(struct inode *dir, handle_t *handle, 1118int ocfs2_update_entry(struct inode *dir, handle_t *handle,
511 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 1119 struct ocfs2_dir_lookup_result *res,
512 struct inode *new_entry_inode) 1120 struct inode *new_entry_inode)
513{ 1121{
514 int ret; 1122 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1123 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1124 struct ocfs2_dir_entry *de = res->dl_entry;
1125 struct buffer_head *de_bh = res->dl_leaf_bh;
516 1126
517 /* 1127 /*
518 * The same code works fine for both inline-data and extent 1128 * The same code works fine for both inline-data and extent
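
ocfs2_dx_dir_lookup() in the previous hunk converts a hash pair into a disk block in two steps: the major hash acts as a cluster offset resolved through the dx extent list, then the low bits of the minor hash (masked with osb_dx_mask, i.e. blocks-per-cluster minus one, as in __ocfs2_dx_dir_hash_idx()) pick the block inside that cluster. A worked example with invented geometry, ignoring the clamp to the extent's last cluster:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t major_hash = 0x1234, minor_hash = 0xabcd;
            uint32_t cpos = 0x1000;     /* e_cpos of the covering record */
            uint64_t rec_blkno = 800;   /* e_blkno: first block of the extent */
            unsigned int bpc = 8;       /* blocks per cluster (made up) */
            uint64_t blkno;

            /* Step 1: first block of the cluster covering major_hash. */
            blkno = rec_blkno + (uint64_t)(major_hash - cpos) * bpc;

            /* Step 2: block within that cluster, from the low minor-hash bits. */
            blkno += minor_hash & (bpc - 1);

            /* prints 5317: 800 + 564 * 8 + 5 */
            printf("dx leaf block: %llu\n", (unsigned long long)blkno);
            return 0;
    }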
@@ -538,6 +1148,10 @@ out:
538 return ret; 1148 return ret;
539} 1149}
540 1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
541static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
542 struct ocfs2_dir_entry *de_del, 1156 struct ocfs2_dir_entry *de_del,
543 struct buffer_head *bh, char *first_de, 1157 struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
587 return status; 1201 return status;
588} 1202}
589 1203
1204static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1205{
1206 unsigned int hole;
1207
1208 if (le64_to_cpu(de->inode) == 0)
1209 hole = le16_to_cpu(de->rec_len);
1210 else
1211 hole = le16_to_cpu(de->rec_len) -
1212 OCFS2_DIR_REC_LEN(de->name_len);
1213
1214 return hole;
1215}
1216
1217static int ocfs2_find_max_rec_len(struct super_block *sb,
1218 struct buffer_head *dirblock_bh)
1219{
1220 int size, this_hole, largest_hole = 0;
1221 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1222 struct ocfs2_dir_entry *de;
1223
1224 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1225 size = ocfs2_dir_trailer_blk_off(sb);
1226 limit = start + size;
1227 de_buf = start;
1228 de = (struct ocfs2_dir_entry *)de_buf;
1229 do {
1230 if (de_buf != trailer) {
1231 this_hole = ocfs2_figure_dirent_hole(de);
1232 if (this_hole > largest_hole)
1233 largest_hole = this_hole;
1234 }
1235
1236 de_buf += le16_to_cpu(de->rec_len);
1237 de = (struct ocfs2_dir_entry *)de_buf;
1238 } while (de_buf < limit);
1239
1240 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1241 return largest_hole;
1242 return 0;
1243}
1244
1245static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1246 int index)
1247{
1248 int num_used = le16_to_cpu(entry_list->de_num_used);
1249
1250 if (num_used == 1 || index == (num_used - 1))
1251 goto clear;
1252
1253 memmove(&entry_list->de_entries[index],
1254 &entry_list->de_entries[index + 1],
1255 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1256clear:
1257 num_used--;
1258 memset(&entry_list->de_entries[num_used], 0,
1259 sizeof(struct ocfs2_dx_entry));
1260 entry_list->de_num_used = cpu_to_le16(num_used);
1261}
1262
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these is disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, dir,
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	mlog(0, "Dir %llu: delete entry at index: %d\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
 					struct inode *dir,
 					struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+		       struct ocfs2_dir_lookup_result *res)
 {
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
 
-	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
 }
 
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
 	return 0;
 }
 
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+					      handle_t *handle,
+					      struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
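The free list maintained by ocfs2_recalc_free_list() and ocfs2_remove_block_from_free_list() is a singly linked list threaded through the directory block trailers, with its head in the dx root; the lookup phase records the previous element so unlinking never needs a list walk. A sketch of the unlink step under those assumptions, using in-memory structs in place of journaled buffers:

#include <stdio.h>

struct blk {
	unsigned int free_rec_len;	/* largest hole; 0 == not on list */
	struct blk *free_next;
};

static void free_list_unlink(struct blk **head, struct blk *prev,
			     struct blk *victim)
{
	if (prev == NULL)			/* victim is the list head */
		*head = victim->free_next;
	else
		prev->free_next = victim->free_next;

	victim->free_rec_len = 0;
	victim->free_next = NULL;
}

int main(void)
{
	struct blk a = { 40, NULL }, b = { 12, NULL };
	struct blk *head = &a;

	a.free_next = &b;
	free_list_unlink(&head, &a, &b);	/* 'b' filled up */
	printf("head hole: %u, next: %p\n", head->free_rec_len,
	       (void *)head->free_next);
	return 0;
}
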
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
  */
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh)
+		      struct ocfs2_dir_lookup_result *lookup)
 {
 	unsigned long offset;
 	unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct super_block *sb = dir->i_sb;
 	int retval, status;
 	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
 	if (!namelen)
 		return -EINVAL;
 
-	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		data_start = di->id2.i_data.id_data;
 		size = i_size_read(dir);
 
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
 				status = ocfs2_journal_access_di(handle, dir,
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
-			else
+			else {
 				status = ocfs2_journal_access_db(handle, dir,
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								     handle,
+								     lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
 			dir->i_version++;
 			status = ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
@@ -870,6 +1851,10 @@ out:
 	return 0;
 }
 
+/*
+ * NOTE: This function can be called against both unindexed and
+ * indexed directories.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
 				    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent)
+			     struct ocfs2_dir_lookup_result *lookup)
 {
 	int status = -ENOENT;
 
-	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-		   namelen, name, blkno, inode, dirent_bh, dirent);
+	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-	if (!*dirent_bh || !*dirent) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
 		goto leave;
-	}
 
-	*blkno = le64_to_cpu((*dirent)->inode);
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
 
 	status = 0;
 leave:
-	if (status < 0) {
-		*dirent = NULL;
-		brelse(*dirent_bh);
-		*dirent_bh = NULL;
-	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-	brelse(bh);
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 			      int namelen)
 {
 	int ret;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("dir %llu, name '%.*s'\n",
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
-	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-	if (dirent_bh)
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
 		goto bail;
 
 	ret = 0;
 bail:
-	brelse(dirent_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(ret);
 	return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
+	unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 				   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	/*
 	 * Check the positions of "." and ".." records to be sure
 	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed in the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
 	 */
 	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
 		p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	if (name_len == 2 && !strncmp("..", name, 2) &&
 	    pos == OCFS2_DIR_REC_LEN(1)) {
 		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
 		return 0;
 	}
 
 	p->seen_other = 1;
 	return 1;
 }
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
 
 	memset(&priv, 0, sizeof(priv));
 
+	if (ocfs2_dir_indexed(inode)) {
+		ret = ocfs2_empty_dir_dx(inode, &priv);
+		if (ret)
+			mlog_errno(ret);
+		/*
+		 * We still run ocfs2_dir_foreach to get the checks
+		 * for "." and "..".
+		 */
+	}
+
 	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
 	if (ret)
 		mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct inode *parent,
 				 struct inode *inode,
 				 struct buffer_head *fe_bh,
-				 struct ocfs2_alloc_context *data_ac)
+				 struct ocfs2_alloc_context *data_ac,
+				 struct buffer_head **ret_new_bh)
 {
 	int status;
 	unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	if (ocfs2_supports_dir_trailer(osb))
+	if (ocfs2_new_dir_wants_trailer(inode))
 		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
 	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(inode, new_bh);
+	if (ocfs2_new_dir_wants_trailer(inode)) {
+		int size = le16_to_cpu(de->rec_len);
+
+		/*
+		 * Figure out the size of the hole left over after
+		 * insertion of '.' and '..'. The trailer wants this
+		 * information.
+		 */
+		size -= OCFS2_DIR_REC_LEN(2);
+		size -= sizeof(struct ocfs2_dir_block_trailer);
+
+		ocfs2_init_dir_trailer(inode, new_bh, size);
+	}
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	}
 
 	status = 0;
+	if (ret_new_bh) {
+		*ret_new_bh = new_bh;
+		new_bh = NULL;
+	}
 bail:
 	brelse(new_bh);
 
@@ -1336,20 +2384,427 @@ bail:
 	return status;
 }
 
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+				     handle_t *handle, struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dirdata_bh,
+				     struct ocfs2_alloc_context *meta_ac,
+				     int dx_inline, u32 num_entries,
+				     struct buffer_head **ret_dx_root_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	u16 dr_suballoc_bit;
+	u64 dr_blkno;
+	unsigned int num_bits;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
+				   &num_bits, &dr_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu, attach new index block: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     (unsigned long long)dr_blkno);
+
+	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+	if (dx_root_bh == NULL) {
+		ret = -EIO;
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	memset(dx_root, 0, osb->sb->s_blocksize);
+	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+	dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+	dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+	dx_root->dr_num_entries = cpu_to_le32(num_entries);
+	if (le16_to_cpu(trailer->db_free_rec_len))
+		dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+	else
+		dx_root->dr_free_blk = cpu_to_le64(0);
+
+	if (dx_inline) {
+		dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+		dx_root->dr_entries.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+	} else {
+		dx_root->dr_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+	}
+
+	ret = ocfs2_journal_dirty(handle, dx_root_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di->i_dx_root = cpu_to_le64(dr_blkno);
+
+	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	*ret_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+
+out:
+	brelse(dx_root_bh);
+	return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+				       handle_t *handle, struct inode *dir,
+				       struct buffer_head **dx_leaves,
+				       int num_dx_leaves, u64 start_blk)
+{
+	int ret, i;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct buffer_head *bh;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		bh = sb_getblk(osb->sb, start_blk + i);
+		if (bh == NULL) {
+			ret = -EIO;
+			goto out;
+		}
+		dx_leaves[i] = bh;
+
+		ocfs2_set_new_buffer_uptodate(dir, bh);
+
+		ret = ocfs2_journal_access_dl(handle, dir, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+		memset(dx_leaf, 0, osb->sb->s_blocksize);
+		strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+		dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+		dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+		dx_leaf->dl_list.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+		mlog(0,
+		     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+		     (unsigned long long)bh->b_blocknr,
+		     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+		ocfs2_journal_dirty(handle, bh);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+				      u32 cpos, handle_t *handle,
+				      struct ocfs2_alloc_context *data_ac,
+				      struct buffer_head **dx_leaves,
+				      int num_dx_leaves, u64 *ret_phys_blkno)
+{
+	int ret;
+	u32 phys, num;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	/*
+	 * XXX: For create, this should claim cluster for the index
+	 * *before* the unindexed insert so that we have a better
+	 * chance of contiguousness as the directory grows in number
+	 * of entries.
+	 */
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Format the new cluster first. That way, we're inserting
+	 * valid data.
+	 */
+	phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+	ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+					  num_dx_leaves, phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_phys_blkno = phys_blkno;
+out:
+	return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, handle_t *handle,
+				    struct ocfs2_alloc_context *data_ac,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves)
+{
+	int ret;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+				  meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+							int *ret_num_leaves)
+{
+	int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+	struct buffer_head **dx_leaves;
+
+	dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+			    GFP_NOFS);
+	if (dx_leaves && ret_num_leaves)
+		*ret_num_leaves = num_dx_leaves;
+
+	return dx_leaves;
+}
+
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh,
+				 struct ocfs2_alloc_context *data_ac,
+				 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *leaf_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_hinfo hinfo;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * Our strategy is to create the directory as though it were
+	 * unindexed, then add the index block. This works with very
+	 * little complication since the state of a new directory is a
+	 * very well known quantity.
+	 *
+	 * Essentially, we have two dirents ("." and "..") in the 1st
+	 * block which need indexing. These are easily inserted into
+	 * the index block.
+	 */
+
+	ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+				    data_ac, &leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+					meta_ac, 1, 2, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	/* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+	ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+	ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+out:
+	brelse(dx_root_bh);
+	brelse(leaf_bh);
+	return ret;
+}
+
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac)
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac)
+
 {
 	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
 
+	if (ocfs2_supports_indexed_dirs(osb))
+		return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+					     data_ac, meta_ac);
+
 	return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
-				     data_ac);
+				     data_ac, NULL);
+}
+
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+				    handle_t *handle,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves,
+				    u32 *num_dx_entries,
+				    struct buffer_head *dirent_bh)
+{
+	int ret = 0, namelen, i;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct buffer_head *dx_leaf_bh;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		namelen = de->name_len;
+		if (!namelen || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+		i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+		dx_leaf_bh = dx_leaves[i];
+
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+						 dirent_blk, dx_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		*num_dx_entries = *num_dx_entries + 1;
+
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+out:
+	return ret;
+}
+
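ocfs2_dx_dir_index_block() hashes every live name and uses ocfs2_dx_dir_hash_idx() to pick the leaf block within the index cluster; per the comment further below, that block offset is a constant mask of the minor hash. A sketch of the bucketing with a stand-in FNV-1a hash (the real ocfs2 name hash differs):

#include <stdio.h>
#include <stdint.h>

static uint32_t name_hash(const char *name)	/* stand-in hash */
{
	uint32_t h = 2166136261u;

	for (; *name; name++)
		h = (h ^ (unsigned char)*name) * 16777619u;
	return h;
}

int main(void)
{
	const char *names[] = { ".", "..", "alpha", "beta", "gamma" };
	unsigned int num_leaves = 8;	/* blocks per cluster, power of 2 */

	for (unsigned int i = 0; i < 5; i++) {
		uint32_t minor = name_hash(names[i]);

		/* leaf index is a constant mask of the minor hash */
		printf("%-6s -> leaf %u\n", names[i],
		       minor & (num_leaves - 1));
	}
	return 0;
}
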
+/*
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+					  struct buffer_head *dx_root_bh,
+					  struct buffer_head *dirent_bh)
+{
+	char *de_buf, *limit;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (!de->name_len || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+		mlog(0,
+		     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
+		     (unsigned long long)dir->i_ino, hinfo.major_hash,
+		     hinfo.minor_hash,
+		     le16_to_cpu(dx_root->dr_entries.de_num_used),
+		     de->name_len, de->name);
+
+		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+					   dirent_blk);
+
+		le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+					 struct buffer_head *di_bh)
+{
+	int dirent_count = 0;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (de->name_len && de->inode)
+			dirent_count++;
+
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+	/* We are careful to leave room for one extra record. */
+	return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
 }
 
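The inline-vs-extent decision above is just a count of live dirents against the capacity of an inline dx root, leaving headroom for one more insert. A toy model of that check (the capacity value is an arbitrary stand-in):

#include <stdio.h>

struct rec { int live; };	/* 1 if name_len && inode, else 0 */

/* Return 1 when all current entries, plus one more, fit inline. */
static int should_be_inline(const struct rec *recs, int nr, int capacity)
{
	int i, count = 0;

	for (i = 0; i < nr; i++)
		count += recs[i].live;

	return count < capacity;	/* leaves room for one extra record */
}

int main(void)
{
	struct rec dir[] = { {1}, {1}, {0}, {1} };	/* 3 live entries */

	printf("%s\n", should_be_inline(dir, 4, 4) ? "inline" : "extent");
	return 0;
}
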
 /*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
  * expansion from an inline directory to one with extents. The first dir block
  * in that case is taken from the inline data portion of the inode block.
  *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily found in the last dirent,
+ * even after expansion. The directory indexing code wants this value for
+ * free space accounting. We do this here since we're already walking the
+ * entire dir block.
+ *
  * We add the dir trailer if this filesystem wants it.
  */
-static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     struct super_block *sb)
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+					     struct inode *dir)
 {
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
 	unsigned int new_size = sb->s_blocksize;
-	unsigned int bytes;
+	unsigned int bytes, this_hole;
+	unsigned int largest_hole = 0;
 
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		new_size = ocfs2_dir_trailer_blk_off(sb);
 
 	bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
 	de_buf = start;
 	de = (struct ocfs2_dir_entry *)de_buf;
 	do {
+		this_hole = ocfs2_figure_dirent_hole(de);
+		if (this_hole > largest_hole)
+			largest_hole = this_hole;
+
 		prev_de = de;
 		de_buf += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)de_buf;
 	} while (de_buf < limit);
 
 	le16_add_cpu(&prev_de->rec_len, bytes);
+
+	/* We need to double check this after modification of the final
+	 * dirent. */
+	this_hole = ocfs2_figure_dirent_hole(prev_de);
+	if (this_hole > largest_hole)
+		largest_hole = this_hole;
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
 }
 
 /*
@@ -1396,36 +2873,68 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
  */
 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
+				   struct ocfs2_dir_lookup_result *lookup,
 				   struct buffer_head **first_block_bh)
 {
-	u32 alloc, bit_off, len;
+	u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
 	struct super_block *sb = dir->i_sb;
-	int ret, credits = ocfs2_inline_to_extents_credits(sb);
-	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
+	int ret, i, num_dx_leaves = 0, dx_inline = 0,
+		credits = ocfs2_inline_to_extents_credits(sb);
+	u64 dx_insert_blkno, blkno,
+		bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
 	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct buffer_head *dirdata_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head **dx_leaves = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
-	int did_quota = 0;
+	struct ocfs2_extent_tree dx_et;
+	int did_quota = 0, bytes_allocated = 0;
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
+	dx_alloc = 0;
+
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		credits += ocfs2_add_dir_index_credits(sb);
+
+		dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+		if (!dx_inline) {
+			/* Add one more cluster for an index leaf */
+			dx_alloc++;
+			dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+								&num_dx_leaves);
+			if (!dx_leaves) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* This gets us the dx_root */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
 
 	/*
-	 * We should never need more than 2 clusters for this -
-	 * maximum dirent size is far less than one block. In fact,
-	 * the only time we'd need more than one cluster is if
+	 * We should never need more than 2 clusters for the unindexed
+	 * tree - maximum dirent size is far less than one block. In
+	 * fact, the only time we'd need more than one cluster is if
 	 * blocksize == clustersize and the dirent won't fit in the
 	 * extra space that the expansion to a single block gives. As
	 * of today, that only happens on 4k/4k file systems.
 	 */
 	BUG_ON(alloc > 2);
 
-	ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+	ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 	/*
 	 * Prepare for worst case allocation scenario of two separate
-	 * extents.
+	 * extents in the unindexed tree.
 	 */
 	if (alloc == 2)
 		credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
-				       ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+				       ocfs2_clusters_to_bytes(osb->sb,
							       alloc + dx_alloc))) {
 		ret = -EDQUOT;
 		goto out_commit;
 	}
 	did_quota = 1;
+
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Allocate our index cluster first, to maximize the
+		 * possibility that unindexed leaves grow
+		 * contiguously.
+		 */
+		ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+						 dx_leaves, num_dx_leaves,
+						 &dx_insert_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+	}
 
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		mlog_errno(ret);
 		goto out_commit;
 	}
+	bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 
 	/*
 	 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(dir, dirdata_bh);
+	i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+	if (ocfs2_new_dir_wants_trailer(dir)) {
+		/*
+		 * Prepare the dir trailer up front. It will otherwise look
+		 * like a valid dirent. Even if inserting the index fails
+		 * (unlikely), then all we'll have done is given the first
+		 * dir block a small amount of fragmentation.
+		 */
+		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+	}
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Dx dirs with an external cluster need to do this up
+		 * front. Inline dx roots get handled later, after
+		 * we've allocated our root block. We get passed back
+		 * a total number of items so that dr_num_entries can
+		 * be correctly set once the dx_root has been
+		 * allocated.
+		 */
+		ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+					       num_dx_leaves, &num_dx_entries,
+					       dirdata_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
 	/*
 	 * Set extent, i_size, etc on the directory. After this, the
 	 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+						dirdata_bh, meta_ac, dx_inline,
+						num_dx_entries, &dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		if (dx_inline) {
+			ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+						      dirdata_bh);
+		} else {
+			ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
+			ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+						  dx_insert_blkno, 1, 0, NULL);
+			if (ret)
+				mlog_errno(ret);
+		}
+	}
+
 	/*
 	 * We asked for two clusters, but only got one in the 1st
 	 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 			mlog_errno(ret);
 			goto out_commit;
 		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 	}
 
 	*first_block_bh = dirdata_bh;
 	dirdata_bh = NULL;
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		unsigned int off;
+
+		if (!dx_inline) {
+			/*
+			 * We need to return the correct block within the
+			 * cluster which should hold our entry.
+			 */
+			off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+						    &lookup->dl_hinfo);
+			get_bh(dx_leaves[off]);
+			lookup->dl_dx_leaf_bh = dx_leaves[off];
+		}
+		lookup->dl_dx_root_bh = dx_root_bh;
+		dx_root_bh = NULL;
+	}
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir,
-					  ocfs2_clusters_to_bytes(osb->sb, 2));
+		vfs_dq_free_space_nodirty(dir, bytes_allocated);
+
 	ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
 out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
 
 	brelse(dirdata_bh);
+	brelse(dx_root_bh);
 
 	return ret;
 }
@@ -1658,11 +3258,14 @@ bail:
  * is to be turned into an extent based one. The size of the dirent to
  * insert might be larger than the space gained by growing to just one
  * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
  */
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
 			    unsigned int blocks_wanted,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_extent_tree et;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
 
 	mlog_entry_void();
 
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * This would be a code error as an inline directory should
+		 * never have an index root.
+		 */
+		BUG_ON(dx_root_bh);
+
 		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
-						 blocks_wanted, &new_bh);
+						 blocks_wanted, lookup,
+						 &new_bh);
 		if (status) {
 			mlog_errno(status);
 			goto bail;
 		}
 
+		/* Expansion from inline to an indexed directory will
+		 * have given us this. */
+		dx_root_bh = lookup->dl_dx_root_bh;
+
 		if (blocks_wanted == 1) {
 			/*
 			 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	}
 
 do_extend:
+	if (ocfs2_dir_indexed(dir))
+		credits++; /* For attaching the new dirent block to the
+			    * dx_root */
+
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
@@ -1781,9 +3400,19 @@ do_extend:
 
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
-	if (ocfs2_dir_has_trailer(dir)) {
+	if (ocfs2_supports_dir_trailer(dir)) {
 		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
-		ocfs2_init_dir_trailer(dir, new_bh);
+
+		ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+		if (ocfs2_dir_indexed(dir)) {
+			status = ocfs2_dx_dir_link_trailer(dir, handle,
+							   dx_root_bh, new_bh);
+			if (status) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * This calculates how many free bytes we'd have in block zero, should
 	 * this function force expansion to an extent tree.
 	 */
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
 	else
 		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
 	return status;
 }
 
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+	const struct ocfs2_dx_entry *entry1 = a;
+	const struct ocfs2_dx_entry *entry2 = b;
+	u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+	u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+	u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+	u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+	if (major_hash1 > major_hash2)
+		return 1;
+	if (major_hash1 < major_hash2)
+		return -1;
+
+	/*
+	 * It is not strictly necessary to sort by minor
+	 */
+	if (minor_hash1 > minor_hash2)
+		return 1;
+	if (minor_hash1 < minor_hash2)
+		return -1;
+	return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+	struct ocfs2_dx_entry *entry1 = a;
+	struct ocfs2_dx_entry *entry2 = b;
+	struct ocfs2_dx_entry tmp;
+
+	BUG_ON(size != sizeof(*entry1));
+
+	tmp = *entry1;
+	*entry1 = *entry2;
+	*entry2 = tmp;
+}
+
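dx_leaf_sort_cmp() orders entries by (major, minor) hash so that the split logic below can operate on sorted input. The same comparator shape, driven by qsort() in a userspace sketch:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dx { uint32_t major, minor; };

static int dx_cmp(const void *a, const void *b)
{
	const struct dx *e1 = a, *e2 = b;

	if (e1->major != e2->major)
		return e1->major > e2->major ? 1 : -1;
	if (e1->minor != e2->minor)
		return e1->minor > e2->minor ? 1 : -1;
	return 0;
}

int main(void)
{
	struct dx v[] = { {3, 9}, {1, 5}, {3, 2}, {1, 1} };

	qsort(v, 4, sizeof(v[0]), dx_cmp);
	for (int i = 0; i < 4; i++)
		printf("(%u,%u) ", v[i].major, v[i].minor);
	printf("\n");	/* (1,1) (1,5) (3,2) (3,9) */
	return 0;
}
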
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num = le16_to_cpu(dl_list->de_num_used);
+
+	for (i = 0; i < (num - 1); i++) {
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+		    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+			return 0;
+	}
+
+	return 1;
+}
+
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
3672 * There's a couple rare, but nasty corner cases we have to
3673 * check for here. All of them involve a leaf where all value
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
3713 * val can not be the same as insert hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
3730 * we simple travel forward, from the median, and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
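/*
 * A minimal userspace sketch of the split policy above, assuming a
 * plain sorted array of major hashes (names here are illustrative,
 * not part of ocfs2):
 *
 *	static int pick_split_hash(const uint32_t *sorted, int num_used,
 *				   uint32_t leaf_cpos, uint32_t insert_hash,
 *				   uint32_t *split_hash)
 *	{
 *		int i;
 *
 *		if (sorted[0] == sorted[num_used - 1]) {
 *			uint32_t val = sorted[0];	// all values equal
 *
 *			if (val == insert_hash)
 *				return -1;		// splitting can't help
 *			if (val == leaf_cpos) {
 *				*split_hash = leaf_cpos + 1;
 *				return 0;
 *			}
 *			*split_hash = (val > insert_hash) ? val : insert_hash;
 *			return 0;
 *		}
 *
 *		for (i = num_used / 2; i < num_used; i++)	// from median up
 *			if (sorted[i] > leaf_cpos)
 *				break;
 *		*split_hash = sorted[i];
 *		return 0;
 *	}
 */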
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
3748 * Since the block offset inside a leaf (cluster) is a constant mask
3749 * of minor_hash, we can optimize - an item at block offset X within
3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
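/*
 * The invariant above, as a stand-alone sketch (assuming, as the
 * code does elsewhere, that blocks-per-cluster is a power of two;
 * the real mapping is __ocfs2_dx_dir_hash_idx()):
 *
 *	static inline int dx_leaf_index(unsigned int minor_hash,
 *					unsigned int blocks_per_cluster)
 *	{
 *		return minor_hash & (blocks_per_cluster - 1);
 *	}
 *
 * Because the leaf index is a pure function of minor_hash, an entry
 * keeps its block offset when it moves to the new cluster.
 */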
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3845		mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance a non-full leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928	 * won't happen if there's an error before that. Once the
3929	 * insert is done, we can transfer entries from one leaf into
3930	 * the other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
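/*
 * To recap the ordering in ocfs2_dx_dir_rebalance(): sort the full
 * leaf in place, pick a split hash, reserve journal access on every
 * original leaf block, grow the index by a cluster at the split
 * hash, and only then transfer the high-hash entries - so nothing
 * is moved until every step that can fail has already succeeded.
 */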
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
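/*
 * Note the retry pattern above: at most one rebalance per insert,
 * tracked by the 'rebalanced' flag, so a lookup that still finds a
 * full leaf afterwards returns -ENOSPC instead of looping forever.
 */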
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
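/*
 * A minimal model of the free-list walk above, with a hypothetical
 * node type (the on-disk version chains directory blocks through
 * db_free_next): keep the previous node so the caller can later
 * unlink the hit; a NULL result corresponds to -ENOSPC.
 *
 *	struct free_node { struct free_node *next; int free_len; };
 *
 *	static struct free_node *find_fit(struct free_node *head, int want,
 *					  struct free_node **prev_ret)
 *	{
 *		struct free_node *prev = NULL, *cur;
 *
 *		for (cur = head; cur; prev = cur, cur = cur->next)
 *			if (cur->free_len >= want)
 *				break;
 *		*prev_ret = prev;	// NULL: hit is at the list head
 *		return cur;
 *	}
 */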
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166	 * failure to add the dx_root_bh to the journal won't result
4167	 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real
4302 * error and we can't continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
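/*
 * To recap the flow above: insert preparation for an indexed
 * directory is two-phase. ocfs2_find_dir_space_dx() first guarantees
 * a slot in the index (rebalancing a full leaf at most once), then
 * the unindexed tree is searched via the free-space list, with
 * ocfs2_extend_dir() as the fallback when no existing block fits.
 */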
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. On success, zero is returned, along with enough
4353 * context in the dir lookup result that ocfs2_add_entry() will be
4354 * able to complete the task with minimal performance impact.
4355 */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh)
+				 struct ocfs2_dir_lookup_result *lookup)
 {
 	int ret;
 	unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
 	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	*ret_de_bh = NULL;
-
 	if (!namelen) {
 		ret = -EINVAL;
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(bh);
 
 		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-				       &bh);
+				       lookup, &bh);
 		if (ret) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}
 
-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
4432
4433static int ocfs2_dx_dir_remove_index(struct inode *dir,
4434 struct buffer_head *di_bh,
4435 struct buffer_head *dx_root_bh)
4436{
4437 int ret;
4438 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4439 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4440 struct ocfs2_dx_root_block *dx_root;
4441 struct inode *dx_alloc_inode = NULL;
4442 struct buffer_head *dx_alloc_bh = NULL;
4443 handle_t *handle;
4444 u64 blk;
4445 u16 bit;
4446 u64 bg_blkno;
4447
4448 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4449
4450 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4451 EXTENT_ALLOC_SYSTEM_INODE,
4452 le16_to_cpu(dx_root->dr_suballoc_slot));
4453 if (!dx_alloc_inode) {
4454 ret = -ENOMEM;
4455 mlog_errno(ret);
4456 goto out;
4457 }
4458 mutex_lock(&dx_alloc_inode->i_mutex);
4459
4460 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4461 if (ret) {
4462 mlog_errno(ret);
4463 goto out_mutex;
4464 }
4465
4466 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4467 if (IS_ERR(handle)) {
4468 ret = PTR_ERR(handle);
4469 mlog_errno(ret);
4470 goto out_unlock;
4471 }
4472
4473 ret = ocfs2_journal_access_di(handle, dir, di_bh,
4474 OCFS2_JOURNAL_ACCESS_WRITE);
4475 if (ret) {
4476 mlog_errno(ret);
4477 goto out_commit;
4478 }
4479
4480 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4481 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4482 di->i_dx_root = cpu_to_le64(0ULL);
4483
4484 ocfs2_journal_dirty(handle, di_bh);
4485
4486 blk = le64_to_cpu(dx_root->dr_blkno);
4487 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4488 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4489 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4490 bit, bg_blkno, 1);
4491 if (ret)
4492 mlog_errno(ret);
4493
4494out_commit:
4495 ocfs2_commit_trans(osb, handle);
4496
4497out_unlock:
4498 ocfs2_inode_unlock(dx_alloc_inode, 1);
4499
4500out_mutex:
4501 mutex_unlock(&dx_alloc_inode->i_mutex);
4502 brelse(dx_alloc_bh);
4503out:
4504 iput(dx_alloc_inode);
4505 return ret;
4506}
4507
4508int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4509{
4510 int ret;
4511 unsigned int uninitialized_var(clen);
4512 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
4513 u64 uninitialized_var(blkno);
4514 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4515 struct buffer_head *dx_root_bh = NULL;
4516 struct ocfs2_dx_root_block *dx_root;
4517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4518 struct ocfs2_cached_dealloc_ctxt dealloc;
4519 struct ocfs2_extent_tree et;
4520
4521 ocfs2_init_dealloc_ctxt(&dealloc);
4522
4523 if (!ocfs2_dir_indexed(dir))
4524 return 0;
4525
4526 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4527 if (ret) {
4528 mlog_errno(ret);
4529 goto out;
4530 }
4531 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4532
4533 if (ocfs2_dx_root_inline(dx_root))
4534 goto remove_index;
4535
4536 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4537
4538 /* XXX: What if dr_clusters is too large? */
4539 while (le32_to_cpu(dx_root->dr_clusters)) {
4540 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4541 major_hash, &cpos, &blkno, &clen);
4542 if (ret) {
4543 mlog_errno(ret);
4544 goto out;
4545 }
4546
4547 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4548
4549 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
4550 &dealloc);
4551 if (ret) {
4552 mlog_errno(ret);
4553 goto out;
4554 }
4555
4556 if (cpos == 0)
4557 break;
4558
4559 major_hash = cpos - 1;
4560 }
4561
4562remove_index:
4563 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4564 if (ret) {
4565 mlog_errno(ret);
4566 goto out;
4567 }
4568
4569 ocfs2_remove_from_cache(dir, dx_root_bh);
4570out:
4571 ocfs2_schedule_truncate_log_flush(osb, 1);
4572 ocfs2_run_deallocs(osb, &dealloc);
4573
4574 brelse(dx_root_bh);
4575 return ret;
4576}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head	*dl_leaf_bh;	/* Unindexed leaf
+						 * block */
+	struct ocfs2_dir_entry	*dl_entry;	/* Target dirent in
+						 * unindexed leaf */
+
+	struct buffer_head	*dl_dx_root_bh;	/* Root of indexed
+						 * tree */
+
+	struct buffer_head	*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry	*dl_dx_entry;	/* Target dx_entry in
+						 * indexed leaf */
+	struct ocfs2_dx_hinfo	dl_hinfo;	/* Name hash results */
+
+	struct buffer_head	*dl_prev_leaf_bh;/* Previous entry in
+						  * dir free space
+						  * list. NULL if
+						  * previous entry is
+						  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac);
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;
 
 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
 
 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 						  DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
 
 #endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";
 
-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
 
 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
+		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
 
 	spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 				db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");
 
-	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/* Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/* Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/* Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d Locally: %d "
-			"Remotely: %d Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
 	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s Purge=%s PendingASTs=%s "
-			"PendingBASTs=%s Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
 	/* Purge Count: xxx Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);
 
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
 
 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
73 const char *name, 73 const char *name,
74 unsigned int namelen) 74 unsigned int namelen)
75{ 75{
76 struct dlm_lock_resource *res;
77
78 if (dlm != mle->dlm) 76 if (dlm != mle->dlm)
79 return 0; 77 return 0;
80 78
81 if (mle->type == DLM_MLE_BLOCK || 79 if (namelen != mle->mnamelen ||
82 mle->type == DLM_MLE_MIGRATION) { 80 memcmp(name, mle->mname, namelen) != 0)
83 if (namelen != mle->u.name.len || 81 return 0;
84 memcmp(name, mle->u.name.name, namelen)!=0) 82
85 return 0;
86 } else {
87 res = mle->u.res;
88 if (namelen != res->lockname.len ||
89 memcmp(res->lockname.name, name, namelen) != 0)
90 return 0;
91 }
92 return 1; 83 return 1;
93} 84}
94 85
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
283 274
284 mle->dlm = dlm; 275 mle->dlm = dlm;
285 mle->type = type; 276 mle->type = type;
286 INIT_LIST_HEAD(&mle->list); 277 INIT_HLIST_NODE(&mle->master_hash_node);
287 INIT_LIST_HEAD(&mle->hb_events); 278 INIT_LIST_HEAD(&mle->hb_events);
288 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 279 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
289 spin_lock_init(&mle->spinlock); 280 spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
295 mle->new_master = O2NM_MAX_NODES; 286 mle->new_master = O2NM_MAX_NODES;
296 mle->inuse = 0; 287 mle->inuse = 0;
297 288
289 BUG_ON(mle->type != DLM_MLE_BLOCK &&
290 mle->type != DLM_MLE_MASTER &&
291 mle->type != DLM_MLE_MIGRATION);
292
298 if (mle->type == DLM_MLE_MASTER) { 293 if (mle->type == DLM_MLE_MASTER) {
299 BUG_ON(!res); 294 BUG_ON(!res);
300 mle->u.res = res; 295 mle->mleres = res;
301 } else if (mle->type == DLM_MLE_BLOCK) { 296 memcpy(mle->mname, res->lockname.name, res->lockname.len);
302 BUG_ON(!name); 297 mle->mnamelen = res->lockname.len;
303 memcpy(mle->u.name.name, name, namelen); 298 mle->mnamehash = res->lockname.hash;
304 mle->u.name.len = namelen; 299 } else {
305 } else /* DLM_MLE_MIGRATION */ {
306 BUG_ON(!name); 300 BUG_ON(!name);
307 memcpy(mle->u.name.name, name, namelen); 301 mle->mleres = NULL;
308 mle->u.name.len = namelen; 302 memcpy(mle->mname, name, namelen);
303 mle->mnamelen = namelen;
304 mle->mnamehash = dlm_lockid_hash(name, namelen);
309 } 305 }
310 306
307 atomic_inc(&dlm->mle_tot_count[mle->type]);
308 atomic_inc(&dlm->mle_cur_count[mle->type]);
309
311 /* copy off the node_map and register hb callbacks on our copy */ 310 /* copy off the node_map and register hb callbacks on our copy */
312 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 311 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
313 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 312 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
318 __dlm_mle_attach_hb_events(dlm, mle); 317 __dlm_mle_attach_hb_events(dlm, mle);
319} 318}
320 319
320void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
321{
322 assert_spin_locked(&dlm->spinlock);
323 assert_spin_locked(&dlm->master_lock);
324
325 if (!hlist_unhashed(&mle->master_hash_node))
326 hlist_del_init(&mle->master_hash_node);
327}
328
329void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
330{
331 struct hlist_head *bucket;
332
333 assert_spin_locked(&dlm->master_lock);
334
335 bucket = dlm_master_hash(dlm, mle->mnamehash);
336 hlist_add_head(&mle->master_hash_node, bucket);
337}
321 338
322/* returns 1 if found, 0 if not */ 339/* returns 1 if found, 0 if not */
323static int dlm_find_mle(struct dlm_ctxt *dlm, 340static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
325 char *name, unsigned int namelen) 342 char *name, unsigned int namelen)
326{ 343{
327 struct dlm_master_list_entry *tmpmle; 344 struct dlm_master_list_entry *tmpmle;
345 struct hlist_head *bucket;
346 struct hlist_node *list;
347 unsigned int hash;
328 348
329 assert_spin_locked(&dlm->master_lock); 349 assert_spin_locked(&dlm->master_lock);
330 350
331 list_for_each_entry(tmpmle, &dlm->master_list, list) { 351 hash = dlm_lockid_hash(name, namelen);
352 bucket = dlm_master_hash(dlm, hash);
353 hlist_for_each(list, bucket) {
354 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
355 master_hash_node);
332 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 356 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
333 continue; 357 continue;
334 dlm_get_mle(tmpmle); 358 dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
408 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 432 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
409 dlm = mle->dlm; 433 dlm = mle->dlm;
410 434
411 if (mle->type != DLM_MLE_MASTER) {
412 mlog(0, "calling mle_release for %.*s, type %d\n",
413 mle->u.name.len, mle->u.name.name, mle->type);
414 } else {
415 mlog(0, "calling mle_release for %.*s, type %d\n",
416 mle->u.res->lockname.len,
417 mle->u.res->lockname.name, mle->type);
418 }
419 assert_spin_locked(&dlm->spinlock); 435 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&dlm->master_lock); 436 assert_spin_locked(&dlm->master_lock);
421 437
438 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
439 mle->type);
440
422 /* remove from list if not already */ 441 /* remove from list if not already */
423 if (!list_empty(&mle->list)) 442 __dlm_unlink_mle(dlm, mle);
424 list_del_init(&mle->list);
425 443
426 /* detach the mle from the domain node up/down events */ 444 /* detach the mle from the domain node up/down events */
427 __dlm_mle_detach_hb_events(dlm, mle); 445 __dlm_mle_detach_hb_events(dlm, mle);
428 446
447 atomic_dec(&dlm->mle_cur_count[mle->type]);
448
429 /* NOTE: kfree under spinlock here. 449 /* NOTE: kfree under spinlock here.
430 * if this is bad, we can move this to a freelist. */ 450 * if this is bad, we can move this to a freelist. */
431 kmem_cache_free(dlm_mle_cache, mle); 451 kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
465 kmem_cache_destroy(dlm_lockres_cache); 485 kmem_cache_destroy(dlm_lockres_cache);
466} 486}
467 487
468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
469 struct dlm_lock_resource *res,
470 u8 owner)
471{
472 assert_spin_locked(&res->spinlock);
473
474 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
475
476 if (owner == dlm->node_num)
477 atomic_inc(&dlm->local_resources);
478 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
479 atomic_inc(&dlm->unknown_resources);
480 else
481 atomic_inc(&dlm->remote_resources);
482
483 res->owner = owner;
484}
485
486void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
487 struct dlm_lock_resource *res, u8 owner)
488{
489 assert_spin_locked(&res->spinlock);
490
491 if (owner == res->owner)
492 return;
493
494 if (res->owner == dlm->node_num)
495 atomic_dec(&dlm->local_resources);
496 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
497 atomic_dec(&dlm->unknown_resources);
498 else
499 atomic_dec(&dlm->remote_resources);
500
501 dlm_set_lockres_owner(dlm, res, owner);
502}
503
504
505static void dlm_lockres_release(struct kref *kref) 488static void dlm_lockres_release(struct kref *kref)
506{ 489{
507 struct dlm_lock_resource *res; 490 struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
527 } 510 }
528 spin_unlock(&dlm->track_lock); 511 spin_unlock(&dlm->track_lock);
529 512
513 atomic_dec(&dlm->res_cur_count);
514
530 dlm_put(dlm); 515 dlm_put(dlm);
531 516
532 if (!hlist_unhashed(&res->hash_node) || 517 if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
607 592
608 kref_init(&res->refs); 593 kref_init(&res->refs);
609 594
595 atomic_inc(&dlm->res_tot_count);
596 atomic_inc(&dlm->res_cur_count);
597
610 /* just for consistency */ 598 /* just for consistency */
611 spin_lock(&res->spinlock); 599 spin_lock(&res->spinlock);
612 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 600 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
843 alloc_mle = NULL; 831 alloc_mle = NULL;
844 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 832 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
845 set_bit(dlm->node_num, mle->maybe_map); 833 set_bit(dlm->node_num, mle->maybe_map);
846 list_add(&mle->list, &dlm->master_list); 834 __dlm_insert_mle(dlm, mle);
847 835
848 /* still holding the dlm spinlock, check the recovery map 836 /* still holding the dlm spinlock, check the recovery map
849 * to see if there are any nodes that still need to be 837 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1270 res->lockname.len, 1258 res->lockname.len,
1271 res->lockname.name); 1259 res->lockname.name);
1272 mle->type = DLM_MLE_MASTER; 1260 mle->type = DLM_MLE_MASTER;
1273 mle->u.res = res; 1261 mle->mleres = res;
1274 } 1262 }
1275 } 1263 }
1276 } 1264 }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
1315 1303
1316 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1304 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1317 1305
1318 if (mle->type != DLM_MLE_MASTER) { 1306 request.namelen = (u8)mle->mnamelen;
1319 request.namelen = mle->u.name.len; 1307 memcpy(request.name, mle->mname, request.namelen);
1320 memcpy(request.name, mle->u.name.name, request.namelen);
1321 } else {
1322 request.namelen = mle->u.res->lockname.len;
1323 memcpy(request.name, mle->u.res->lockname.name,
1324 request.namelen);
1325 }
1326 1308
1327again: 1309again:
1328 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1310 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
1575 // "add the block.\n"); 1557 // "add the block.\n");
1576 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1558 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1577 set_bit(request->node_idx, mle->maybe_map); 1559 set_bit(request->node_idx, mle->maybe_map);
1578 list_add(&mle->list, &dlm->master_list); 1560 __dlm_insert_mle(dlm, mle);
1579 response = DLM_MASTER_RESP_NO; 1561 response = DLM_MASTER_RESP_NO;
1580 } else { 1562 } else {
1581 // mlog(0, "mle was found\n"); 1563 // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
1967 assert->node_idx, rr, extra_ref, mle->inuse); 1949 assert->node_idx, rr, extra_ref, mle->inuse);
1968 dlm_print_one_mle(mle); 1950 dlm_print_one_mle(mle);
1969 } 1951 }
1970 list_del_init(&mle->list); 1952 __dlm_unlink_mle(dlm, mle);
1971 __dlm_mle_detach_hb_events(dlm, mle); 1953 __dlm_mle_detach_hb_events(dlm, mle);
1972 __dlm_put_mle(mle); 1954 __dlm_put_mle(mle);
1973 if (extra_ref) { 1955 if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3159 tmp->master = master; 3141 tmp->master = master;
3160 atomic_set(&tmp->woken, 1); 3142 atomic_set(&tmp->woken, 1);
3161 wake_up(&tmp->wq); 3143 wake_up(&tmp->wq);
3162 /* remove it from the list so that only one 3144 /* remove it so that only one mle will be found */
3163 * mle will be found */ 3145 __dlm_unlink_mle(dlm, tmp);
3164 list_del_init(&tmp->list);
3165 /* this was obviously WRONG. mle is uninited here. should be tmp. */
3166 __dlm_mle_detach_hb_events(dlm, tmp); 3146 __dlm_mle_detach_hb_events(dlm, tmp);
3167 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3147 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3168 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3148 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3181 mle->master = master; 3161 mle->master = master;
3182 /* do this for consistency with other mle types */ 3162 /* do this for consistency with other mle types */
3183 set_bit(new_master, mle->maybe_map); 3163 set_bit(new_master, mle->maybe_map);
3184 list_add(&mle->list, &dlm->master_list); 3164 __dlm_insert_mle(dlm, mle);
3185 3165
3186 return ret; 3166 return ret;
3187} 3167}
3188 3168
3189 3169/*
3190void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3170 * Sets the owner of the lockres, associated with the mle, to UNKNOWN
3171 */
3172static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3173 struct dlm_master_list_entry *mle)
3191{ 3174{
3192 struct dlm_master_list_entry *mle, *next;
3193 struct dlm_lock_resource *res; 3175 struct dlm_lock_resource *res;
3194 unsigned int hash;
3195 3176
3196 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3177 /* Find the lockres associated to the mle and set its owner to UNK */
3197top: 3178 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3198 assert_spin_locked(&dlm->spinlock); 3179 mle->mnamehash);
3180 if (res) {
3181 spin_unlock(&dlm->master_lock);
3199 3182
3200 /* clean the master list */ 3183 /* move lockres onto recovery list */
3201 spin_lock(&dlm->master_lock); 3184 spin_lock(&res->spinlock);
3202 list_for_each_entry_safe(mle, next, &dlm->master_list, list) { 3185 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3203 BUG_ON(mle->type != DLM_MLE_BLOCK && 3186 dlm_move_lockres_to_recovery_list(dlm, res);
3204 mle->type != DLM_MLE_MASTER && 3187 spin_unlock(&res->spinlock);
3205 mle->type != DLM_MLE_MIGRATION); 3188 dlm_lockres_put(res);
3206
3207 /* MASTER mles are initiated locally. the waiting
3208 * process will notice the node map change
3209 * shortly. let that happen as normal. */
3210 if (mle->type == DLM_MLE_MASTER)
3211 continue;
3212 3189
3190 /* about to get rid of mle, detach from heartbeat */
3191 __dlm_mle_detach_hb_events(dlm, mle);
3213 3192
3214 /* BLOCK mles are initiated by other nodes. 3193 /* dump the mle */
3215 * need to clean up if the dead node would have 3194 spin_lock(&dlm->master_lock);
3216 * been the master. */ 3195 __dlm_put_mle(mle);
3217 if (mle->type == DLM_MLE_BLOCK) { 3196 spin_unlock(&dlm->master_lock);
3218 int bit; 3197 }
3219 3198
3220 spin_lock(&mle->spinlock); 3199 return res;
3221 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3200}
3222 if (bit != dead_node) {
3223 mlog(0, "mle found, but dead node %u would "
3224 "not have been master\n", dead_node);
3225 spin_unlock(&mle->spinlock);
3226 } else {
3227 /* must drop the refcount by one since the
3228 * assert_master will never arrive. this
3229 * may result in the mle being unlinked and
3230 * freed, but there may still be a process
3231 * waiting in the dlmlock path which is fine. */
3232 mlog(0, "node %u was expected master\n",
3233 dead_node);
3234 atomic_set(&mle->woken, 1);
3235 spin_unlock(&mle->spinlock);
3236 wake_up(&mle->wq);
3237 /* do not need events any longer, so detach
3238 * from heartbeat */
3239 __dlm_mle_detach_hb_events(dlm, mle);
3240 __dlm_put_mle(mle);
3241 }
3242 continue;
3243 }
3244 3201
3245 /* everything else is a MIGRATION mle */ 3202static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3246 3203 struct dlm_master_list_entry *mle)
3247 /* the rule for MIGRATION mles is that the master 3204{
3248 * becomes UNKNOWN if *either* the original or 3205 __dlm_mle_detach_hb_events(dlm, mle);
3249 * the new master dies. all UNKNOWN lockreses
3250 * are sent to whichever node becomes the recovery
3251 * master. the new master is responsible for
3252 * determining if there is still a master for
3253 * this lockres, or if he needs to take over
3254 * mastery. either way, this node should expect
3255 * another message to resolve this. */
3256 if (mle->master != dead_node &&
3257 mle->new_master != dead_node)
3258 continue;
3259 3206
3260 /* if we have reached this point, this mle needs to 3207 spin_lock(&mle->spinlock);
3261 * be removed from the list and freed. */ 3208 __dlm_unlink_mle(dlm, mle);
3209 atomic_set(&mle->woken, 1);
3210 spin_unlock(&mle->spinlock);
3262 3211
3263 /* remove from the list early. NOTE: unlinking 3212 wake_up(&mle->wq);
3264 * list_head while in list_for_each_safe */ 3213}
3265 __dlm_mle_detach_hb_events(dlm, mle); 3214
3266 spin_lock(&mle->spinlock); 3215static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3267 list_del_init(&mle->list); 3216 struct dlm_master_list_entry *mle, u8 dead_node)
3217{
3218 int bit;
3219
3220 BUG_ON(mle->type != DLM_MLE_BLOCK);
3221
3222 spin_lock(&mle->spinlock);
3223 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3224 if (bit != dead_node) {
3225 mlog(0, "mle found, but dead node %u would not have been "
3226 "master\n", dead_node);
3227 spin_unlock(&mle->spinlock);
3228 } else {
3229 /* Must drop the refcount by one since the assert_master will
3230 * never arrive. This may result in the mle being unlinked and
3231 * freed, but there may still be a process waiting in the
3232 * dlmlock path which is fine. */
3233 mlog(0, "node %u was expected master\n", dead_node);
3268 atomic_set(&mle->woken, 1); 3234 atomic_set(&mle->woken, 1);
3269 spin_unlock(&mle->spinlock); 3235 spin_unlock(&mle->spinlock);
3270 wake_up(&mle->wq); 3236 wake_up(&mle->wq);
3271 3237
3272 mlog(0, "%s: node %u died during migration from " 3238 /* Do not need events any longer, so detach from heartbeat */
3273 "%u to %u!\n", dlm->name, dead_node, 3239 __dlm_mle_detach_hb_events(dlm, mle);
3274 mle->master, mle->new_master); 3240 __dlm_put_mle(mle);
3275 /* if there is a lockres associated with this 3241 }
3276 * mle, find it and set its owner to UNKNOWN */ 3242}
3277 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
3278 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
3279 mle->u.name.len, hash);
3280 if (res) {
3281 /* unfortunately if we hit this rare case, our
3282 * lock ordering is messed. we need to drop
3283 * the master lock so that we can take the
3284 * lockres lock, meaning that we will have to
3285 * restart from the head of list. */
3286 spin_unlock(&dlm->master_lock);
3287 3243
3288 /* move lockres onto recovery list */ 3244void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3289 spin_lock(&res->spinlock); 3245{
3290 dlm_set_lockres_owner(dlm, res, 3246 struct dlm_master_list_entry *mle;
3291 DLM_LOCK_RES_OWNER_UNKNOWN); 3247 struct dlm_lock_resource *res;
3292 dlm_move_lockres_to_recovery_list(dlm, res); 3248 struct hlist_head *bucket;
3293 spin_unlock(&res->spinlock); 3249 struct hlist_node *list;
3294 dlm_lockres_put(res); 3250 unsigned int i;
3295 3251
3296 /* about to get rid of mle, detach from heartbeat */ 3252 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
3297 __dlm_mle_detach_hb_events(dlm, mle); 3253top:
3254 assert_spin_locked(&dlm->spinlock);
3298 3255
3299 /* dump the mle */ 3256 /* clean the master list */
3300 spin_lock(&dlm->master_lock); 3257 spin_lock(&dlm->master_lock);
3301 __dlm_put_mle(mle); 3258 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3302 spin_unlock(&dlm->master_lock); 3259 bucket = dlm_master_hash(dlm, i);
3260 hlist_for_each(list, bucket) {
3261 mle = hlist_entry(list, struct dlm_master_list_entry,
3262 master_hash_node);
3263
3264 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3265 mle->type != DLM_MLE_MASTER &&
3266 mle->type != DLM_MLE_MIGRATION);
3267
3268 /* MASTER mles are initiated locally. The waiting
3269 * process will notice the node map change shortly.
3270 * Let that happen as normal. */
3271 if (mle->type == DLM_MLE_MASTER)
3272 continue;
3273
3274 /* BLOCK mles are initiated by other nodes. Need to
3275 * clean up if the dead node would have been the
3276 * master. */
3277 if (mle->type == DLM_MLE_BLOCK) {
3278 dlm_clean_block_mle(dlm, mle, dead_node);
3279 continue;
3280 }
3303 3281
3304 /* restart */ 3282 /* Everything else is a MIGRATION mle */
3305 goto top; 3283
3306 } 3284 /* The rule for MIGRATION mles is that the master
3285 * becomes UNKNOWN if *either* the original or the new
3286 * master dies. All UNKNOWN lockres' are sent to
3287 * whichever node becomes the recovery master. The new
3288 * master is responsible for determining if there is
3289 * still a master for this lockres, or if he needs to
3290 * take over mastery. Either way, this node should
3291 * expect another message to resolve this. */
3292
3293 if (mle->master != dead_node &&
3294 mle->new_master != dead_node)
3295 continue;
3296
3297 /* If we have reached this point, this mle needs to be
3298 * removed from the list and freed. */
3299 dlm_clean_migration_mle(dlm, mle);
3300
3301 mlog(0, "%s: node %u died during migration from "
3302 "%u to %u!\n", dlm->name, dead_node, mle->master,
3303 mle->new_master);
3304
3305 /* If we find a lockres associated with the mle, we've
3306 * hit this rare case that messes up our lock ordering.
3307 * If so, we need to drop the master lock so that we can
3308 * take the lockres lock, meaning that we will have to
3309 * restart from the head of list. */
3310 res = dlm_reset_mleres_owner(dlm, mle);
3311 if (res)
3312 /* restart */
3313 goto top;
3307 3314
3308 /* this may be the last reference */ 3315 /* This may be the last reference */
3309 __dlm_put_mle(mle); 3316 __dlm_put_mle(mle);
3317 }
3310 } 3318 }
3311 spin_unlock(&dlm->master_lock); 3319 spin_unlock(&dlm->master_lock);
3312} 3320}
3313 3321
3314
3315int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3322int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3316 u8 old_master) 3323 u8 old_master)
3317{ 3324{
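[Editor's note] The dlmmaster.c changes above replace the single flat dlm->master_list with a hash table of master list entries: insertions go through __dlm_insert_mle(), removals through __dlm_unlink_mle(), and dlm_clean_master_list() now walks every bucket returned by dlm_master_hash(). The sketch below is a minimal userspace illustration of that scan-and-unlink pattern; the struct layout, the hash-by-node choice, and all names here are illustrative stand-ins, not the kernel's (the real code hashes by lock name and uses hlist).

#include <stdlib.h>

#define NBUCKETS 16

struct mle {
    unsigned int node;              /* node this entry tracks */
    struct mle *next;               /* singly linked bucket chain */
};

static struct mle *buckets[NBUCKETS];

static void insert_mle(unsigned int node)
{
    struct mle *m = malloc(sizeof(*m));
    if (!m)
        return;
    m->node = node;
    m->next = buckets[node % NBUCKETS];
    buckets[node % NBUCKETS] = m;   /* hash by node, for the demo only */
}

/* Walk every bucket and unlink entries owned by dead_node -- the same
 * shape as the new dlm_clean_master_list() loop over dlm_master_hash(). */
static void clean_master_hash(unsigned int dead_node)
{
    for (int i = 0; i < NBUCKETS; i++) {
        struct mle **pp = &buckets[i];
        while (*pp) {
            struct mle *m = *pp;
            if (m->node == dead_node) {
                *pp = m->next;      /* unlink in place, no second pass */
                free(m);
            } else {
                pp = &m->next;
            }
        }
    }
}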
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
162 162
163 spin_lock(&res->spinlock); 163 spin_lock(&res->spinlock);
164 if (!__dlm_lockres_unused(res)) { 164 if (!__dlm_lockres_unused(res)) {
165 spin_unlock(&res->spinlock);
166 mlog(0, "%s:%.*s: tried to purge but not unused\n", 165 mlog(0, "%s:%.*s: tried to purge but not unused\n",
167 dlm->name, res->lockname.len, res->lockname.name); 166 dlm->name, res->lockname.len, res->lockname.name);
168 return -ENOTEMPTY; 167 __dlm_print_one_lock_resource(res);
168 spin_unlock(&res->spinlock);
169 BUG();
169 } 170 }
171
172 if (res->state & DLM_LOCK_RES_MIGRATING) {
173 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
174 "being remastered\n", dlm->name, res->lockname.len,
175 res->lockname.name);
176 /* Re-add the lockres to the end of the purge list */
177 if (!list_empty(&res->purge)) {
178 list_del_init(&res->purge);
179 list_add_tail(&res->purge, &dlm->purge_list);
180 }
181 spin_unlock(&res->spinlock);
182 return 0;
183 }
184
170 master = (res->owner == dlm->node_num); 185 master = (res->owner == dlm->node_num);
186
171 if (!master) 187 if (!master)
172 res->state |= DLM_LOCK_RES_DROPPING_REF; 188 res->state |= DLM_LOCK_RES_DROPPING_REF;
173 spin_unlock(&res->spinlock); 189 spin_unlock(&res->spinlock);
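[Editor's note] The dlmthread.c hunk above defers purging a lockres that is mid-migration by moving it to the tail of the purge list: list_del_init() followed by list_add_tail() re-queues the entry behind everything already waiting, so it is revisited once remastering has finished. A minimal standalone sketch of that move-to-tail idiom with a hand-rolled circular doubly linked list follows; the kernel's list_head works the same way, and the names here are illustrative.

struct list_node {
    struct list_node *prev, *next;
};

static void list_init(struct list_node *h) { h->prev = h->next = h; }

static void list_del_init_node(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    list_init(n);                      /* leave the node self-linked */
}

static void list_add_tail_node(struct list_node *n, struct list_node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* Defer an entry: requeue it behind everything already on the list. */
static void requeue_at_tail(struct list_node *entry, struct list_node *purge_list)
{
    list_del_init_node(entry);
    list_add_tail_node(entry, purge_list);
}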
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
244 .flags = 0, 244 .flags = 0,
245}; 245};
246 246
247static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
248 .flags = 0,
249};
250
247static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { 251static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
248 .get_osb = ocfs2_get_dentry_osb, 252 .get_osb = ocfs2_get_dentry_osb,
249 .post_unlock = ocfs2_dentry_post_unlock, 253 .post_unlock = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
622 &ocfs2_rename_lops, osb); 626 &ocfs2_rename_lops, osb);
623} 627}
624 628
629static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
630 struct ocfs2_super *osb)
631{
632 /* nfs_sync lockres doesn't come from a slab so we call init
633 * once on it manually. */
634 ocfs2_lock_res_init_once(res);
635 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
636 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
637 &ocfs2_nfs_sync_lops, osb);
638}
639
625void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 640void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
626 struct ocfs2_file_private *fp) 641 struct ocfs2_file_private *fp)
627{ 642{
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2417 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); 2432 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2418} 2433}
2419 2434
2435int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
2436{
2437 int status;
2438 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2439
2440 if (ocfs2_is_hard_readonly(osb))
2441 return -EROFS;
2442
2443 if (ocfs2_mount_local(osb))
2444 return 0;
2445
2446 status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
2447 0, 0);
2448 if (status < 0)
2449 mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
2450
2451 return status;
2452}
2453
2454void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
2455{
2456 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2457
2458 if (!ocfs2_mount_local(osb))
2459 ocfs2_cluster_unlock(osb, lockres,
2460 ex ? LKM_EXMODE : LKM_PRMODE);
2461}
2462
2420int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2463int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2421{ 2464{
2422 int ret; 2465 int ret;
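[Editor's note] ocfs2_nfs_sync_lock() takes the new cluster lock in EX mode from the NFS export path and in PR mode from the delete path (seen later in this series), so an exclusive holder excludes all deleters cluster-wide while deleters still run concurrently with each other. The pthreads reader/writer analogy below is a loose single-node sketch of that EX/PR relationship, not the ocfs2 locking code itself.

#include <pthread.h>

static pthread_rwlock_t nfs_sync = PTHREAD_RWLOCK_INITIALIZER;

/* EX-like: NFS handle validation excludes every concurrent delete. */
static void lookup_path(void)
{
    pthread_rwlock_wrlock(&nfs_sync);
    /* ... test the inode allocator bit, then read the inode ... */
    pthread_rwlock_unlock(&nfs_sync);
}

/* PR-like: deletes may proceed in parallel with one another. */
static void delete_path(void)
{
    pthread_rwlock_rdlock(&nfs_sync);
    /* ... wipe the inode ... */
    pthread_rwlock_unlock(&nfs_sync);
}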
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2798local: 2841local:
2799 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2800 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2801 2845
2802 osb->cconn = conn; 2846 osb->cconn = conn;
2803 2847
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2833 2877
2834 ocfs2_lock_res_free(&osb->osb_super_lockres); 2878 ocfs2_lock_res_free(&osb->osb_super_lockres);
2835 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2879 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2836 2881
2837 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2838 osb->cconn = NULL; 2883 osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3015{ 3060{
3016 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3017 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3018} 3064}
3019 3065
3020int ocfs2_drop_inode_locks(struct inode *inode) 3066int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 115 int ex);
116int ocfs2_rename_lock(struct ocfs2_super *osb); 116int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 117void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
119void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
118int ocfs2_dentry_lock(struct dentry *dentry, int ex); 120int ocfs2_dentry_lock(struct dentry *dentry, int ex);
119void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 121void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
120int ocfs2_file_lock(struct file *file, int ex, int trylock); 122int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..15713cbb865c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
31 31
32#include "ocfs2.h" 32#include "ocfs2.h"
33 33
34#include "alloc.h"
34#include "dir.h" 35#include "dir.h"
35#include "dlmglue.h" 36#include "dlmglue.h"
36#include "dcache.h" 37#include "dcache.h"
@@ -38,6 +39,7 @@
38#include "inode.h" 39#include "inode.h"
39 40
40#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42#include "suballoc.h"
41 43
42struct ocfs2_inode_handle 44struct ocfs2_inode_handle
43{ 45{
@@ -49,29 +51,98 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
49 struct ocfs2_inode_handle *handle) 51 struct ocfs2_inode_handle *handle)
50{ 52{
51 struct inode *inode; 53 struct inode *inode;
54 struct ocfs2_super *osb = OCFS2_SB(sb);
55 u64 blkno = handle->ih_blkno;
56 int status, set;
52 struct dentry *result; 57 struct dentry *result;
53 58
54 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
55 60
56 if (handle->ih_blkno == 0) { 61 if (blkno == 0) {
57 mlog_errno(-ESTALE); 62 mlog(0, "nfs wants inode with blkno: 0\n");
58 return ERR_PTR(-ESTALE); 63 result = ERR_PTR(-ESTALE);
64 goto bail;
65 }
66
67 inode = ocfs2_ilookup(sb, blkno);
68 /*
 69 * If the inode exists in memory, we only need to check its
70 * generation number
71 */
72 if (inode)
73 goto check_gen;
74
75 /*
76 * This will synchronize us against ocfs2_delete_inode() on
77 * all nodes
78 */
79 status = ocfs2_nfs_sync_lock(osb, 1);
80 if (status < 0) {
81 mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
82 goto check_err;
83 }
84
85 status = ocfs2_test_inode_bit(osb, blkno, &set);
86 if (status < 0) {
87 if (status == -EINVAL) {
88 /*
89 * The blkno NFS gave us doesn't even show up
 90 * as an inode, so we return -ESTALE to be
91 * nice
92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE;
95 } else {
96 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync;
99 }
100
101 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n",
104 (unsigned long long)blkno);
105 status = -ESTALE;
106 goto unlock_nfs_sync;
59 } 107 }
60 108
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); 109 inode = ocfs2_iget(osb, blkno, 0, 0);
62 110
63 if (IS_ERR(inode)) 111unlock_nfs_sync:
64 return (void *)inode; 112 ocfs2_nfs_sync_unlock(osb, 1);
65 113
114check_err:
115 if (status < 0) {
116 if (status == -ESTALE) {
117 mlog(0, "stale inode ino: %llu generation: %u\n",
118 (unsigned long long)blkno, handle->ih_generation);
119 }
120 result = ERR_PTR(status);
121 goto bail;
122 }
123
124 if (IS_ERR(inode)) {
125 mlog_errno(PTR_ERR(inode));
126 result = (void *)inode;
127 goto bail;
128 }
129
130check_gen:
66 if (handle->ih_generation != inode->i_generation) { 131 if (handle->ih_generation != inode->i_generation) {
67 iput(inode); 132 iput(inode);
68 return ERR_PTR(-ESTALE); 133 mlog(0, "stale inode ino: %llu generation: %u\n",
134 (unsigned long long)blkno, handle->ih_generation);
135 result = ERR_PTR(-ESTALE);
136 goto bail;
69 } 137 }
70 138
71 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
72 if (!IS_ERR(result)) 140 if (!IS_ERR(result))
73 result->d_op = &ocfs2_dentry_ops; 141 result->d_op = &ocfs2_dentry_ops;
142 else
143 mlog_errno(PTR_ERR(result));
74 144
145bail:
75 mlog_exit_ptr(result); 146 mlog_exit_ptr(result);
76 return result; 147 return result;
77} 148}
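[Editor's note] The reworked ocfs2_get_dentry() above validates an NFS file handle in two tiers: a lock-free in-memory lookup that only needs the generation check, and a slow path that takes the nfs_sync lock, consults the inode allocator bitmap, and only then reads the inode from disk. A compilable schematic of that decision flow, with stub helpers standing in for ocfs2_ilookup()/ocfs2_test_inode_bit()/ocfs2_iget() (the stubs and their return values are assumptions for the sketch):

#include <stddef.h>

struct inode { unsigned int i_generation; };

/* Stubs standing in for the real cache lookup, bitmap test and iget. */
static struct inode *cache_lookup(unsigned long long blkno) { (void)blkno; return NULL; }
static int alloc_bit_set(unsigned long long blkno) { (void)blkno; return 1; }
static struct inode *read_inode(unsigned long long blkno)
{ static struct inode i; (void)blkno; return &i; }

/* Returns the inode, or NULL for a stale handle. */
static struct inode *handle_to_inode(unsigned long long blkno, unsigned int gen)
{
    struct inode *inode = cache_lookup(blkno);

    if (!inode) {
        /* Slow path: serialize against deletes, then check the
         * allocator bit before touching the disk inode at all. */
        if (!alloc_bit_set(blkno))
            return NULL;            /* bit clear => handle is stale */
        inode = read_inode(blkno);
        if (!inode)
            return NULL;
    }

    /* Fast and slow paths both end with the generation check. */
    if (inode->i_generation != gen)
        return NULL;                /* inode number was recycled */
    return inode;
}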
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..c2a87c885b73 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1912,6 +1912,22 @@ out_sems:
1912 return written ? written : ret; 1912 return written ? written : ret;
1913} 1913}
1914 1914
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
1916 struct file *out,
1917 struct splice_desc *sd)
1918{
1919 int ret;
1920
1921 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
1922 sd->total_len, 0, NULL);
1923 if (ret < 0) {
1924 mlog_errno(ret);
1925 return ret;
1926 }
1927
1928 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
1929}
1930
1915static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1931static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1916 struct file *out, 1932 struct file *out,
1917 loff_t *ppos, 1933 loff_t *ppos,
@@ -1919,34 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1919 unsigned int flags) 1935 unsigned int flags)
1920{ 1936{
1921 int ret; 1937 int ret;
1922 struct inode *inode = out->f_path.dentry->d_inode; 1938 struct address_space *mapping = out->f_mapping;
1939 struct inode *inode = mapping->host;
1940 struct splice_desc sd = {
1941 .total_len = len,
1942 .flags = flags,
1943 .pos = *ppos,
1944 .u.file = out,
1945 };
1923 1946
1924 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1947 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1925 (unsigned int)len, 1948 (unsigned int)len,
1926 out->f_path.dentry->d_name.len, 1949 out->f_path.dentry->d_name.len,
1927 out->f_path.dentry->d_name.name); 1950 out->f_path.dentry->d_name.name);
1928 1951
1929 inode_double_lock(inode, pipe->inode); 1952 if (pipe->inode)
1953 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
1930 1954
1931 ret = ocfs2_rw_lock(inode, 1); 1955 splice_from_pipe_begin(&sd);
1932 if (ret < 0) { 1956 do {
1933 mlog_errno(ret); 1957 ret = splice_from_pipe_next(pipe, &sd);
1934 goto out; 1958 if (ret <= 0)
1935 } 1959 break;
1936 1960
1937 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1961 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1938 NULL); 1962 ret = ocfs2_rw_lock(inode, 1);
1939 if (ret < 0) { 1963 if (ret < 0)
1940 mlog_errno(ret); 1964 mlog_errno(ret);
1941 goto out_unlock; 1965 else {
1942 } 1966 ret = ocfs2_splice_to_file(pipe, out, &sd);
1967 ocfs2_rw_unlock(inode, 1);
1968 }
1969 mutex_unlock(&inode->i_mutex);
1970 } while (ret > 0);
1971 splice_from_pipe_end(pipe, &sd);
1943 1972
1944 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1973 if (pipe->inode)
1974 mutex_unlock(&pipe->inode->i_mutex);
1945 1975
1946out_unlock: 1976 if (sd.num_spliced)
1947 ocfs2_rw_unlock(inode, 1); 1977 ret = sd.num_spliced;
1948out: 1978
1949 inode_double_unlock(inode, pipe->inode); 1979 if (ret > 0) {
1980 unsigned long nr_pages;
1981
1982 *ppos += ret;
1983 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1984
1985 /*
1986 * If file or inode is SYNC and we actually wrote some data,
1987 * sync it.
1988 */
1989 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1990 int err;
1991
1992 mutex_lock(&inode->i_mutex);
1993 err = ocfs2_rw_lock(inode, 1);
1994 if (err < 0) {
1995 mlog_errno(err);
1996 } else {
1997 err = generic_osync_inode(inode, mapping,
1998 OSYNC_METADATA|OSYNC_DATA);
1999 ocfs2_rw_unlock(inode, 1);
2000 }
2001 mutex_unlock(&inode->i_mutex);
2002
2003 if (err)
2004 ret = err;
2005 }
2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2007 }
1950 2008
1951 mlog_exit(ret); 2009 mlog_exit(ret);
1952 return ret; 2010 return ret;
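[Editor's note] After the splice loop, the new code advances *ppos and converts the byte count to a page count with a standard round-up shift, nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, before throttling via balance_dirty_pages_ratelimited_nr(). A quick worked check of that arithmetic, assuming the usual 4096-byte page:

#include <assert.h>

#define PAGE_CACHE_SHIFT 12                     /* 4096-byte pages assumed */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

static unsigned long bytes_to_pages(unsigned long bytes)
{
    return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
    assert(bytes_to_pages(0)    == 0);
    assert(bytes_to_pages(1)    == 1);  /* a partial page still dirties one */
    assert(bytes_to_pages(4096) == 1);
    assert(bytes_to_pages(5000) == 2);  /* 4096 + 904 spans two pages */
    return 0;
}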
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
112 oi->ip_attr |= OCFS2_DIRSYNC_FL; 113 oi->ip_attr |= OCFS2_DIRSYNC_FL;
113} 114}
114 115
116struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
117{
118 struct ocfs2_find_inode_args args;
119
120 args.fi_blkno = blkno;
121 args.fi_flags = 0;
122 args.fi_ino = ino_from_blkno(sb, blkno);
123 args.fi_sysfile_type = 0;
124
125 return ilookup5(sb, blkno, ocfs2_find_actor, &args);
126}
115struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
116 int sysfile_type) 128 int sysfile_type)
117{ 129{
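[Editor's note] ocfs2_ilookup() wraps ilookup5(), which searches the inode cache by a hash value plus a caller-supplied match callback, so a block number can be checked against in-memory inodes without any disk I/O. The sketch below shows that "hash plus test callback" lookup shape over a plain array; ilookup5() itself is the real VFS API, while everything in this snippet is illustrative.

#include <stddef.h>

struct entry {
    unsigned long key;
    void *payload;
};

/* Mirrors ilookup5()'s shape: the hash narrows the candidates and a
 * caller-supplied test() confirms the exact match against opaque data. */
static struct entry *lookup5(struct entry *tbl, size_t n, unsigned long hash,
                             int (*test)(struct entry *, void *), void *data)
{
    for (size_t i = 0; i < n; i++)
        if (tbl[i].key == hash && test(&tbl[i], data))
            return &tbl[i];
    return NULL;   /* not cached: the caller decides whether to do I/O */
}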
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
275 (unsigned long long)OCFS2_I(inode)->ip_blkno, 287 (unsigned long long)OCFS2_I(inode)->ip_blkno,
276 (unsigned long long)le64_to_cpu(fe->i_blkno)); 288 (unsigned long long)le64_to_cpu(fe->i_blkno));
277 289
278 inode->i_nlink = le16_to_cpu(fe->i_links_count); 290 inode->i_nlink = ocfs2_read_links_count(fe);
279 291
280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 292 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
351 363
352 ocfs2_set_inode_flags(inode); 364 ocfs2_set_inode_flags(inode);
353 365
366 OCFS2_I(inode)->ip_last_used_slot = 0;
367 OCFS2_I(inode)->ip_last_used_group = 0;
354 mlog_exit_void(); 368 mlog_exit_void();
355} 369}
356 370
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 620 }
607 621
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 622 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 623 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 624 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 625 status = PTR_ERR(handle);
612 mlog_errno(status); 626 mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 754 goto bail_unlock_dir;
741 } 755 }
742 756
757 /* Remove any dir index tree */
758 if (S_ISDIR(inode->i_mode)) {
759 status = ocfs2_dx_dir_truncate(inode, di_bh);
760 if (status) {
761 mlog_errno(status);
762 goto bail_unlock_dir;
763 }
764 }
765
743 /*Free extended attribute resources associated with this inode.*/ 766 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 767 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 768 if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
949 goto bail; 972 goto bail;
950 } 973 }
951 974
975 /*
976 * Synchronize us against ocfs2_get_dentry. We take this in
977 * shared mode so that all nodes can still concurrently
978 * process deletes.
979 */
980 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
981 if (status < 0) {
982 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
983 ocfs2_cleanup_delete_inode(inode, 0);
984 goto bail_unblock;
985 }
952 /* Lock down the inode. This gives us an up to date view of 986 /* Lock down the inode. This gives us an up to date view of
953 * it's metadata (for verification), and allows us to 987 * it's metadata (for verification), and allows us to
954 * serialize delete_inode on multiple nodes. 988 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
962 if (status != -ENOENT) 996 if (status != -ENOENT)
963 mlog_errno(status); 997 mlog_errno(status);
964 ocfs2_cleanup_delete_inode(inode, 0); 998 ocfs2_cleanup_delete_inode(inode, 0);
965 goto bail_unblock; 999 goto bail_unlock_nfs_sync;
966 } 1000 }
967 1001
968 /* Query the cluster. This will be the final decision made 1002 /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
1005bail_unlock_inode: 1039bail_unlock_inode:
1006 ocfs2_inode_unlock(inode, 1); 1040 ocfs2_inode_unlock(inode, 1);
1007 brelse(di_bh); 1041 brelse(di_bh);
1042
1043bail_unlock_nfs_sync:
1044 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1045
1008bail_unblock: 1046bail_unblock:
1009 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1047 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
1010 if (status < 0) 1048 if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1205 spin_unlock(&OCFS2_I(inode)->ip_lock); 1243 spin_unlock(&OCFS2_I(inode)->ip_lock);
1206 1244
1207 fe->i_size = cpu_to_le64(i_size_read(inode)); 1245 fe->i_size = cpu_to_le64(i_size_read(inode));
1208 fe->i_links_count = cpu_to_le16(inode->i_nlink); 1246 ocfs2_set_links_count(fe, inode->i_nlink);
1209 fe->i_uid = cpu_to_le32(inode->i_uid); 1247 fe->i_uid = cpu_to_le32(inode->i_uid);
1210 fe->i_gid = cpu_to_le32(inode->i_gid); 1248 fe->i_gid = cpu_to_le32(inode->i_gid);
1211 fe->i_mode = cpu_to_le16(inode->i_mode); 1249 fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1242 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1280 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1243 ocfs2_set_inode_flags(inode); 1281 ocfs2_set_inode_flags(inode);
1244 i_size_write(inode, le64_to_cpu(fe->i_size)); 1282 i_size_write(inode, le64_to_cpu(fe->i_size));
1245 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1283 inode->i_nlink = ocfs2_read_links_count(fe);
1246 inode->i_uid = le32_to_cpu(fe->i_uid); 1284 inode->i_uid = le32_to_cpu(fe->i_uid);
1247 inode->i_gid = le32_to_cpu(fe->i_gid); 1285 inode->i_gid = le32_to_cpu(fe->i_gid);
1248 inode->i_mode = le16_to_cpu(fe->i_mode); 1286 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
72 72
73 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode; 74 struct jbd2_inode ip_jinode;
75
76 /* Only valid if the inode is the dir. */
77 u32 ip_last_used_slot;
78 u64 ip_last_used_group;
75}; 79};
76 80
77/* 81/*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
124/* Flags for ocfs2_iget() */ 128/* Flags for ocfs2_iget() */
125#define OCFS2_FI_FLAG_SYSFILE 0x1 129#define OCFS2_FI_FLAG_SYSFILE 0x1
126#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 130#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
131struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 132struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 133 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 134int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
65static int ocfs2_recover_orphans(struct ocfs2_super *osb, 65static int ocfs2_recover_orphans(struct ocfs2_super *osb,
66 int slot); 66 int slot);
67static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
68static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
69 int slot_num,
70 struct ocfs2_dinode *la_dinode,
71 struct ocfs2_dinode *tl_dinode,
72 struct ocfs2_quota_recovery *qrec);
68 73
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 74static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{ 75{
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
76 return __ocfs2_wait_on_mount(osb, 1); 81 return __ocfs2_wait_on_mount(osb, 1);
77} 82}
78 83
79
80
81/* 84/*
 82 * The recovery_list is a simple linked list of node numbers to recover. 85 * This replay_map is to track online/offline slots, so we can recover
83 * It is protected by the recovery_lock. 86 * offline slots during recovery and mount
84 */ 87 */
85 88
86struct ocfs2_recovery_map { 89enum ocfs2_replay_state {
87 unsigned int rm_used; 90 REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */
88 unsigned int *rm_entries; 91 REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */
92 REPLAY_DONE /* Replay was already queued */
89}; 93};
90 94
95struct ocfs2_replay_map {
96 unsigned int rm_slots;
97 enum ocfs2_replay_state rm_state;
98 unsigned char rm_replay_slots[0];
99};
100
101void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
102{
103 if (!osb->replay_map)
104 return;
105
106 /* If we've already queued the replay, we don't have any more to do */
107 if (osb->replay_map->rm_state == REPLAY_DONE)
108 return;
109
110 osb->replay_map->rm_state = state;
111}
112
113int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
114{
115 struct ocfs2_replay_map *replay_map;
116 int i, node_num;
117
118 /* If replay map is already set, we don't do it again */
119 if (osb->replay_map)
120 return 0;
121
122 replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
123 (osb->max_slots * sizeof(char)), GFP_KERNEL);
124
125 if (!replay_map) {
126 mlog_errno(-ENOMEM);
127 return -ENOMEM;
128 }
129
130 spin_lock(&osb->osb_lock);
131
132 replay_map->rm_slots = osb->max_slots;
133 replay_map->rm_state = REPLAY_UNNEEDED;
134
135 /* set rm_replay_slots for offline slot(s) */
136 for (i = 0; i < replay_map->rm_slots; i++) {
137 if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
138 replay_map->rm_replay_slots[i] = 1;
139 }
140
141 osb->replay_map = replay_map;
142 spin_unlock(&osb->osb_lock);
143 return 0;
144}
145
146void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
147{
148 struct ocfs2_replay_map *replay_map = osb->replay_map;
149 int i;
150
151 if (!replay_map)
152 return;
153
154 if (replay_map->rm_state != REPLAY_NEEDED)
155 return;
156
157 for (i = 0; i < replay_map->rm_slots; i++)
158 if (replay_map->rm_replay_slots[i])
159 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
160 NULL, NULL);
161 replay_map->rm_state = REPLAY_DONE;
162}
163
164void ocfs2_free_replay_slots(struct ocfs2_super *osb)
165{
166 struct ocfs2_replay_map *replay_map = osb->replay_map;
167
168 if (!osb->replay_map)
169 return;
170
171 kfree(replay_map);
172 osb->replay_map = NULL;
173}
174
91int ocfs2_recovery_init(struct ocfs2_super *osb) 175int ocfs2_recovery_init(struct ocfs2_super *osb)
92{ 176{
93 struct ocfs2_recovery_map *rm; 177 struct ocfs2_recovery_map *rm;
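[Editor's note] struct ocfs2_replay_map above ends in rm_replay_slots[0], an old-style flexible array member, and is allocated in one shot as kzalloc(sizeof(struct ocfs2_replay_map) + max_slots * sizeof(char)). The same allocation pattern in standalone C follows; C99 spells the member rm_replay_slots[] rather than [0].

#include <stdlib.h>

struct replay_map {
    unsigned int rm_slots;
    unsigned char rm_replay_slots[];   /* flexible array member */
};

static struct replay_map *replay_map_alloc(unsigned int max_slots)
{
    struct replay_map *map;

    /* Header and per-slot flags live in one zeroed allocation. */
    map = calloc(1, sizeof(*map) + max_slots * sizeof(unsigned char));
    if (!map)
        return NULL;
    map->rm_slots = max_slots;
    return map;
}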
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
496 }, 580 },
497}; 581};
498 582
583static struct ocfs2_triggers dr_triggers = {
584 .ot_triggers = {
585 .t_commit = ocfs2_commit_trigger,
586 .t_abort = ocfs2_abort_trigger,
587 },
588 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
589};
590
591static struct ocfs2_triggers dl_triggers = {
592 .ot_triggers = {
593 .t_commit = ocfs2_commit_trigger,
594 .t_abort = ocfs2_abort_trigger,
595 },
596 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
597};
598
499static int __ocfs2_journal_access(handle_t *handle, 599static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode, 600 struct inode *inode,
501 struct buffer_head *bh, 601 struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
600 type); 700 type);
601} 701}
602 702
703int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
704 struct buffer_head *bh, int type)
705{
706 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
707 type);
708}
709
710int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
711 struct buffer_head *bh, int type)
712{
713 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
714 type);
715}
716
603int ocfs2_journal_access(handle_t *handle, struct inode *inode, 717int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type) 718 struct buffer_head *bh, int type)
605{ 719{
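[Editor's note] The new dr_triggers/dl_triggers follow the existing trigger pattern: each one records, via offsetof(), where the checksum field sits inside its on-disk structure, so one generic commit callback can recompute the check for any block type. offsetof() behaves as shown below; the struct here is a stand-in, not the real ocfs2_dx_root_block layout.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct block_check { uint32_t crc32; uint16_t ecc; uint16_t pad; };

/* Stand-in for an on-disk block with an embedded check field. */
struct dx_root_block {
    uint64_t signature;
    struct block_check dr_check;   /* checksum lives at a fixed offset */
    uint64_t payload[4];
};

int main(void)
{
    /* A generic trigger only needs the byte offset of the check field. */
    size_t off = offsetof(struct dx_root_block, dr_check);
    printf("dr_check at byte offset %zu\n", off);
    return 0;
}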
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1176} 1290}
1177 1291
1178/* Called by the mount code to queue recovery the last part of 1292/* Called by the mount code to queue recovery the last part of
1179 * recovery for it's own slot. */ 1293 * recovery for its own and offline slot(s). */
1180void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 1294void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1181{ 1295{
1182 struct ocfs2_journal *journal = osb->journal; 1296 struct ocfs2_journal *journal = osb->journal;
1183 1297
1184 if (osb->dirty) { 1298 /* No need to queue up our truncate_log as regular cleanup will catch
1185 /* No need to queue up our truncate_log as regular 1299 * that */
1186 * cleanup will catch that. */ 1300 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1187 ocfs2_queue_recovery_completion(journal, 1301 osb->local_alloc_copy, NULL, NULL);
1188 osb->slot_num, 1302 ocfs2_schedule_truncate_log_flush(osb, 0);
1189 osb->local_alloc_copy,
1190 NULL,
1191 NULL);
1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1193 1303
1194 osb->local_alloc_copy = NULL; 1304 osb->local_alloc_copy = NULL;
1195 osb->dirty = 0; 1305 osb->dirty = 0;
1196 } 1306
1307 /* queue to recover orphan slots for all offline slots */
1308 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1309 ocfs2_queue_replay_slots(osb);
1310 ocfs2_free_replay_slots(osb);
1197} 1311}
1198 1312
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) 1313void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
1236 goto bail; 1350 goto bail;
1237 } 1351 }
1238 1352
1353 status = ocfs2_compute_replay_slots(osb);
1354 if (status < 0)
1355 mlog_errno(status);
1356
1357 /* queue recovery for our own slot */
1358 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1359 NULL, NULL);
1360
1239 spin_lock(&osb->osb_lock); 1361 spin_lock(&osb->osb_lock);
1240 while (rm->rm_used) { 1362 while (rm->rm_used) {
1241 /* It's always safe to remove entry zero, as we won't 1363 /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
1301 1423
1302 ocfs2_super_unlock(osb, 1); 1424 ocfs2_super_unlock(osb, 1);
1303 1425
1304 /* We always run recovery on our own orphan dir - the dead 1426 /* queue recovery for offline slots */
1305 * node(s) may have disallowd a previos inode delete. Re-processing 1427 ocfs2_queue_replay_slots(osb);
1306 * is therefore required. */
1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1308 NULL, NULL);
1309 1428
1310bail: 1429bail:
1311 mutex_lock(&osb->recovery_lock); 1430 mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
1314 goto restart; 1433 goto restart;
1315 } 1434 }
1316 1435
1436 ocfs2_free_replay_slots(osb);
1317 osb->recovery_thread_task = NULL; 1437 osb->recovery_thread_task = NULL;
1318 mb(); /* sync with ocfs2_recovery_thread_running */ 1438 mb(); /* sync with ocfs2_recovery_thread_running */
1319 wake_up(&osb->recovery_event); 1439 wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1465 goto done; 1585 goto done;
1466 } 1586 }
1467 1587
1588 /* we need to run complete recovery for offline orphan slots */
1589 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1590
1468 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1591 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1469 node_num, slot_num, 1592 node_num, slot_num,
1470 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1593 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..eb7b76331eb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
38struct ocfs2_super; 38struct ocfs2_super;
39struct ocfs2_dinode; 39struct ocfs2_dinode;
40 40
41/*
42 * The recovery_list is a simple linked list of node numbers to recover.
43 * It is protected by the recovery_lock.
44 */
45
46struct ocfs2_recovery_map {
47 unsigned int rm_used;
48 unsigned int *rm_entries;
49};
50
51
41struct ocfs2_journal { 52struct ocfs2_journal {
42 enum ocfs2_journal_state j_state; /* Journals current state */ 53 enum ocfs2_journal_state j_state; /* Journals current state */
43 54
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
139int ocfs2_recovery_init(struct ocfs2_super *osb); 150int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb); 151void ocfs2_recovery_exit(struct ocfs2_super *osb);
141 152
153int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
142/* 154/*
143 * Journal Control: 155 * Journal Control:
144 * Initialize, Load, Shutdown, Wipe a journal. 156 * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
266/* dirblock */ 278/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
281/* ocfs2_dx_root_block */
282int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
283 struct buffer_head *bh, int type);
284/* ocfs2_dx_leaf */
285int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
286 struct buffer_head *bh, int type);
269/* Anything that has no ecc */ 287/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
368} 386}
369 387
370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 388/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
371 * bitmap block for the new bit) */ 389 * bitmap block for the new bit) dx_root update for free list */
372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 390#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
391
392static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
393{
 394 /* 1 block for index, 2 allocs (data, metadata), 1 cluster's
395 * worth of blocks for initial extent. */
396 return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
397 ocfs2_clusters_to_blocks(sb, 1);
398}
373 399
374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 400/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
375 * group descriptor + mkdir/symlink blocks + quota update */ 401 * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
376static inline int ocfs2_mknod_credits(struct super_block *sb) 402 * blocks + quota update */
403static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
404 int xattr_credits)
377{ 405{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + 406 int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
407
408 if (is_dir)
409 dir_credits += ocfs2_add_dir_index_credits(sb);
410
411 return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
379 ocfs2_quota_trans_credits(sb); 412 ocfs2_quota_trans_credits(sb);
380} 413}
381 414
@@ -388,31 +421,32 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 421#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
389 422
390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 423/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
391 * update on dir */ 424 * update on dir + index leaf + dx root update for free list */
392static inline int ocfs2_link_credits(struct super_block *sb) 425static inline int ocfs2_link_credits(struct super_block *sb)
393{ 426{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + 427 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
395 ocfs2_quota_trans_credits(sb); 428 ocfs2_quota_trans_credits(sb);
396} 429}
397 430
398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 431/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
399 * dir inode link */ 432 * dir inode link + dir inode index leaf + dir index root */
400static inline int ocfs2_unlink_credits(struct super_block *sb) 433static inline int ocfs2_unlink_credits(struct super_block *sb)
401{ 434{
402 /* The quota update from ocfs2_link_credits is unused here... */ 435 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); 436 return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
404} 437}
405 438
406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
407 * inode alloc group descriptor */ 440 * inode alloc group descriptor + orphan dir index root +
408#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) 441 * orphan dir index leaf */
442#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
409 443
410/* dinode update, old dir dinode update, new dir dinode update, old 444/* dinode update, old dir dinode update, new dir dinode update, old
411 * dir dir entry, new dir dir entry, dir entry update for renaming 445 * dir dir entry, new dir dir entry, dir entry update for renaming
412 * directory + target unlink */ 446 * directory + target unlink + 3 x dir index leaves */
413static inline int ocfs2_rename_credits(struct super_block *sb) 447static inline int ocfs2_rename_credits(struct super_block *sb)
414{ 448{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); 449 return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
416} 450}
417 451
418/* global bitmap dinode, group desc., relinked group, 452/* global bitmap dinode, group desc., relinked group,
@@ -422,6 +456,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
422 + OCFS2_INODE_UPDATE_CREDITS \ 456 + OCFS2_INODE_UPDATE_CREDITS \
423 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) 457 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
424 458
459/* inode update, removal of dx root block from allocator */
460#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
461 OCFS2_SUBALLOC_FREE)
462
463static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
464{
465 int credits = 1 + OCFS2_SUBALLOC_ALLOC;
466
467 credits += ocfs2_clusters_to_blocks(sb, 1);
468 credits += ocfs2_quota_trans_credits(sb);
469
470 return credits;
471}
472
425/* 473/*
426 * Please note that the caller must make sure that root_el is the root 474 * Please note that the caller must make sure that root_el is the root
427 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 475 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +505,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
457 505
458static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 506static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
459{ 507{
460 int blocks = ocfs2_mknod_credits(sb); 508 int blocks = ocfs2_mknod_credits(sb, 0, 0);
461 509
462 /* links can be longer than one block so we may update many 510 /* links can be longer than one block so we may update many
463 * within our single allocated extent. */ 511 * within our single allocated extent. */
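[Editor's note] The credit formulas above are plain block counts: ocfs2_mknod_credits() grows from a flat 3 + ... to 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + quota credits, with directories additionally paying ocfs2_add_dir_index_credits() for the index block, its two suballocator allocations, and a cluster's worth of extent blocks. A small standalone mirror of that arithmetic, with every constant stubbed to an illustrative value (the real ones come from the ocfs2 headers and the super block geometry):

#include <stdio.h>

/* Illustrative stand-ins only; do not read these as the real values. */
#define SUBALLOC_ALLOC        3
#define DIR_LINK_ADDITIONAL   (1 + 2 + 1)  /* data + bitmap fe/bit + dx root */
#define QUOTA_CREDITS         0            /* quotas off in this example */
#define CLUSTER_BLOCKS        8            /* blocks per cluster, assumed */

static int add_dir_index_credits(void)
{
    /* 1 index block + 2 suballocator allocations + 1 cluster of blocks */
    return 1 + 2 * SUBALLOC_ALLOC + CLUSTER_BLOCKS;
}

static int mknod_credits(int is_dir, int xattr_credits)
{
    int dir_credits = DIR_LINK_ADDITIONAL;

    if (is_dir)
        dir_credits += add_dir_index_credits();

    return 4 + SUBALLOC_ALLOC + dir_credits + xattr_credits + QUOTA_CREDITS;
}

int main(void)
{
    printf("file: %d credits, dir: %d credits\n",
           mknod_credits(0, 0), mknod_credits(1, 0));
    return 0;
}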
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
32 31
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78#ifdef CONFIG_OCFS2_FS_STATS
79
80static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
81{
82 file->private_data = inode->i_private;
83 return 0;
84}
85
86#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
87#define LA_DEBUG_VER 1
88static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
89 size_t count, loff_t *ppos)
90{
91 static DEFINE_MUTEX(la_debug_mutex);
92 struct ocfs2_super *osb = file->private_data;
93 int written, ret;
94 char *buf = osb->local_alloc_debug_buf;
95
96 mutex_lock(&la_debug_mutex);
97 memset(buf, 0, LA_DEBUG_BUF_SZ);
98
99 written = snprintf(buf, LA_DEBUG_BUF_SZ,
100 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
101 LA_DEBUG_VER,
102 (unsigned long long)osb->la_last_gd,
103 osb->local_alloc_default_bits,
104 osb->local_alloc_bits, osb->local_alloc_state);
105
106 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
107
108 mutex_unlock(&la_debug_mutex);
109 return ret;
110}
111
112static const struct file_operations ocfs2_la_debug_fops = {
113 .open = ocfs2_la_debug_open,
114 .read = ocfs2_la_debug_read,
115};
116
117static void ocfs2_init_la_debug(struct ocfs2_super *osb)
118{
119 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
120 if (!osb->local_alloc_debug_buf)
121 return;
122
123 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
124 S_IFREG|S_IRUSR,
125 osb->osb_debug_root,
126 osb,
127 &ocfs2_la_debug_fops);
128 if (!osb->local_alloc_debug) {
129 kfree(osb->local_alloc_debug_buf);
130 osb->local_alloc_debug_buf = NULL;
131 }
132}
133
134static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
135{
136 if (osb->local_alloc_debug)
137 debugfs_remove(osb->local_alloc_debug);
138
139 if (osb->local_alloc_debug_buf)
140 kfree(osb->local_alloc_debug_buf);
141
142 osb->local_alloc_debug_buf = NULL;
143 osb->local_alloc_debug = NULL;
144}
145#else /* CONFIG_OCFS2_FS_STATS */
146static void ocfs2_init_la_debug(struct ocfs2_super *osb)
147{
148 return;
149}
150static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
151{
152 return;
153}
154#endif
155
156static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
157{ 78{
158 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
226 147
227 mlog_entry_void(); 148 mlog_entry_void();
228 149
229 ocfs2_init_la_debug(osb);
230
231 if (osb->local_alloc_bits == 0) 150 if (osb->local_alloc_bits == 0)
232 goto bail; 151 goto bail;
233 152
@@ -299,9 +218,6 @@ bail:
299 if (inode) 218 if (inode)
300 iput(inode); 219 iput(inode);
301 220
302 if (status < 0)
303 ocfs2_shutdown_la_debug(osb);
304
305 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 221 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
306 222
307 mlog_exit(status); 223 mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
331 cancel_delayed_work(&osb->la_enable_wq); 247 cancel_delayed_work(&osb->la_enable_wq);
332 flush_workqueue(ocfs2_wq); 248 flush_workqueue(ocfs2_wq);
333 249
334 ocfs2_shutdown_la_debug(osb);
335
336 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 250 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
337 goto out; 251 goto out;
338 252
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
154 return ret; 154 return ret;
155} 155}
156 156
157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158{ 158{
159 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 161 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 162 sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
197 if (ret2 < 0) 198 if (ret2 < 0)
198 mlog_errno(ret2); 199 mlog_errno(ret2);
199 200 if (ret)
201 ret = VM_FAULT_SIGBUS;
200 return ret; 202 return ret;
201} 203}
202 204
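[Editor's note] The mmap.c hunk tracks the VFS change of ->page_mkwrite() from taking a bare struct page to taking a struct vm_fault, and it now translates any internal error into VM_FAULT_SIGBUS instead of leaking a raw negative errno to the fault handler. The translation step, roughly, with the fault code mocked so the snippet builds standalone:

/* Mocked fault code; in the kernel this comes from <linux/mm.h>. */
#define VM_FAULT_SIGBUS 0x0002

/* Convert an internal 0/-errno result into a fault-handler return:
 * 0 means "page is now writable", anything else becomes SIGBUS. */
static int mkwrite_result(int err)
{
    return err ? VM_FAULT_SIGBUS : 0;
}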
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..33464c6b60a2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 80 struct inode **ret_orphan_dir,
81 struct inode *inode, 81 struct inode *inode,
82 char *name, 82 char *name,
83 struct buffer_head **de_bh); 83 struct ocfs2_dir_lookup_result *lookup);
84 84
85static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
86 handle_t *handle, 86 handle_t *handle,
87 struct inode *inode, 87 struct inode *inode,
88 struct ocfs2_dinode *fe, 88 struct ocfs2_dinode *fe,
89 char *name, 89 char *name,
90 struct buffer_head *de_bh, 90 struct ocfs2_dir_lookup_result *lookup,
91 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode);
92 92
93static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 93static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
228 struct ocfs2_super *osb; 228 struct ocfs2_super *osb;
229 struct ocfs2_dinode *dirfe; 229 struct ocfs2_dinode *dirfe;
230 struct buffer_head *new_fe_bh = NULL; 230 struct buffer_head *new_fe_bh = NULL;
231 struct buffer_head *de_bh = NULL;
232 struct inode *inode = NULL; 231 struct inode *inode = NULL;
233 struct ocfs2_alloc_context *inode_ac = NULL; 232 struct ocfs2_alloc_context *inode_ac = NULL;
234 struct ocfs2_alloc_context *data_ac = NULL; 233 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL; 234 struct ocfs2_alloc_context *meta_ac = NULL;
236 int want_clusters = 0; 235 int want_clusters = 0;
236 int want_meta = 0;
237 int xattr_credits = 0; 237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = { 238 struct ocfs2_security_xattr_info si = {
239 .enable = 1, 239 .enable = 1,
240 }; 240 };
241 int did_quota_inode = 0; 241 int did_quota_inode = 0;
242 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 243
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 244 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 245 (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
254 return status; 255 return status;
255 } 256 }
256 257
257 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { 258 if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
258 status = -EMLINK; 259 status = -EMLINK;
259 goto leave; 260 goto leave;
260 } 261 }
261 262
262 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 263 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
263 if (!dirfe->i_links_count) { 264 if (!ocfs2_read_links_count(dirfe)) {
264 /* can't make a file in a deleted directory. */ 265 /* can't make a file in a deleted directory. */
265 status = -ENOENT; 266 status = -ENOENT;
266 goto leave; 267 goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
274 /* get a spot inside the dir. */ 275 /* get a spot inside the dir. */
275 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 276 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
276 dentry->d_name.name, 277 dentry->d_name.name,
277 dentry->d_name.len, &de_bh); 278 dentry->d_name.len, &lookup);
278 if (status < 0) { 279 if (status < 0) {
279 mlog_errno(status); 280 mlog_errno(status);
280 goto leave; 281 goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
308 309
309 /* calculate meta data/clusters for setting security and acl xattr */ 310 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, 311 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters, 312 &si, &want_clusters,
312 &xattr_credits, &xattr_ac); 313 &xattr_credits, &want_meta);
313 if (status < 0) { 314 if (status < 0) {
314 mlog_errno(status); 315 mlog_errno(status);
315 goto leave; 316 goto leave;
316 } 317 }
317 318
318 /* Reserve a cluster if creating an extent based directory. */ 319 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) 320 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
320 want_clusters += 1; 321 want_clusters += 1;
321 322
323 /* Dir indexing requires extra space as well */
324 if (ocfs2_supports_indexed_dirs(osb))
325 want_meta++;
326 }
327
328 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
329 if (status < 0) {
330 if (status != -ENOSPC)
331 mlog_errno(status);
332 goto leave;
333 }
334
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); 335 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) { 336 if (status < 0) {
324 if (status != -ENOSPC) 337 if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
326 goto leave; 339 goto leave;
327 } 340 }
328 341
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + 342 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
330 xattr_credits); 343 S_ISDIR(mode),
344 xattr_credits));
331 if (IS_ERR(handle)) { 345 if (IS_ERR(handle)) {
332 status = PTR_ERR(handle); 346 status = PTR_ERR(handle);
333 handle = NULL; 347 handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
355 369
356 if (S_ISDIR(mode)) { 370 if (S_ISDIR(mode)) {
357 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 371 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
358 new_fe_bh, data_ac); 372 new_fe_bh, data_ac, meta_ac);
359 if (status < 0) { 373 if (status < 0) {
360 mlog_errno(status); 374 mlog_errno(status);
361 goto leave; 375 goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
367 mlog_errno(status); 381 mlog_errno(status);
368 goto leave; 382 goto leave;
369 } 383 }
370 le16_add_cpu(&dirfe->i_links_count, 1); 384 ocfs2_add_links_count(dirfe, 1);
371 status = ocfs2_journal_dirty(handle, parent_fe_bh); 385 status = ocfs2_journal_dirty(handle, parent_fe_bh);
372 if (status < 0) { 386 if (status < 0) {
373 mlog_errno(status); 387 mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
377 } 391 }
378 392
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, 393 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac); 394 meta_ac, data_ac);
381 if (status < 0) { 395 if (status < 0) {
382 mlog_errno(status); 396 mlog_errno(status);
383 goto leave; 397 goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
385 399
386 if (si.enable) { 400 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, 401 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac); 402 meta_ac, data_ac);
389 if (status < 0) { 403 if (status < 0) {
390 mlog_errno(status); 404 mlog_errno(status);
391 goto leave; 405 goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
394 408
395 status = ocfs2_add_entry(handle, dentry, inode, 409 status = ocfs2_add_entry(handle, dentry, inode,
396 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 410 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
397 de_bh); 411 &lookup);
398 if (status < 0) { 412 if (status < 0) {
399 mlog_errno(status); 413 mlog_errno(status);
400 goto leave; 414 goto leave;
@@ -423,11 +437,12 @@ leave:
423 mlog(0, "Disk is full\n"); 437 mlog(0, "Disk is full\n");
424 438
425 brelse(new_fe_bh); 439 brelse(new_fe_bh);
426 brelse(de_bh);
427 brelse(parent_fe_bh); 440 brelse(parent_fe_bh);
428 kfree(si.name); 441 kfree(si.name);
429 kfree(si.value); 442 kfree(si.value);
430 443
444 ocfs2_free_dir_lookup_result(&lookup);
445
431 if ((status < 0) && inode) { 446 if ((status < 0) && inode) {
432 clear_nlink(inode); 447 clear_nlink(inode);
433 iput(inode); 448 iput(inode);
@@ -439,8 +454,8 @@ leave:
439 if (data_ac) 454 if (data_ac)
440 ocfs2_free_alloc_context(data_ac); 455 ocfs2_free_alloc_context(data_ac);
441 456
442 if (xattr_ac) 457 if (meta_ac)
443 ocfs2_free_alloc_context(xattr_ac); 458 ocfs2_free_alloc_context(meta_ac);
444 459
445 mlog_exit(status); 460 mlog_exit(status);
446 461
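
[Annotation] The reworked ocfs2_mknod() above now sizes both reservations up front — want_meta for suballocator metadata (plus one extra block for the dx root when directory indexing is enabled) and want_clusters for data — and only then opens the transaction. A condensed sketch of that ordering, using only helpers visible in this diff and with error handling elided (not the literal kernel code):

	want_meta = 0;
	want_clusters = 0;

	/* xattr setup may need both clusters and metadata blocks */
	ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, &si,
			      &want_clusters, &xattr_credits, &want_meta);

	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
		want_clusters += 1;		/* first dir data cluster */
		if (ocfs2_supports_indexed_dirs(osb))
			want_meta++;		/* block for the dir index */
	}

	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
	if (status >= 0)
		status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
	/* only then: ocfs2_start_trans(osb, ocfs2_mknod_credits(...)) */

Reserving everything before ocfs2_start_trans() keeps -ENOSPC failures cheap: nothing has been journaled yet, so the error path only frees the alloc contexts.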
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
462 struct ocfs2_extent_list *fel; 477 struct ocfs2_extent_list *fel;
463 u64 fe_blkno = 0; 478 u64 fe_blkno = 0;
464 u16 suballoc_bit; 479 u16 suballoc_bit;
480 u16 feat;
465 481
466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
467 inode->i_mode, (unsigned long)dev, dentry->d_name.len, 483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
469 485
470 *new_fe_bh = NULL; 486 *new_fe_bh = NULL;
471 487
472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
473 &fe_blkno); 489 inode_ac, &suballoc_bit, &fe_blkno);
474 if (status < 0) { 490 if (status < 0) {
475 mlog_errno(status); 491 mlog_errno(status);
476 goto leave; 492 goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
513 fe->i_mode = cpu_to_le16(inode->i_mode); 529 fe->i_mode = cpu_to_le16(inode->i_mode);
514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 530 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 531 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
516 fe->i_links_count = cpu_to_le16(inode->i_nlink); 532
533 ocfs2_set_links_count(fe, inode->i_nlink);
517 534
518 fe->i_last_eb_blk = 0; 535 fe->i_last_eb_blk = 0;
519 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 536 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
525 fe->i_dtime = 0; 542 fe->i_dtime = 0;
526 543
527 /* 544 /*
528 * If supported, directories start with inline data. 545 * If supported, directories start with inline data. If inline
546 * isn't supported, but indexing is, we start them as indexed.
529 */ 547 */
548 feat = le16_to_cpu(fe->i_dyn_features);
530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { 549 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
531 u16 feat = le16_to_cpu(fe->i_dyn_features);
532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 550 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 551
535 fe->id2.i_data.id_count = cpu_to_le16( 552 fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
608 int err; 625 int err;
609 struct buffer_head *fe_bh = NULL; 626 struct buffer_head *fe_bh = NULL;
610 struct buffer_head *parent_fe_bh = NULL; 627 struct buffer_head *parent_fe_bh = NULL;
611 struct buffer_head *de_bh = NULL;
612 struct ocfs2_dinode *fe = NULL; 628 struct ocfs2_dinode *fe = NULL;
613 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 629 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
630 struct ocfs2_dir_lookup_result lookup = { NULL, };
614 631
615 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 632 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
616 old_dentry->d_name.len, old_dentry->d_name.name, 633 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
638 655
639 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 656 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
640 dentry->d_name.name, 657 dentry->d_name.name,
641 dentry->d_name.len, &de_bh); 658 dentry->d_name.len, &lookup);
642 if (err < 0) { 659 if (err < 0) {
643 mlog_errno(err); 660 mlog_errno(err);
644 goto out; 661 goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
652 } 669 }
653 670
654 fe = (struct ocfs2_dinode *) fe_bh->b_data; 671 fe = (struct ocfs2_dinode *) fe_bh->b_data;
655 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { 672 if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
656 err = -EMLINK; 673 err = -EMLINK;
657 goto out_unlock_inode; 674 goto out_unlock_inode;
658 } 675 }
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
674 691
675 inc_nlink(inode); 692 inc_nlink(inode);
676 inode->i_ctime = CURRENT_TIME; 693 inode->i_ctime = CURRENT_TIME;
677 fe->i_links_count = cpu_to_le16(inode->i_nlink); 694 ocfs2_set_links_count(fe, inode->i_nlink);
678 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
679 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
680 697
681 err = ocfs2_journal_dirty(handle, fe_bh); 698 err = ocfs2_journal_dirty(handle, fe_bh);
682 if (err < 0) { 699 if (err < 0) {
683 le16_add_cpu(&fe->i_links_count, -1); 700 ocfs2_add_links_count(fe, -1);
684 drop_nlink(inode); 701 drop_nlink(inode);
685 mlog_errno(err); 702 mlog_errno(err);
686 goto out_commit; 703 goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
688 705
689 err = ocfs2_add_entry(handle, dentry, inode, 706 err = ocfs2_add_entry(handle, dentry, inode,
690 OCFS2_I(inode)->ip_blkno, 707 OCFS2_I(inode)->ip_blkno,
691 parent_fe_bh, de_bh); 708 parent_fe_bh, &lookup);
692 if (err) { 709 if (err) {
693 le16_add_cpu(&fe->i_links_count, -1); 710 ocfs2_add_links_count(fe, -1);
694 drop_nlink(inode); 711 drop_nlink(inode);
695 mlog_errno(err); 712 mlog_errno(err);
696 goto out_commit; 713 goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
714out: 731out:
715 ocfs2_inode_unlock(dir, 1); 732 ocfs2_inode_unlock(dir, 1);
716 733
717 brelse(de_bh);
718 brelse(fe_bh); 734 brelse(fe_bh);
719 brelse(parent_fe_bh); 735 brelse(parent_fe_bh);
720 736
737 ocfs2_free_dir_lookup_result(&lookup);
738
721 mlog_exit(err); 739 mlog_exit(err);
722 740
723 return err; 741 return err;
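
[Annotation] The common thread in these namei.c hunks is that every bare dirent buffer_head (de_bh) plus dirent pointer is replaced by a single struct ocfs2_dir_lookup_result with a fixed lifecycle: zero-initialize, fill via a prepare/find helper, pass by pointer to the insert/update/delete helpers, and free unconditionally in the exit path. In outline (condensed from the hunks above, not runnable standalone):

	struct ocfs2_dir_lookup_result lookup = { NULL, };

	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
					      name, namelen, &lookup);
	if (status == 0)
		status = ocfs2_add_entry(handle, dentry, inode, blkno,
					 parent_fe_bh, &lookup);

	ocfs2_free_dir_lookup_result(&lookup);	/* safe even if never filled */

Because the free is safe on an unused result, every exit path can call it without tracking which lookups actually succeeded — which is what lets the diff delete the scattered brelse(de_bh) calls.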
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
766 struct buffer_head *fe_bh = NULL; 784 struct buffer_head *fe_bh = NULL;
767 struct buffer_head *parent_node_bh = NULL; 785 struct buffer_head *parent_node_bh = NULL;
768 handle_t *handle = NULL; 786 handle_t *handle = NULL;
769 struct ocfs2_dir_entry *dirent = NULL;
770 struct buffer_head *dirent_bh = NULL;
771 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 787 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
772 struct buffer_head *orphan_entry_bh = NULL; 788 struct ocfs2_dir_lookup_result lookup = { NULL, };
789 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
773 790
774 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
775 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
791 } 808 }
792 809
793 status = ocfs2_find_files_on_disk(dentry->d_name.name, 810 status = ocfs2_find_files_on_disk(dentry->d_name.name,
794 dentry->d_name.len, &blkno, 811 dentry->d_name.len, &blkno, dir,
795 dir, &dirent_bh, &dirent); 812 &lookup);
796 if (status < 0) { 813 if (status < 0) {
797 if (status != -ENOENT) 814 if (status != -ENOENT)
798 mlog_errno(status); 815 mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
817 child_locked = 1; 834 child_locked = 1;
818 835
819 if (S_ISDIR(inode->i_mode)) { 836 if (S_ISDIR(inode->i_mode)) {
820 if (!ocfs2_empty_dir(inode)) { 837 if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
821 status = -ENOTEMPTY;
822 goto leave;
823 } else if (inode->i_nlink != 2) {
824 status = -ENOTEMPTY; 838 status = -ENOTEMPTY;
825 goto leave; 839 goto leave;
826 } 840 }
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
836 850
837 if (inode_is_unlinkable(inode)) { 851 if (inode_is_unlinkable(inode)) {
838 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 852 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
839 orphan_name, 853 orphan_name, &orphan_insert);
840 &orphan_entry_bh);
841 if (status < 0) { 854 if (status < 0) {
842 mlog_errno(status); 855 mlog_errno(status);
843 goto leave; 856 goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
863 876
864 if (inode_is_unlinkable(inode)) { 877 if (inode_is_unlinkable(inode)) {
865 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 878 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
866 orphan_entry_bh, orphan_dir); 879 &orphan_insert, orphan_dir);
867 if (status < 0) { 880 if (status < 0) {
868 mlog_errno(status); 881 mlog_errno(status);
869 goto leave; 882 goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
871 } 884 }
872 885
873 /* delete the name from the parent dir */ 886 /* delete the name from the parent dir */
874 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); 887 status = ocfs2_delete_entry(handle, dir, &lookup);
875 if (status < 0) { 888 if (status < 0) {
876 mlog_errno(status); 889 mlog_errno(status);
877 goto leave; 890 goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
880 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
881 drop_nlink(inode); 894 drop_nlink(inode);
882 drop_nlink(inode); 895 drop_nlink(inode);
883 fe->i_links_count = cpu_to_le16(inode->i_nlink); 896 ocfs2_set_links_count(fe, inode->i_nlink);
884 897
885 status = ocfs2_journal_dirty(handle, fe_bh); 898 status = ocfs2_journal_dirty(handle, fe_bh);
886 if (status < 0) { 899 if (status < 0) {
@@ -916,9 +929,10 @@ leave:
916 } 929 }
917 930
918 brelse(fe_bh); 931 brelse(fe_bh);
919 brelse(dirent_bh);
920 brelse(parent_node_bh); 932 brelse(parent_node_bh);
921 brelse(orphan_entry_bh); 933
934 ocfs2_free_dir_lookup_result(&orphan_insert);
935 ocfs2_free_dir_lookup_result(&lookup);
922 936
923 mlog_exit(status); 937 mlog_exit(status);
924 938
@@ -1004,29 +1018,27 @@ static int ocfs2_rename(struct inode *old_dir,
1004 struct inode *new_dir, 1018 struct inode *new_dir,
1005 struct dentry *new_dentry) 1019 struct dentry *new_dentry)
1006{ 1020{
1007 int status = 0, rename_lock = 0, parents_locked = 0; 1021 int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
1008 int old_child_locked = 0, new_child_locked = 0; 1022 int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
1009 struct inode *old_inode = old_dentry->d_inode; 1023 struct inode *old_inode = old_dentry->d_inode;
1010 struct inode *new_inode = new_dentry->d_inode; 1024 struct inode *new_inode = new_dentry->d_inode;
1011 struct inode *orphan_dir = NULL; 1025 struct inode *orphan_dir = NULL;
1012 struct ocfs2_dinode *newfe = NULL; 1026 struct ocfs2_dinode *newfe = NULL;
1013 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 1027 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1014 struct buffer_head *orphan_entry_bh = NULL;
1015 struct buffer_head *newfe_bh = NULL; 1028 struct buffer_head *newfe_bh = NULL;
1016 struct buffer_head *old_inode_bh = NULL; 1029 struct buffer_head *old_inode_bh = NULL;
1017 struct buffer_head *insert_entry_bh = NULL;
1018 struct ocfs2_super *osb = NULL; 1030 struct ocfs2_super *osb = NULL;
1019 u64 newfe_blkno, old_de_ino; 1031 u64 newfe_blkno, old_de_ino;
1020 handle_t *handle = NULL; 1032 handle_t *handle = NULL;
1021 struct buffer_head *old_dir_bh = NULL; 1033 struct buffer_head *old_dir_bh = NULL;
1022 struct buffer_head *new_dir_bh = NULL; 1034 struct buffer_head *new_dir_bh = NULL;
1023 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1024 *new_de = NULL;
1025 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1026 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1027 // this is the 1st dirent bh
1028 nlink_t old_dir_nlink = old_dir->i_nlink; 1035 nlink_t old_dir_nlink = old_dir->i_nlink;
1029 struct ocfs2_dinode *old_di; 1036 struct ocfs2_dinode *old_di;
1037 struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
1038 struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
1039 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
1040 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
1041 struct ocfs2_dir_lookup_result target_insert = { NULL, };
1030 1042
1031 /* At some point it might be nice to break this function up a 1043 /* At some point it might be nice to break this function up a
1032 * bit. */ 1044 * bit. */
@@ -1108,9 +1120,10 @@ static int ocfs2_rename(struct inode *old_dir,
1108 if (S_ISDIR(old_inode->i_mode)) { 1120 if (S_ISDIR(old_inode->i_mode)) {
1109 u64 old_inode_parent; 1121 u64 old_inode_parent;
1110 1122
1123 update_dot_dot = 1;
1111 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, 1124 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1112 old_inode, &old_inode_de_bh, 1125 old_inode,
1113 &old_inode_dot_dot_de); 1126 &old_inode_dot_dot_res);
1114 if (status) { 1127 if (status) {
1115 status = -EIO; 1128 status = -EIO;
1116 goto bail; 1129 goto bail;
@@ -1122,7 +1135,7 @@ static int ocfs2_rename(struct inode *old_dir,
1122 } 1135 }
1123 1136
1124 if (!new_inode && new_dir != old_dir && 1137 if (!new_inode && new_dir != old_dir &&
1125 new_dir->i_nlink >= OCFS2_LINK_MAX) { 1138 new_dir->i_nlink >= ocfs2_link_max(osb)) {
1126 status = -EMLINK; 1139 status = -EMLINK;
1127 goto bail; 1140 goto bail;
1128 } 1141 }
@@ -1151,8 +1164,8 @@ static int ocfs2_rename(struct inode *old_dir,
1151 * to delete it */ 1164 * to delete it */
1152 status = ocfs2_find_files_on_disk(new_dentry->d_name.name, 1165 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1153 new_dentry->d_name.len, 1166 new_dentry->d_name.len,
1154 &newfe_blkno, new_dir, &new_de_bh, 1167 &newfe_blkno, new_dir,
1155 &new_de); 1168 &target_lookup_res);
1156 /* The only error we allow here is -ENOENT because the new 1169 /* The only error we allow here is -ENOENT because the new
1157 * file not existing is perfectly valid. */ 1170 * file not existing is perfectly valid. */
1158 if ((status < 0) && (status != -ENOENT)) { 1171 if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1174,10 @@ static int ocfs2_rename(struct inode *old_dir,
1161 mlog_errno(status); 1174 mlog_errno(status);
1162 goto bail; 1175 goto bail;
1163 } 1176 }
1177 if (status == 0)
1178 target_exists = 1;
1164 1179
1165 if (!new_de && new_inode) { 1180 if (!target_exists && new_inode) {
1166 /* 1181 /*
1167 * Target was unlinked by another node while we were 1182 * Target was unlinked by another node while we were
1168 * waiting to get to ocfs2_rename(). There isn't 1183 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1190,7 @@ static int ocfs2_rename(struct inode *old_dir,
1175 1190
1176 /* In case we need to overwrite an existing file, we blow it 1191 /* In case we need to overwrite an existing file, we blow it
1177 * away first */ 1192 * away first */
1178 if (new_de) { 1193 if (target_exists) {
1179 /* VFS didn't think there existed an inode here, but 1194 /* VFS didn't think there existed an inode here, but
1180 * someone else in the cluster must have raced our 1195 * someone else in the cluster must have raced our
1181 * rename to create one. Today we error cleanly, in 1196 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1231,8 @@ static int ocfs2_rename(struct inode *old_dir,
1216 1231
1217 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1232 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1218 1233
1219 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " 1234 mlog(0, "aha rename over existing... new_blkno=%llu "
1220 "newfebh=%p bhblocknr=%llu\n", new_de, 1235 "newfebh=%p bhblocknr=%llu\n",
1221 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1236 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1222 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1237 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1223 1238
@@ -1225,7 +1240,7 @@ static int ocfs2_rename(struct inode *old_dir,
1225 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1240 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1226 new_inode, 1241 new_inode,
1227 orphan_name, 1242 orphan_name,
1228 &orphan_entry_bh); 1243 &orphan_insert);
1229 if (status < 0) { 1244 if (status < 0) {
1230 mlog_errno(status); 1245 mlog_errno(status);
1231 goto bail; 1246 goto bail;
@@ -1243,7 +1258,7 @@ static int ocfs2_rename(struct inode *old_dir,
1243 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, 1258 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1244 new_dentry->d_name.name, 1259 new_dentry->d_name.name,
1245 new_dentry->d_name.len, 1260 new_dentry->d_name.len,
1246 &insert_entry_bh); 1261 &target_insert);
1247 if (status < 0) { 1262 if (status < 0) {
1248 mlog_errno(status); 1263 mlog_errno(status);
1249 goto bail; 1264 goto bail;
@@ -1258,10 +1273,10 @@ static int ocfs2_rename(struct inode *old_dir,
1258 goto bail; 1273 goto bail;
1259 } 1274 }
1260 1275
1261 if (new_de) { 1276 if (target_exists) {
1262 if (S_ISDIR(new_inode->i_mode)) { 1277 if (S_ISDIR(new_inode->i_mode)) {
1263 if (!ocfs2_empty_dir(new_inode) || 1278 if (new_inode->i_nlink != 2 ||
1264 new_inode->i_nlink != 2) { 1279 !ocfs2_empty_dir(new_inode)) {
1265 status = -ENOTEMPTY; 1280 status = -ENOTEMPTY;
1266 goto bail; 1281 goto bail;
1267 } 1282 }
@@ -1274,10 +1289,10 @@ static int ocfs2_rename(struct inode *old_dir,
1274 } 1289 }
1275 1290
1276 if (S_ISDIR(new_inode->i_mode) || 1291 if (S_ISDIR(new_inode->i_mode) ||
1277 (newfe->i_links_count == cpu_to_le16(1))){ 1292 (ocfs2_read_links_count(newfe) == 1)) {
1278 status = ocfs2_orphan_add(osb, handle, new_inode, 1293 status = ocfs2_orphan_add(osb, handle, new_inode,
1279 newfe, orphan_name, 1294 newfe, orphan_name,
1280 orphan_entry_bh, orphan_dir); 1295 &orphan_insert, orphan_dir);
1281 if (status < 0) { 1296 if (status < 0) {
1282 mlog_errno(status); 1297 mlog_errno(status);
1283 goto bail; 1298 goto bail;
@@ -1285,8 +1300,8 @@ static int ocfs2_rename(struct inode *old_dir,
1285 } 1300 }
1286 1301
1287 /* change the dirent to point to the correct inode */ 1302 /* change the dirent to point to the correct inode */
1288 status = ocfs2_update_entry(new_dir, handle, new_de_bh, 1303 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
1289 new_de, old_inode); 1304 old_inode);
1290 if (status < 0) { 1305 if (status < 0) {
1291 mlog_errno(status); 1306 mlog_errno(status);
1292 goto bail; 1307 goto bail;
@@ -1294,9 +1309,9 @@ static int ocfs2_rename(struct inode *old_dir,
1294 new_dir->i_version++; 1309 new_dir->i_version++;
1295 1310
1296 if (S_ISDIR(new_inode->i_mode)) 1311 if (S_ISDIR(new_inode->i_mode))
1297 newfe->i_links_count = 0; 1312 ocfs2_set_links_count(newfe, 0);
1298 else 1313 else
1299 le16_add_cpu(&newfe->i_links_count, -1); 1314 ocfs2_add_links_count(newfe, -1);
1300 1315
1301 status = ocfs2_journal_dirty(handle, newfe_bh); 1316 status = ocfs2_journal_dirty(handle, newfe_bh);
1302 if (status < 0) { 1317 if (status < 0) {
@@ -1307,7 +1322,7 @@ static int ocfs2_rename(struct inode *old_dir,
1307 /* if the name was not found in new_dir, add it now */ 1322 /* if the name was not found in new_dir, add it now */
1308 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1323 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1309 OCFS2_I(old_inode)->ip_blkno, 1324 OCFS2_I(old_inode)->ip_blkno,
1310 new_dir_bh, insert_entry_bh); 1325 new_dir_bh, &target_insert);
1311 } 1326 }
1312 1327
1313 old_inode->i_ctime = CURRENT_TIME; 1328 old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1349,13 @@ static int ocfs2_rename(struct inode *old_dir,
1334 * because the insert might have changed the type of directory 1349 * because the insert might have changed the type of directory
1335 * we're dealing with. 1350 * we're dealing with.
1336 */ 1351 */
1337 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1352 status = ocfs2_find_entry(old_dentry->d_name.name,
1338 old_dentry->d_name.len, 1353 old_dentry->d_name.len, old_dir,
1339 old_dir, &old_de); 1354 &old_entry_lookup);
1340 if (!old_de_bh) { 1355 if (status)
1341 status = -EIO;
1342 goto bail; 1356 goto bail;
1343 }
1344 1357
1345 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1358 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1346 if (status < 0) { 1359 if (status < 0) {
1347 mlog_errno(status); 1360 mlog_errno(status);
1348 goto bail; 1361 goto bail;
@@ -1353,9 +1366,10 @@ static int ocfs2_rename(struct inode *old_dir,
1353 new_inode->i_ctime = CURRENT_TIME; 1366 new_inode->i_ctime = CURRENT_TIME;
1354 } 1367 }
1355 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1368 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1356 if (old_inode_de_bh) { 1369
1357 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, 1370 if (update_dot_dot) {
1358 old_inode_dot_dot_de, new_dir); 1371 status = ocfs2_update_entry(old_inode, handle,
1372 &old_inode_dot_dot_res, new_dir);
1359 old_dir->i_nlink--; 1373 old_dir->i_nlink--;
1360 if (new_inode) { 1374 if (new_inode) {
1361 new_inode->i_nlink--; 1375 new_inode->i_nlink--;
@@ -1391,14 +1405,13 @@ static int ocfs2_rename(struct inode *old_dir,
1391 } else { 1405 } else {
1392 struct ocfs2_dinode *fe; 1406 struct ocfs2_dinode *fe;
1393 status = ocfs2_journal_access_di(handle, old_dir, 1407 status = ocfs2_journal_access_di(handle, old_dir,
1394 old_dir_bh, 1408 old_dir_bh,
1395 OCFS2_JOURNAL_ACCESS_WRITE); 1409 OCFS2_JOURNAL_ACCESS_WRITE);
1396 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1410 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1397 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1411 ocfs2_set_links_count(fe, old_dir->i_nlink);
1398 status = ocfs2_journal_dirty(handle, old_dir_bh); 1412 status = ocfs2_journal_dirty(handle, old_dir_bh);
1399 } 1413 }
1400 } 1414 }
1401
1402 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1415 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1403 status = 0; 1416 status = 0;
1404bail: 1417bail:
@@ -1429,15 +1442,17 @@ bail:
1429 1442
1430 if (new_inode) 1443 if (new_inode)
1431 iput(new_inode); 1444 iput(new_inode);
1445
1446 ocfs2_free_dir_lookup_result(&target_lookup_res);
1447 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1448 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1449 ocfs2_free_dir_lookup_result(&orphan_insert);
1450 ocfs2_free_dir_lookup_result(&target_insert);
1451
1432 brelse(newfe_bh); 1452 brelse(newfe_bh);
1433 brelse(old_inode_bh); 1453 brelse(old_inode_bh);
1434 brelse(old_dir_bh); 1454 brelse(old_dir_bh);
1435 brelse(new_dir_bh); 1455 brelse(new_dir_bh);
1436 brelse(new_de_bh);
1437 brelse(old_de_bh);
1438 brelse(old_inode_de_bh);
1439 brelse(orphan_entry_bh);
1440 brelse(insert_entry_bh);
1441 1456
1442 mlog_exit(status); 1457 mlog_exit(status);
1443 1458
@@ -1558,7 +1573,6 @@ static int ocfs2_symlink(struct inode *dir,
1558 struct inode *inode = NULL; 1573 struct inode *inode = NULL;
1559 struct super_block *sb; 1574 struct super_block *sb;
1560 struct buffer_head *new_fe_bh = NULL; 1575 struct buffer_head *new_fe_bh = NULL;
1561 struct buffer_head *de_bh = NULL;
1562 struct buffer_head *parent_fe_bh = NULL; 1576 struct buffer_head *parent_fe_bh = NULL;
1563 struct ocfs2_dinode *fe = NULL; 1577 struct ocfs2_dinode *fe = NULL;
1564 struct ocfs2_dinode *dirfe; 1578 struct ocfs2_dinode *dirfe;
@@ -1572,6 +1586,7 @@ static int ocfs2_symlink(struct inode *dir,
1572 .enable = 1, 1586 .enable = 1,
1573 }; 1587 };
1574 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1575 1590
1576 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1591 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1577 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1592 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1607,7 @@ static int ocfs2_symlink(struct inode *dir,
1592 } 1607 }
1593 1608
1594 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1609 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1595 if (!dirfe->i_links_count) { 1610 if (!ocfs2_read_links_count(dirfe)) {
1596 /* can't make a file in a deleted directory. */ 1611 /* can't make a file in a deleted directory. */
1597 status = -ENOENT; 1612 status = -ENOENT;
1598 goto bail; 1613 goto bail;
@@ -1605,7 +1620,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1620
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1621 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1622 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1623 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1624 if (status < 0) {
1610 mlog_errno(status); 1625 mlog_errno(status);
1611 goto bail; 1626 goto bail;
@@ -1744,7 +1759,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1759
1745 status = ocfs2_add_entry(handle, dentry, inode, 1760 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1761 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1762 &lookup);
1748 if (status < 0) { 1763 if (status < 0) {
1749 mlog_errno(status); 1764 mlog_errno(status);
1750 goto bail; 1765 goto bail;
@@ -1772,9 +1787,9 @@ bail:
1772 1787
1773 brelse(new_fe_bh); 1788 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1789 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1790 kfree(si.name);
1777 kfree(si.value); 1791 kfree(si.value);
1792 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1793 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1794 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1795 if (data_ac)
@@ -1826,7 +1841,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1841 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1842 struct inode *inode,
1828 char *name, 1843 char *name,
1829 struct buffer_head **de_bh) 1844 struct ocfs2_dir_lookup_result *lookup)
1830{ 1845{
1831 struct inode *orphan_dir_inode; 1846 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1847 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1872,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1872
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1873 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1874 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1875 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1876 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1877 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1878
@@ -1884,7 +1899,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1899 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1900 struct ocfs2_dinode *fe,
1886 char *name, 1901 char *name,
1887 struct buffer_head *de_bh, 1902 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1903 struct inode *orphan_dir_inode)
1889{ 1904{
1890 struct buffer_head *orphan_dir_bh = NULL; 1905 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1925,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1925 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1926 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1927 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1928 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1929 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1930
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1931 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1932 if (status < 0) {
@@ -1922,7 +1937,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1937 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1938 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1939 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1940 orphan_dir_bh, lookup);
1926 if (status < 0) { 1941 if (status < 0) {
1927 mlog_errno(status); 1942 mlog_errno(status);
1928 goto leave; 1943 goto leave;
@@ -1955,8 +1970,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1970 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1971 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1972 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1973 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1974
1961 mlog_entry_void(); 1975 mlog_entry_void();
1962 1976
@@ -1971,17 +1985,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1985 OCFS2_ORPHAN_NAMELEN);
1972 1986
 1973 /* find its spot in the orphan directory */ 1987 /* find its spot in the orphan directory */
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1988 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1989 &lookup);
1976 if (!target_de_bh) { 1990 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1991 mlog_errno(status);
1979 goto leave; 1992 goto leave;
1980 } 1993 }
1981 1994
1982 /* remove it from the orphan directory */ 1995 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 1996 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 1997 if (status < 0) {
1986 mlog_errno(status); 1998 mlog_errno(status);
1987 goto leave; 1999 goto leave;
@@ -1997,8 +2009,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2009 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2010 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2011 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2012 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2013 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2014
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2015 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2016 if (status < 0) {
@@ -2007,7 +2019,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2019 }
2008 2020
2009leave: 2021leave:
2010 brelse(target_de_bh); 2022 ocfs2_free_dir_lookup_result(&lookup);
2011 2023
2012 mlog_exit(status); 2024 mlog_exit(status);
2013 return status; 2025 return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
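
[Annotation] To see how the new helpers split a large link count across i_links_count and i_links_count_hi, here is a small standalone illustration (endianness conversions dropped for brevity; OCFS2_LINKS_HI_SHIFT is 16, as defined later in this diff):

	#include <stdio.h>

	int main(void)
	{
		unsigned int nlink = 100000;	/* 0x186A0, needs > 16 bits */
		unsigned short lo = (unsigned short)nlink;	   /* 0x86A0 */
		unsigned short hi = (unsigned short)(nlink >> 16); /* 0x0001 */

		/* ocfs2_read_links_count() only folds the high word back in
		 * when OCFS2_INDEXED_DIR_FL is set, so pre-indexing inodes
		 * keep their old 16-bit semantics. */
		unsigned int readback = lo | ((unsigned int)hi << 16);

		printf("%u -> lo=%u hi=%u -> %u\n", nlink, lo, hi, readback);
		return 0;
	}

This is why ocfs2_link_max() can return OCFS2_DX_LINK_MAX ((1 << 31) - 1) on indexed filesystems while plain ones stay at 32000.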
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
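
[Annotation] The new ocfs2_block_to_cluster_start() rounds a block number down to the first block of its cluster using pure shifts. A worked example with an assumed geometry (4K blocks, i.e. 12 bits, and 1MB clusters, i.e. 20 bits — these sizes are illustrative, not from the diff):

	#include <stdio.h>

	typedef unsigned long long u64;

	int main(void)
	{
		int bits = 20 - 12;	/* 8: 256 blocks per cluster */
		u64 block = 300;
		unsigned int clusters = (unsigned int)(block >> bits); /* 1 */
		u64 start = (u64)clusters << bits;		       /* 256 */

		printf("block %llu is in cluster %u starting at block %llu\n",
		       block, clusters, start);
		return 0;
	}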
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
 157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
 800 * Entries are keyed by the name hash alone, so a hash collision may
 801 * require checking more than one referenced dirent during lookup.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
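
[Annotation] Taken together, the structures above describe a two-level lookup: dx_major_hash is searched in the dr_list extent tree (or directly in dr_entries while the index is still inline) to find a cluster of dx leaves, dx_minor_hash selects a leaf block within that cluster, and each matching ocfs2_dx_entry points back into the unindexed tree via dx_dirent_blk. A rough sketch of that flow — the helper names (compute_name_hash, extent_lookup, cluster_to_block) and the leaf-selection rule are placeholders for exposition, not the kernel's actual functions:

	u32 major, minor;

	compute_name_hash(name, namelen, osb->osb_dx_seed, &major, &minor);

	if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE) {
		/* small dir: scan dx_root->dr_entries.de_entries directly */
	} else {
		u32 cluster = extent_lookup(&dx_root->dr_list, major);
		u64 leaf_blk = cluster_to_block(cluster) +
			       minor % blocks_per_cluster;
		/* read the ocfs2_dx_leaf at leaf_blk, scan dl_list for a
		 * matching (major, minor) pair, then read the real dirent
		 * at entry->dx_dirent_blk to compare the actual name. */
	}

The dual-hash design keeps index entries at a fixed 16 bytes while still narrowing a collision down to one leaf block before any full dirents have to be read.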
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
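
[Annotation] These sizing helpers just divide whatever space follows the header by sizeof(struct ocfs2_dx_entry) — 16 bytes (two __le32 hashes plus a __le64 block pointer). Back-of-envelope, assuming a 4K block, an 8-byte struct ocfs2_block_check, and no compiler padding (assumptions, not facts from the diff): the dx_leaf header is 40 bytes and the entry-list header adds 8 more, so:

	#include <stdio.h>

	int main(void)
	{
		int size = 4096 - 48;	/* bytes left for dx entries */
		printf("dx entries per 4K leaf: %d\n", size / 16); /* 253 */
		return 0;
	}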
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..8439f6b324b9 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
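
[Annotation] Note the quiet but important change above: ALLOC_NEW_GROUP goes from the plain value 1 to the bit 0x1 so that it can be OR-ed with the new ALLOC_GROUPS_FROM_GLOBAL, and callers switch from equality tests to mask tests (both forms appear in the hunks below):

	/* old style: exact match on an int */
	if (alloc_new_group != ALLOC_NEW_GROUP)
		/* ... don't allocate a new group ... */;

	/* new style: flags word, each bit tested independently */
	int flags = ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL;
	if (!(flags & ALLOC_NEW_GROUP))
		/* ... */;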
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
 1662	 * If the parent dir has recorded the last group it used for
 1663	 * allocation, use that. Otherwise, if the new inode comes from the
 1664	 * same slot the parent dir belongs to, start from the parent's group.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,167 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2201
 2202	/* read the inode block straight from disk, without cluster locking */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n",
2206 (unsigned long long)blkno, status);
2207 goto bail;
2208 }
2209
2210 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2211 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2212 mlog(ML_ERROR, "invalid inode %llu requested\n",
2213 (unsigned long long)blkno);
2214 status = -EINVAL;
2215 goto bail;
2216 }
2217
2218 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2219 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2220 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2221 (unsigned long long)blkno,
2222 (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2223 status = -EINVAL;
2224 goto bail;
2225 }
2226
2227 if (suballoc_slot)
2228 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2229 if (suballoc_bit)
2230 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2231
2232bail:
2233 brelse(inode_bh);
2234
2235 mlog_exit(status);
2236 return status;
2237}
2238
2239/*
 2240 * Test whether the bit is set in the allocator bitmap. On success, 0
 2241 * is returned and *res is 1 if the bit is set, 0 otherwise. On
 2242 * failure, a negative errno is returned and *res is meaningless. Call
 2243 * this with the cluster lock held against the suballocator, or you
 2244 * may get a result based on out-of-date contents.
2245 */
2246static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2247 struct inode *suballoc,
2248 struct buffer_head *alloc_bh, u64 blkno,
2249 u16 bit, int *res)
2250{
2251 struct ocfs2_dinode *alloc_fe;
2252 struct ocfs2_group_desc *group;
2253 struct buffer_head *group_bh = NULL;
2254 u64 bg_blkno;
2255 int status;
2256
2257 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2258 (unsigned int)bit);
2259
2260 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2261 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2262 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2263 (unsigned int)bit,
2264 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2265 status = -EINVAL;
2266 goto bail;
2267 }
2268
2269 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2270 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2271 &group_bh);
2272 if (status < 0) {
2273 mlog(ML_ERROR, "read group %llu failed %d\n",
2274 (unsigned long long)bg_blkno, status);
2275 goto bail;
2276 }
2277
2278 group = (struct ocfs2_group_desc *) group_bh->b_data;
2279 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2280
2281bail:
2282 brelse(group_bh);
2283
2284 mlog_exit(status);
2285 return status;
2286}
2287
2288/*
2289 * Test if the bit representing this inode (blkno) is set in the
2290 * suballocator.
2291 *
2292 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2293 *
2294 * In the event of failure, a negative value is returned and *res is
2295 * meaningless.
2296 *
2297 * Callers must make sure to hold nfs_sync_lock to prevent
2298 * ocfs2_delete_inode() on another node from accessing the same
2299 * suballocator concurrently.
2300 */
2301int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2302{
2303 int status;
2304 u16 suballoc_bit = 0, suballoc_slot = 0;
2305 struct inode *inode_alloc_inode;
2306 struct buffer_head *alloc_bh = NULL;
2307
2308 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2309
2310 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2311 &suballoc_bit);
2312 if (status < 0) {
2313 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2314 goto bail;
2315 }
2316
2317 inode_alloc_inode =
2318 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2319 suballoc_slot);
2320 if (!inode_alloc_inode) {
 2321		/* the error code could be inaccurate, but we have no way to
 2322		 * recover the correct one. */
2323 status = -EINVAL;
2324 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2325 (u32)suballoc_slot);
2326 goto bail;
2327 }
2328
2329 mutex_lock(&inode_alloc_inode->i_mutex);
2330 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2331 if (status < 0) {
2332 mutex_unlock(&inode_alloc_inode->i_mutex);
2333 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2334 (u32)suballoc_slot, status);
2335 goto bail;
2336 }
2337
2338 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2339 blkno, suballoc_bit, res);
2340 if (status < 0)
2341 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2342
2343 ocfs2_inode_unlock(inode_alloc_inode, 0);
2344 mutex_unlock(&inode_alloc_inode->i_mutex);
2345
2346 iput(inode_alloc_inode);
2347 brelse(alloc_bh);
2348bail:
2349 mlog_exit(status);
2350 return status;
2351}
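
The suballoc.c changes above implement a simple hint-caching policy: remember the group in which the last inode allocation succeeded and start the next search there, but only trust a hint recorded for the slot currently held. A minimal sketch of that policy, using hypothetical alloc_hint/alloc_ctxt types in place of the real ocfs2 structures:

/* Hypothetical stand-ins for the ip_last_used_group/ip_last_used_slot
 * fields this patch adds and for struct ocfs2_alloc_context. */
struct alloc_hint { unsigned long group; int slot; };
struct alloc_ctxt { unsigned long last_group; int slot; };

/* Seed the search from the cached hint only if it was recorded for the
 * slot we hold locked; a hint from another slot could name a group
 * descriptor we have no lock on. Failing that, fall back to the group
 * the parent inode itself sits in, when it shares our slot. */
static void init_from_hint(struct alloc_ctxt *ac, const struct alloc_hint *h,
			   unsigned long parent_group, int parent_slot)
{
	if (h->group && h->slot == ac->slot)
		ac->last_group = h->group;
	else if (parent_slot == ac->slot)
		ac->last_group = parent_group;
}

/* After a successful claim, remember where it landed for next time. */
static void save_hint(struct alloc_hint *h, const struct alloc_ctxt *ac)
{
	h->group = ac->last_group;
	h->slot = ac->slot;
}

The same idea drives the osb_inode_alloc_group value updated under osb_lock in ocfs2_reserve_new_inode() above.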
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
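
The fs_state file added above follows a common debugfs idiom: render one snapshot of the state into a kmalloc'ed buffer at open() time, record its length as the inode size, serve it with simple_read_from_buffer(), and free it on release. A stripped-down sketch of the same idiom; render_state() is a hypothetical stand-in for ocfs2_osb_dump():

#include <linux/fs.h>
#include <linux/slab.h>

static int render_state(void *priv, char *buf, int len);	/* hypothetical */

/* Take the snapshot once, at open time. */
static int snap_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	i_size_write(inode, render_state(inode->i_private, buf, PAGE_SIZE));
	file->private_data = buf;
	return 0;
}

/* simple_read_from_buffer() handles the bounds checks and copy_to_user(). */
static ssize_t snap_read(struct file *file, char __user *ubuf,
			 size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(ubuf, nbytes, ppos,
				       file->private_data,
				       i_size_read(file->f_mapping->host));
}

static int snap_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);	/* the snapshot dies with the fd */
	return 0;
}

One consequence, true of the patch as well: the contents are frozen at open() time, so a reader that keeps the file open sees a stale snapshot until it reopens.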
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ed0a0cfd68d2..579dd1b1110f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -39,6 +39,7 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/utsname.h> 41#include <linux/utsname.h>
42#include <linux/namei.h>
42 43
43#define MLOG_MASK_PREFIX ML_NAMEI 44#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h> 45#include <cluster/masklog.h>
@@ -54,26 +55,6 @@
54 55
55#include "buffer_head_io.h" 56#include "buffer_head_io.h"
56 57
57static char *ocfs2_page_getlink(struct dentry * dentry,
58 struct page **ppage);
59static char *ocfs2_fast_symlink_getlink(struct inode *inode,
60 struct buffer_head **bh);
61
62/* get the link contents into pagecache */
63static char *ocfs2_page_getlink(struct dentry * dentry,
64 struct page **ppage)
65{
66 struct page * page;
67 struct address_space *mapping = dentry->d_inode->i_mapping;
68 page = read_mapping_page(mapping, 0, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 *ppage = page;
72 return kmap(page);
73
74sync_fail:
75 return (char*)page;
76}
77 58
78static char *ocfs2_fast_symlink_getlink(struct inode *inode, 59static char *ocfs2_fast_symlink_getlink(struct inode *inode,
79 struct buffer_head **bh) 60 struct buffer_head **bh)
@@ -128,40 +109,55 @@ out:
128 return ret; 109 return ret;
129} 110}
130 111
131static void *ocfs2_follow_link(struct dentry *dentry, 112static void *ocfs2_fast_follow_link(struct dentry *dentry,
132 struct nameidata *nd) 113 struct nameidata *nd)
133{ 114{
134 int status; 115 int status = 0;
135 char *link; 116 int len;
117 char *target, *link = ERR_PTR(-ENOMEM);
136 struct inode *inode = dentry->d_inode; 118 struct inode *inode = dentry->d_inode;
137 struct page *page = NULL;
138 struct buffer_head *bh = NULL; 119 struct buffer_head *bh = NULL;
139 120
140 if (ocfs2_inode_is_fast_symlink(inode)) 121 mlog_entry_void();
141 link = ocfs2_fast_symlink_getlink(inode, &bh); 122
142 else 123 BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
143 link = ocfs2_page_getlink(dentry, &page); 124 target = ocfs2_fast_symlink_getlink(inode, &bh);
144 if (IS_ERR(link)) { 125 if (IS_ERR(target)) {
145 status = PTR_ERR(link); 126 status = PTR_ERR(target);
146 mlog_errno(status); 127 mlog_errno(status);
147 goto bail; 128 goto bail;
148 } 129 }
149 130
150 status = vfs_follow_link(nd, link); 131 /* Fast symlinks can't be large */
132 len = strlen(target);
133 link = kzalloc(len + 1, GFP_NOFS);
134 if (!link) {
135 status = -ENOMEM;
136 mlog_errno(status);
137 goto bail;
138 }
139
140 memcpy(link, target, len);
141 nd_set_link(nd, link);
151 142
152bail: 143bail:
153 if (page) {
154 kunmap(page);
155 page_cache_release(page);
156 }
157 brelse(bh); 144 brelse(bh);
158 145
159 return ERR_PTR(status); 146 mlog_exit(status);
147 return status ? ERR_PTR(status) : link;
148}
149
150static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
151{
152 char *link = cookie;
153
154 kfree(link);
160} 155}
161 156
162const struct inode_operations ocfs2_symlink_inode_operations = { 157const struct inode_operations ocfs2_symlink_inode_operations = {
163 .readlink = page_readlink, 158 .readlink = page_readlink,
164 .follow_link = ocfs2_follow_link, 159 .follow_link = page_follow_link_light,
160 .put_link = page_put_link,
165 .getattr = ocfs2_getattr, 161 .getattr = ocfs2_getattr,
166 .setattr = ocfs2_setattr, 162 .setattr = ocfs2_setattr,
167 .setxattr = generic_setxattr, 163 .setxattr = generic_setxattr,
@@ -171,7 +167,8 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
171}; 167};
172const struct inode_operations ocfs2_fast_symlink_inode_operations = { 168const struct inode_operations ocfs2_fast_symlink_inode_operations = {
173 .readlink = ocfs2_readlink, 169 .readlink = ocfs2_readlink,
174 .follow_link = ocfs2_follow_link, 170 .follow_link = ocfs2_fast_follow_link,
171 .put_link = ocfs2_fast_put_link,
175 .getattr = ocfs2_getattr, 172 .getattr = ocfs2_getattr,
176 .setattr = ocfs2_setattr, 173 .setattr = ocfs2_setattr,
177 .setxattr = generic_setxattr, 174 .setxattr = generic_setxattr,
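
The symlink rework above moves ocfs2 onto the ->follow_link()/->put_link() contract of this kernel: follow_link publishes the target string with nd_set_link() and returns a cookie, which the VFS hands back to put_link for cleanup once the name walk is done (put_link is skipped when follow_link returns an ERR_PTR). A minimal sketch of the contract; get_target() is a hypothetical helper returning a kmalloc'ed copy of the link body or an ERR_PTR:

#include <linux/err.h>
#include <linux/namei.h>
#include <linux/slab.h>

static char *get_target(struct inode *inode);	/* hypothetical */

static void *demo_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *link = get_target(dentry->d_inode);

	if (IS_ERR(link))
		return link;		/* VFS will not call ->put_link() */
	nd_set_link(nd, link);
	return link;			/* becomes the ->put_link() cookie */
}

static void demo_put_link(struct dentry *dentry, struct nameidata *nd,
			  void *cookie)
{
	kfree(cookie);			/* target string no longer needed */
}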
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
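
The statfs additions here (and in the qnx4 hunk further down) fill f_fsid the same way: widen the backing device's dev_t with huge_encode_dev() and split the 64-bit result across the two 32-bit halves of the fsid. The pattern in isolation, as a sketch:

#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/statfs.h>

/* Derive a stable f_fsid from the backing block device's number. */
static void fill_fsid(struct super_block *sb, struct kstatfs *buf)
{
	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

	buf->f_fsid.val[0] = (u32)id;
	buf->f_fsid.val[1] = (u32)(id >> 32);
}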
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..bdfbf03615a4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
@@ -1032,7 +1033,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
1032 if (!IS_ERR(tmp)) { 1033 if (!IS_ERR(tmp)) {
1033 fd = get_unused_fd_flags(flags); 1034 fd = get_unused_fd_flags(flags);
1034 if (fd >= 0) { 1035 if (fd >= 0) {
1035 struct file *f = do_filp_open(dfd, tmp, flags, mode); 1036 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
1036 if (IS_ERR(f)) { 1037 if (IS_ERR(f)) {
1037 put_unused_fd(fd); 1038 put_unused_fd(fd);
1038 fd = PTR_ERR(f); 1039 fd = PTR_ERR(f);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 38e337d51ced..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
diff --git a/fs/pipe.c b/fs/pipe.c
index 4af7aa521813..13414ec45b8d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -37,6 +37,42 @@
37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
38 */ 38 */
39 39
40static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
41{
42 if (pipe->inode)
43 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
44}
45
46void pipe_lock(struct pipe_inode_info *pipe)
47{
48 /*
49 * pipe_lock() nests non-pipe inode locks (for writing to a file)
50 */
51 pipe_lock_nested(pipe, I_MUTEX_PARENT);
52}
53EXPORT_SYMBOL(pipe_lock);
54
55void pipe_unlock(struct pipe_inode_info *pipe)
56{
57 if (pipe->inode)
58 mutex_unlock(&pipe->inode->i_mutex);
59}
60EXPORT_SYMBOL(pipe_unlock);
61
62void pipe_double_lock(struct pipe_inode_info *pipe1,
63 struct pipe_inode_info *pipe2)
64{
65 BUG_ON(pipe1 == pipe2);
66
67 if (pipe1 < pipe2) {
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
73 }
74}
75
40/* Drop the inode semaphore and wait for a pipe event, atomically */ 76/* Drop the inode semaphore and wait for a pipe event, atomically */
41void pipe_wait(struct pipe_inode_info *pipe) 77void pipe_wait(struct pipe_inode_info *pipe)
42{ 78{
@@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe)
47 * is considered a noninteractive wait: 83 * is considered a noninteractive wait:
48 */ 84 */
49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 85 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
50 if (pipe->inode) 86 pipe_unlock(pipe);
51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 87 schedule();
53 finish_wait(&pipe->wait, &wait); 88 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 89 pipe_lock(pipe);
55 mutex_lock(&pipe->inode->i_mutex);
56} 90}
57 91
58static int 92static int
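
pipe_double_lock() above is the standard defense against ABBA deadlock when two locks of the same class must be held at once: always acquire them in a stable global order (here, by object address), and tell lockdep about the intentional nesting via subclasses. The same discipline for two plain mutexes, as a sketch:

#include <linux/bug.h>
#include <linux/mutex.h>

/* Lock two mutexes that different callers may take together: a fixed
 * address order guarantees no two tasks ever block on each other's
 * second lock, and the nested annotation keeps lockdep from flagging
 * the second acquisition of the same lock class. */
static void lock_pair(struct mutex *a, struct mutex *b)
{
	BUG_ON(a == b);

	if (a < b) {
		mutex_lock(a);
		mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(b);
		mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}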
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7e4877d9dcb5..725a650bbbb8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,6 +80,7 @@
80#include <linux/delayacct.h> 80#include <linux/delayacct.h>
81#include <linux/seq_file.h> 81#include <linux/seq_file.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h>
83#include <linux/tracehook.h> 84#include <linux/tracehook.h>
84 85
85#include <asm/pgtable.h> 86#include <asm/pgtable.h>
@@ -352,6 +353,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
352 char state; 353 char state;
353 pid_t ppid = 0, pgid = -1, sid = -1; 354 pid_t ppid = 0, pgid = -1, sid = -1;
354 int num_threads = 0; 355 int num_threads = 0;
356 int permitted;
355 struct mm_struct *mm; 357 struct mm_struct *mm;
356 unsigned long long start_time; 358 unsigned long long start_time;
357 unsigned long cmin_flt = 0, cmaj_flt = 0; 359 unsigned long cmin_flt = 0, cmaj_flt = 0;
@@ -364,11 +366,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
364 366
365 state = *get_task_state(task); 367 state = *get_task_state(task);
366 vsize = eip = esp = 0; 368 vsize = eip = esp = 0;
369 permitted = ptrace_may_access(task, PTRACE_MODE_READ);
367 mm = get_task_mm(task); 370 mm = get_task_mm(task);
368 if (mm) { 371 if (mm) {
369 vsize = task_vsize(mm); 372 vsize = task_vsize(mm);
370 eip = KSTK_EIP(task); 373 if (permitted) {
371 esp = KSTK_ESP(task); 374 eip = KSTK_EIP(task);
375 esp = KSTK_ESP(task);
376 }
372 } 377 }
373 378
374 get_task_comm(tcomm, task); 379 get_task_comm(tcomm, task);
@@ -424,7 +429,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
424 unlock_task_sighand(task, &flags); 429 unlock_task_sighand(task, &flags);
425 } 430 }
426 431
427 if (!whole || num_threads < 2) 432 if (permitted && (!whole || num_threads < 2))
428 wchan = get_wchan(task); 433 wchan = get_wchan(task);
429 if (!whole) { 434 if (!whole) {
430 min_flt = task->min_flt; 435 min_flt = task->min_flt;
@@ -476,7 +481,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
476 rsslim, 481 rsslim,
477 mm ? mm->start_code : 0, 482 mm ? mm->start_code : 0,
478 mm ? mm->end_code : 0, 483 mm ? mm->end_code : 0,
479 mm ? mm->start_stack : 0, 484 (permitted && mm) ? mm->start_stack : 0,
480 esp, 485 esp,
481 eip, 486 eip,
482 /* The signal information here is obsolete. 487 /* The signal information here is obsolete.
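
The do_task_stat() changes take one upfront ptrace_may_access() decision and use it to blank the security-sensitive fields (eip, esp, wchan, start_stack) for unprivileged readers, rather than failing the whole read. The shape of that pattern, as a sketch:

#include <linux/ptrace.h>
#include <linux/sched.h>

/* Show the real value only to a reader allowed to ptrace the task;
 * everyone else gets 0 and the read still succeeds. */
static unsigned long maybe_expose(struct task_struct *task,
				  unsigned long value)
{
	return ptrace_may_access(task, PTRACE_MODE_READ) ? value : 0;
}

Degrading the value instead of returning -EACCES keeps existing /proc/<pid>/stat parsers working while still hiding kernel addresses from unprivileged processes.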
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..3326bbf9ab95 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
@@ -321,7 +322,10 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
321 wchan = get_wchan(task); 322 wchan = get_wchan(task);
322 323
323 if (lookup_symbol_name(wchan, symname) < 0) 324 if (lookup_symbol_name(wchan, symname) < 0)
324 return sprintf(buffer, "%lu", wchan); 325 if (!ptrace_may_access(task, PTRACE_MODE_READ))
326 return 0;
327 else
328 return sprintf(buffer, "%lu", wchan);
325 else 329 else
326 return sprintf(buffer, "%s", symname); 330 return sprintf(buffer, "%s", symname);
327} 331}
@@ -647,14 +651,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
647{ 651{
648 struct proc_mounts *p = file->private_data; 652 struct proc_mounts *p = file->private_data;
649 struct mnt_namespace *ns = p->ns; 653 struct mnt_namespace *ns = p->ns;
650 unsigned res = 0; 654 unsigned res = POLLIN | POLLRDNORM;
651 655
652 poll_wait(file, &ns->poll, wait); 656 poll_wait(file, &ns->poll, wait);
653 657
654 spin_lock(&vfsmount_lock); 658 spin_lock(&vfsmount_lock);
655 if (p->event != ns->event) { 659 if (p->event != ns->event) {
656 p->event = ns->event; 660 p->event = ns->event;
657 res = POLLERR; 661 res |= POLLERR | POLLPRI;
658 } 662 }
659 spin_unlock(&vfsmount_lock); 663 spin_unlock(&vfsmount_lock);
660 664
@@ -1952,7 +1956,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
1952 const struct pid_entry *p = ptr; 1956 const struct pid_entry *p = ptr;
1953 struct inode *inode; 1957 struct inode *inode;
1954 struct proc_inode *ei; 1958 struct proc_inode *ei;
1955 struct dentry *error = ERR_PTR(-EINVAL); 1959 struct dentry *error = ERR_PTR(-ENOENT);
1956 1960
1957 inode = proc_pid_make_inode(dir->i_sb, task); 1961 inode = proc_pid_make_inode(dir->i_sb, task);
1958 if (!inode) 1962 if (!inode)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..c6b0302af4c4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
35#define K(x) ((x) << (PAGE_SHIFT - 10)) 35#define K(x) ((x) << (PAGE_SHIFT - 10))
36 si_meminfo(&i); 36 si_meminfo(&i);
37 si_swapinfo(&i); 37 si_swapinfo(&i);
38 committed = atomic_long_read(&vm_committed_space); 38 committed = percpu_counter_read_positive(&vm_committed_as);
39 allowed = ((totalram_pages - hugetlb_total_pages()) 39 allowed = ((totalram_pages - hugetlb_total_pages())
40 * sysctl_overcommit_ratio / 100) + total_swap_pages; 40 * sysctl_overcommit_ratio / 100) + total_swap_pages;
41 41
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae60..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
144{ 144{
145 struct proc_dir_entry *ent; 145 struct proc_dir_entry *ent;
146 146
147 if (!driver->ops->read_proc || !driver->driver_name || 147 if (!driver->driver_name || driver->proc_entry ||
148 driver->proc_entry) 148 !driver->ops->proc_fops)
149 return; 149 return;
150 150
151 ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); 151 ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
152 if (!ent) 152 driver->ops->proc_fops, driver);
153 return;
154 ent->read_proc = driver->ops->read_proc;
155 ent->data = driver;
156
157 driver->proc_entry = ent; 153 driver->proc_entry = ent;
158} 154}
159 155
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1e15a2b176e8..b080b791d9e3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -67,8 +67,7 @@ static int proc_get_sb(struct file_system_type *fs_type,
67 sb->s_flags = flags; 67 sb->s_flags = flags;
68 err = proc_fill_super(sb); 68 err = proc_fill_super(sb);
69 if (err) { 69 if (err) {
70 up_write(&sb->s_umount); 70 deactivate_locked_super(sb);
71 deactivate_super(sb);
72 return err; 71 return err;
73 } 72 }
74 73
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f75efa22df5e..81e4eb60972e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,6 +18,9 @@
18#ifndef arch_irq_stat 18#ifndef arch_irq_stat
19#define arch_irq_stat() 0 19#define arch_irq_stat() 0
20#endif 20#endif
21#ifndef arch_idle_time
22#define arch_idle_time(cpu) 0
23#endif
21 24
22static int show_stat(struct seq_file *p, void *v) 25static int show_stat(struct seq_file *p, void *v)
23{ 26{
@@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v)
40 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
41 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 44 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
42 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); 45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
46 idle = cputime64_add(idle, arch_idle_time(i));
43 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); 47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
44 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
45 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
@@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v)
69 nice = kstat_cpu(i).cpustat.nice; 73 nice = kstat_cpu(i).cpustat.nice;
70 system = kstat_cpu(i).cpustat.system; 74 system = kstat_cpu(i).cpustat.system;
71 idle = kstat_cpu(i).cpustat.idle; 75 idle = kstat_cpu(i).cpustat.idle;
76 idle = cputime64_add(idle, arch_idle_time(i));
72 iowait = kstat_cpu(i).cpustat.iowait; 77 iowait = kstat_cpu(i).cpustat.iowait;
73 irq = kstat_cpu(i).cpustat.irq; 78 irq = kstat_cpu(i).cpustat.irq;
74 softirq = kstat_cpu(i).cpustat.softirq; 79 softirq = kstat_cpu(i).cpustat.softirq;
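
arch_idle_time() joins arch_irq_stat() in the usual arch-override pattern: the generic code supplies a no-op default under #ifndef, so an architecture that keeps its own idle accounting can replace it simply by defining the macro in a header included earlier. Schematically:

/* generic code: harmless default unless the arch provided one */
#ifndef arch_idle_time
#define arch_idle_time(cpu) 0
#endif

/* an arch header seen first could instead supply the real thing,
 * e.g. #define arch_idle_time(cpu) my_arch_idle_time(cpu), where
 * my_arch_idle_time() is a hypothetical per-arch helper. */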
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..6f61b7cc32e0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
204 struct file *file = vma->vm_file; 204 struct file *file = vma->vm_file;
205 int flags = vma->vm_flags; 205 int flags = vma->vm_flags;
206 unsigned long ino = 0; 206 unsigned long ino = 0;
207 unsigned long long pgoff = 0;
207 dev_t dev = 0; 208 dev_t dev = 0;
208 int len; 209 int len;
209 210
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 212 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
212 dev = inode->i_sb->s_dev; 213 dev = inode->i_sb->s_dev;
213 ino = inode->i_ino; 214 ino = inode->i_ino;
215 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
214 } 216 }
215 217
216 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 218 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 flags & VM_WRITE ? 'w' : '-', 222 flags & VM_WRITE ? 'w' : '-',
221 flags & VM_EXEC ? 'x' : '-', 223 flags & VM_EXEC ? 'x' : '-',
222 flags & VM_MAYSHARE ? 's' : 'p', 224 flags & VM_MAYSHARE ? 's' : 'p',
223 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 225 pgoff,
224 MAJOR(dev), MINOR(dev), ino, &len); 226 MAJOR(dev), MINOR(dev), ino, &len);
225 227
226 /* 228 /*
@@ -663,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
663 goto out_task; 665 goto out_task;
664 666
665 ret = 0; 667 ret = 0;
668
669 if (!count)
670 goto out_task;
671
666 mm = get_task_mm(task); 672 mm = get_task_mm(task);
667 if (!mm) 673 if (!mm)
668 goto out_task; 674 goto out_task;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..64a72e2e7650 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -125,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
125 struct file *file; 126 struct file *file;
126 dev_t dev = 0; 127 dev_t dev = 0;
127 int flags, len; 128 int flags, len;
129 unsigned long long pgoff = 0;
128 130
129 flags = vma->vm_flags; 131 flags = vma->vm_flags;
130 file = vma->vm_file; 132 file = vma->vm_file;
@@ -133,17 +135,18 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev; 136 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino; 137 ino = inode->i_ino;
138 pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
136 } 139 }
137 140
138 seq_printf(m, 141 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 142 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 143 vma->vm_start,
141 vma->vm_end, 144 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 145 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 146 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 147 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 148 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 149 pgoff,
147 MAJOR(dev), MINOR(dev), ino, &len); 150 MAJOR(dev), MINOR(dev), ino, &len);
148 151
149 if (file) { 152 if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 385a0831cc99..68d4f6dc0578 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,12 +1,3 @@
1#
2# Makefile for the Linux filesystems.
3#
4# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
5# Rewritten to use lists instead of if-statements.
6#
7
8obj-y :=
9
10obj-$(CONFIG_QUOTA) += dquot.o 1obj-$(CONFIG_QUOTA) += dquot.o
11obj-$(CONFIG_QFMT_V1) += quota_v1.o 2obj-$(CONFIG_QFMT_V1) += quota_v1.o
12obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef77..607c579e5eca 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
823 823
824 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 827 continue;
828 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
829 continue; 829 continue;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686c..ebb2c417912c 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
59 */ 59 */
60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
61{ 61{
62 struct pagevec lru_pvec;
63 unsigned long npages, xpages, loop, limit; 62 unsigned long npages, xpages, loop, limit;
64 struct page *pages; 63 struct page *pages;
65 unsigned order; 64 unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
102 memset(data, 0, newsize); 101 memset(data, 0, newsize);
103 102
104 /* attach all the pages to the inode's address space */ 103 /* attach all the pages to the inode's address space */
105 pagevec_init(&lru_pvec, 0);
106 for (loop = 0; loop < npages; loop++) { 104 for (loop = 0; loop < npages; loop++) {
107 struct page *page = pages + loop; 105 struct page *page = pages + loop;
108 106
109 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); 107 ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
108 GFP_KERNEL);
110 if (ret < 0) 109 if (ret < 0)
111 goto add_error; 110 goto add_error;
112 111
113 if (!pagevec_add(&lru_pvec, page))
114 __pagevec_lru_add_file(&lru_pvec);
115
116 /* prevent the page from being discarded on memory pressure */ 112 /* prevent the page from being discarded on memory pressure */
117 SetPageDirty(page); 113 SetPageDirty(page);
118 114
119 unlock_page(page); 115 unlock_page(page);
120 } 116 }
121 117
122 pagevec_lru_add_file(&lru_pvec);
123 return 0; 118 return 0;
124 119
125 fsize_exceeded: 120 fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
128 return -EFBIG; 123 return -EFBIG;
129 124
130 add_error: 125 add_error:
131 pagevec_lru_add_file(&lru_pvec); 126 while (loop < npages)
132 page_cache_release(pages + loop); 127 __free_page(pages + loop++);
133 for (loop++; loop < npages; loop++)
134 __free_page(pages + loop);
135 return ret; 128 return ret;
136} 129}
137 130
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b87..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* some random number */ 40/* some random number */
40#define RAMFS_MAGIC 0x858458f6 41#define RAMFS_MAGIC 0x858458f6
41 42
43#define RAMFS_DEFAULT_MODE 0755
44
42static const struct super_operations ramfs_ops; 45static const struct super_operations ramfs_ops;
43static const struct inode_operations ramfs_dir_inode_operations; 46static const struct inode_operations ramfs_dir_inode_operations;
44 47
@@ -158,30 +161,102 @@ static const struct inode_operations ramfs_dir_inode_operations = {
158static const struct super_operations ramfs_ops = { 161static const struct super_operations ramfs_ops = {
159 .statfs = simple_statfs, 162 .statfs = simple_statfs,
160 .drop_inode = generic_delete_inode, 163 .drop_inode = generic_delete_inode,
164 .show_options = generic_show_options,
165};
166
167struct ramfs_mount_opts {
168 umode_t mode;
169};
170
171enum {
172 Opt_mode,
173 Opt_err
161}; 174};
162 175
176static const match_table_t tokens = {
177 {Opt_mode, "mode=%o"},
178 {Opt_err, NULL}
179};
180
181struct ramfs_fs_info {
182 struct ramfs_mount_opts mount_opts;
183};
184
185static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
186{
187 substring_t args[MAX_OPT_ARGS];
188 int option;
189 int token;
190 char *p;
191
192 opts->mode = RAMFS_DEFAULT_MODE;
193
194 while ((p = strsep(&data, ",")) != NULL) {
195 if (!*p)
196 continue;
197
198 token = match_token(p, tokens, args);
199 switch (token) {
200 case Opt_mode:
201 if (match_octal(&args[0], &option))
202 return -EINVAL;
203 opts->mode = option & S_IALLUGO;
204 break;
205 default:
206 printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
207 return -EINVAL;
208 }
209 }
210
211 return 0;
212}
213
163static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 214static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
164{ 215{
165 struct inode * inode; 216 struct ramfs_fs_info *fsi;
166 struct dentry * root; 217 struct inode *inode = NULL;
167 218 struct dentry *root;
168 sb->s_maxbytes = MAX_LFS_FILESIZE; 219 int err;
169 sb->s_blocksize = PAGE_CACHE_SIZE; 220
170 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 221 save_mount_options(sb, data);
171 sb->s_magic = RAMFS_MAGIC; 222
172 sb->s_op = &ramfs_ops; 223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
173 sb->s_time_gran = 1; 224 sb->s_fs_info = fsi;
174 inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0); 225 if (!fsi) {
175 if (!inode) 226 err = -ENOMEM;
176 return -ENOMEM; 227 goto fail;
228 }
229
230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err)
232 goto fail;
233
234 sb->s_maxbytes = MAX_LFS_FILESIZE;
235 sb->s_blocksize = PAGE_CACHE_SIZE;
236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
237 sb->s_magic = RAMFS_MAGIC;
238 sb->s_op = &ramfs_ops;
239 sb->s_time_gran = 1;
240
241 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
242 if (!inode) {
243 err = -ENOMEM;
244 goto fail;
245 }
177 246
178 root = d_alloc_root(inode); 247 root = d_alloc_root(inode);
248 sb->s_root = root;
179 if (!root) { 249 if (!root) {
180 iput(inode); 250 err = -ENOMEM;
181 return -ENOMEM; 251 goto fail;
182 } 252 }
183 sb->s_root = root; 253
184 return 0; 254 return 0;
255fail:
256 kfree(fsi);
257 sb->s_fs_info = NULL;
258 iput(inode);
259 return err;
185} 260}
186 261
187int ramfs_get_sb(struct file_system_type *fs_type, 262int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +272,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
197 mnt); 272 mnt);
198} 273}
199 274
275static void ramfs_kill_sb(struct super_block *sb)
276{
277 kfree(sb->s_fs_info);
278 kill_litter_super(sb);
279}
280
200static struct file_system_type ramfs_fs_type = { 281static struct file_system_type ramfs_fs_type = {
201 .name = "ramfs", 282 .name = "ramfs",
202 .get_sb = ramfs_get_sb, 283 .get_sb = ramfs_get_sb,
203 .kill_sb = kill_litter_super, 284 .kill_sb = ramfs_kill_sb,
204}; 285};
205static struct file_system_type rootfs_fs_type = { 286static struct file_system_type rootfs_fs_type = {
206 .name = "rootfs", 287 .name = "rootfs",
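
The ramfs option handling above is the stock mount-option parser: declare a match_table_t mapping tokens to printf-style patterns, split the option string on commas with strsep(), and dispatch on match_token(). A compact sketch of the same loop for a single hypothetical size=%d option:

#include <linux/parser.h>
#include <linux/string.h>

enum { Opt_size, Opt_err };

static const match_table_t demo_tokens = {
	{Opt_size, "size=%d"},
	{Opt_err, NULL}
};

static int demo_parse(char *data, int *size)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	while ((p = strsep(&data, ",")) != NULL) {
		if (!*p)
			continue;	/* tolerate empty ",," segments */

		switch (match_token(p, demo_tokens, args)) {
		case Opt_size:
			if (match_int(&args[0], &option))
				return -EINVAL;
			*size = option;
			break;
		default:
			return -EINVAL;	/* unknown option */
		}
	}
	return 0;
}

Note also the ramfs_kill_sb() pairing: once s_fs_info carries an allocation, the file_system_type needs a kill_sb that frees it, since kill_litter_super() alone would leak it.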
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
731 return ret; 731 return ret;
732} 732}
733 733
734static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
735{
736#define HALF_LONG_BITS (BITS_PER_LONG / 2)
737 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
738}
739
740SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
741 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
742{
743 loff_t pos = pos_from_hilo(pos_h, pos_l);
744 struct file *file;
745 ssize_t ret = -EBADF;
746 int fput_needed;
747
748 if (pos < 0)
749 return -EINVAL;
750
751 file = fget_light(fd, &fput_needed);
752 if (file) {
753 ret = -ESPIPE;
754 if (file->f_mode & FMODE_PREAD)
755 ret = vfs_readv(file, vec, vlen, &pos);
756 fput_light(file, fput_needed);
757 }
758
759 if (ret > 0)
760 add_rchar(current, ret);
761 inc_syscr(current);
762 return ret;
763}
764
765SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
766 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
767{
768 loff_t pos = pos_from_hilo(pos_h, pos_l);
769 struct file *file;
770 ssize_t ret = -EBADF;
771 int fput_needed;
772
773 if (pos < 0)
774 return -EINVAL;
775
776 file = fget_light(fd, &fput_needed);
777 if (file) {
778 ret = -ESPIPE;
779 if (file->f_mode & FMODE_PWRITE)
780 ret = vfs_writev(file, vec, vlen, &pos);
781 fput_light(file, fput_needed);
782 }
783
784 if (ret > 0)
785 add_wchar(current, ret);
786 inc_syscw(current);
787 return ret;
788}
789
734static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 790static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
735 size_t count, loff_t max) 791 size_t count, loff_t max)
736{ 792{
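
pos_from_hilo() above merits a note. On a 64-bit kernel HALF_LONG_BITS is 32 and the full offset arrives in pos_l, so high must be discarded; but composing it as one shift by BITS_PER_LONG would shift a 64-bit loff_t by 64, which C leaves undefined. Two half-width shifts are always in range and give the right answer on both 32-bit and 64-bit. A userspace illustration of the same helper, with int64_t standing in for loff_t:

#include <stdint.h>

#define HALF_LONG_BITS (sizeof(long) * 8 / 2)

static int64_t pos_from_hilo(unsigned long high, unsigned long low)
{
	/* 32-bit: two 16-bit shifts act as one 32-bit shift, giving
	 * (high << 32) | low.  64-bit: two 32-bit shifts push high out
	 * entirely, leaving low, which already holds the full offset. */
	return (((int64_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}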
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
1config REISERFS_FS 1config REISERFS_FS
2 tristate "Reiserfs support" 2 tristate "Reiserfs support"
3 select CRC32
3 help 4 help
4 Stores not just filenames but the files themselves in a balanced 5 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling. 6 tree. Uses journalling.
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 67a80d7e59e2..45ee3d357c70 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -41,6 +41,18 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
41 41
42#define store_ih(where,what) copy_item_head (where, what) 42#define store_ih(where,what) copy_item_head (where, what)
43 43
44static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh)
46{
47 int ret = 0;
48#ifdef CONFIG_REISERFS_FS_XATTR
49 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
50 ret = (dir == dir->d_parent && privroot->d_inode &&
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
52#endif
53 return ret;
54}
55
44int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 56int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
45 filldir_t filldir, loff_t *pos) 57 filldir_t filldir, loff_t *pos)
46{ 58{
@@ -138,18 +150,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
138 } 150 }
139 151
140 /* Ignore the .reiserfs_priv entry */ 152 /* Ignore the .reiserfs_priv entry */
141 if (reiserfs_xattrs(inode->i_sb) && 153 if (is_privroot_deh(dentry, deh))
142 !old_format_only(inode->i_sb) &&
143 dentry == inode->i_sb->s_root &&
144 REISERFS_SB(inode->i_sb)->priv_root &&
145 REISERFS_SB(inode->i_sb)->priv_root->d_inode
146 && deh_objectid(deh) ==
147 le32_to_cpu(INODE_PKEY
148 (REISERFS_SB(inode->i_sb)->
149 priv_root->d_inode)->
150 k_objectid)) {
151 continue; 154 continue;
152 }
153 155
154 d_off = deh_offset(deh); 156 d_off = deh_offset(deh);
155 *pos = d_off; 157 *pos = d_off;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index efd4d720718e..271579128634 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -338,21 +338,8 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
338 &path_to_entry, &de); 338 &path_to_entry, &de);
339 pathrelse(&path_to_entry); 339 pathrelse(&path_to_entry);
340 if (retval == NAME_FOUND) { 340 if (retval == NAME_FOUND) {
341 /* Hide the .reiserfs_priv directory */ 341 inode = reiserfs_iget(dir->i_sb,
342 if (reiserfs_xattrs(dir->i_sb) && 342 (struct cpu_key *)&(de.de_dir_id));
343 !old_format_only(dir->i_sb) &&
344 REISERFS_SB(dir->i_sb)->priv_root &&
345 REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
346 de.de_objectid ==
347 le32_to_cpu(INODE_PKEY
348 (REISERFS_SB(dir->i_sb)->priv_root->d_inode)->
349 k_objectid)) {
350 reiserfs_write_unlock(dir->i_sb);
351 return ERR_PTR(-EACCES);
352 }
353
354 inode =
355 reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
356 if (!inode || IS_ERR(inode)) { 343 if (!inode || IS_ERR(inode)) {
357 reiserfs_write_unlock(dir->i_sb); 344 reiserfs_write_unlock(dir->i_sb);
358 return ERR_PTR(-EACCES); 345 return ERR_PTR(-EACCES);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c62896..3567fb9e3fb1 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -447,13 +448,11 @@ int remove_save_link(struct inode *inode, int truncate)
447static void reiserfs_kill_sb(struct super_block *s) 448static void reiserfs_kill_sb(struct super_block *s)
448{ 449{
449 if (REISERFS_SB(s)) { 450 if (REISERFS_SB(s)) {
450#ifdef CONFIG_REISERFS_FS_XATTR
451 if (REISERFS_SB(s)->xattr_root) { 451 if (REISERFS_SB(s)->xattr_root) {
452 d_invalidate(REISERFS_SB(s)->xattr_root); 452 d_invalidate(REISERFS_SB(s)->xattr_root);
453 dput(REISERFS_SB(s)->xattr_root); 453 dput(REISERFS_SB(s)->xattr_root);
454 REISERFS_SB(s)->xattr_root = NULL; 454 REISERFS_SB(s)->xattr_root = NULL;
455 } 455 }
456#endif
457 if (REISERFS_SB(s)->priv_root) { 456 if (REISERFS_SB(s)->priv_root) {
458 d_invalidate(REISERFS_SB(s)->priv_root); 457 d_invalidate(REISERFS_SB(s)->priv_root);
459 dput(REISERFS_SB(s)->priv_root); 458 dput(REISERFS_SB(s)->priv_root);
@@ -1315,8 +1314,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1315 } 1314 }
1316 1315
1317out_ok: 1316out_ok:
1318 kfree(s->s_options); 1317 replace_mount_options(s, new_opts);
1319 s->s_options = new_opts;
1320 return 0; 1318 return 0;
1321 1319
1322out_err: 1320out_err:
@@ -1841,7 +1839,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1841 goto error; 1839 goto error;
1842 } 1840 }
1843 1841
1844 if ((errval = reiserfs_xattr_init(s, s->s_flags))) { 1842 if ((errval = reiserfs_lookup_privroot(s)) ||
1843 (errval = reiserfs_xattr_init(s, s->s_flags))) {
1845 dput(s->s_root); 1844 dput(s->s_root);
1846 s->s_root = NULL; 1845 s->s_root = NULL;
1847 goto error; 1846 goto error;
@@ -1854,7 +1853,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1854 reiserfs_info(s, "using 3.5.x disk format\n"); 1853 reiserfs_info(s, "using 3.5.x disk format\n");
1855 } 1854 }
1856 1855
1857 if ((errval = reiserfs_xattr_init(s, s->s_flags))) { 1856 if ((errval = reiserfs_lookup_privroot(s)) ||
1857 (errval = reiserfs_xattr_init(s, s->s_flags))) {
1858 dput(s->s_root); 1858 dput(s->s_root);
1859 s->s_root = NULL; 1859 s->s_root = NULL;
1860 goto error; 1860 goto error;
@@ -1904,6 +1904,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1904 buf->f_bsize = dentry->d_sb->s_blocksize; 1904 buf->f_bsize = dentry->d_sb->s_blocksize;
1905 /* changed to accommodate gcc folks. */ 1905 /* changed to accommodate gcc folks. */
1906 buf->f_type = REISERFS_SUPER_MAGIC; 1906 buf->f_type = REISERFS_SUPER_MAGIC;
1907 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1908 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1909 sizeof(rs->s_uuid)/2);
1910
1907 return 0; 1911 return 0;
1908} 1912}
1909 1913
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f83f52bae390..8e7deb0e6964 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -113,41 +113,30 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
113 113
114#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) 114#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
115 115
116/* Returns and possibly creates the xattr dir. */ 116static struct dentry *open_xa_root(struct super_block *sb, int flags)
117static struct dentry *lookup_or_create_dir(struct dentry *parent,
118 const char *name, int flags)
119{ 117{
120 struct dentry *dentry; 118 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
121 BUG_ON(!parent); 119 struct dentry *xaroot;
122 120 if (!privroot->d_inode)
123 dentry = lookup_one_len(name, parent, strlen(name)); 121 return ERR_PTR(-ENODATA);
124 if (IS_ERR(dentry))
125 return dentry;
126 else if (!dentry->d_inode) {
127 int err = -ENODATA;
128 122
129 if (xattr_may_create(flags)) { 123 mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
130 mutex_lock_nested(&parent->d_inode->i_mutex,
131 I_MUTEX_XATTR);
132 err = xattr_mkdir(parent->d_inode, dentry, 0700);
133 mutex_unlock(&parent->d_inode->i_mutex);
134 }
135 124
125 xaroot = dget(REISERFS_SB(sb)->xattr_root);
126 if (!xaroot)
127 xaroot = ERR_PTR(-ENODATA);
128 else if (!xaroot->d_inode) {
129 int err = -ENODATA;
130 if (xattr_may_create(flags))
131 err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
136 if (err) { 132 if (err) {
137 dput(dentry); 133 dput(xaroot);
138 dentry = ERR_PTR(err); 134 xaroot = ERR_PTR(err);
139 } 135 }
140 } 136 }
141 137
142 return dentry; 138 mutex_unlock(&privroot->d_inode->i_mutex);
143} 139 return xaroot;
144
145static struct dentry *open_xa_root(struct super_block *sb, int flags)
146{
147 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
148 if (!privroot)
149 return ERR_PTR(-ENODATA);
150 return lookup_or_create_dir(privroot, XAROOT_NAME, flags);
151} 140}
152 141
153static struct dentry *open_xa_dir(const struct inode *inode, int flags) 142static struct dentry *open_xa_dir(const struct inode *inode, int flags)
@@ -163,10 +152,22 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
163 le32_to_cpu(INODE_PKEY(inode)->k_objectid), 152 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
164 inode->i_generation); 153 inode->i_generation);
165 154
166 xadir = lookup_or_create_dir(xaroot, namebuf, flags); 155 mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR);
156
157 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
158 if (!IS_ERR(xadir) && !xadir->d_inode) {
159 int err = -ENODATA;
160 if (xattr_may_create(flags))
161 err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
162 if (err) {
163 dput(xadir);
164 xadir = ERR_PTR(err);
165 }
166 }
167
168 mutex_unlock(&xaroot->d_inode->i_mutex);
167 dput(xaroot); 169 dput(xaroot);
168 return xadir; 170 return xadir;
169
170} 171}
171 172
172/* The following are side effects of other operations that aren't explicitly 173/* The following are side effects of other operations that aren't explicitly
@@ -184,6 +185,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
184{ 185{
185 struct reiserfs_dentry_buf *dbuf = buf; 186 struct reiserfs_dentry_buf *dbuf = buf;
186 struct dentry *dentry; 187 struct dentry *dentry;
188 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
187 189
188 if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) 190 if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
189 return -ENOSPC; 191 return -ENOSPC;
@@ -349,6 +351,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
349 if (IS_ERR(xadir)) 351 if (IS_ERR(xadir))
350 return ERR_CAST(xadir); 352 return ERR_CAST(xadir);
351 353
354 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
352 xafile = lookup_one_len(name, xadir, strlen(name)); 355 xafile = lookup_one_len(name, xadir, strlen(name));
353 if (IS_ERR(xafile)) { 356 if (IS_ERR(xafile)) {
354 err = PTR_ERR(xafile); 357 err = PTR_ERR(xafile);
@@ -360,18 +363,15 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
360 363
361 if (!xafile->d_inode) { 364 if (!xafile->d_inode) {
362 err = -ENODATA; 365 err = -ENODATA;
363 if (xattr_may_create(flags)) { 366 if (xattr_may_create(flags))
364 mutex_lock_nested(&xadir->d_inode->i_mutex,
365 I_MUTEX_XATTR);
366 err = xattr_create(xadir->d_inode, xafile, 367 err = xattr_create(xadir->d_inode, xafile,
367 0700|S_IFREG); 368 0700|S_IFREG);
368 mutex_unlock(&xadir->d_inode->i_mutex);
369 }
370 } 369 }
371 370
372 if (err) 371 if (err)
373 dput(xafile); 372 dput(xafile);
374out: 373out:
374 mutex_unlock(&xadir->d_inode->i_mutex);
375 dput(xadir); 375 dput(xadir);
376 if (err) 376 if (err)
377 return ERR_PTR(err); 377 return ERR_PTR(err);
@@ -435,6 +435,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
435 if (IS_ERR(xadir)) 435 if (IS_ERR(xadir))
436 return PTR_ERR(xadir); 436 return PTR_ERR(xadir);
437 437
438 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
438 dentry = lookup_one_len(name, xadir, strlen(name)); 439 dentry = lookup_one_len(name, xadir, strlen(name));
439 if (IS_ERR(dentry)) { 440 if (IS_ERR(dentry)) {
440 err = PTR_ERR(dentry); 441 err = PTR_ERR(dentry);
@@ -442,14 +443,13 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
442 } 443 }
443 444
444 if (dentry->d_inode) { 445 if (dentry->d_inode) {
445 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
446 err = xattr_unlink(xadir->d_inode, dentry); 446 err = xattr_unlink(xadir->d_inode, dentry);
447 mutex_unlock(&xadir->d_inode->i_mutex);
448 update_ctime(inode); 447 update_ctime(inode);
449 } 448 }
450 449
451 dput(dentry); 450 dput(dentry);
452out_dput: 451out_dput:
452 mutex_unlock(&xadir->d_inode->i_mutex);
453 dput(xadir); 453 dput(xadir);
454 return err; 454 return err;
455} 455}
@@ -687,20 +687,6 @@ out:
687 return err; 687 return err;
688} 688}
689 689
690/* Actual operations that are exported to VFS-land */
691struct xattr_handler *reiserfs_xattr_handlers[] = {
692 &reiserfs_xattr_user_handler,
693 &reiserfs_xattr_trusted_handler,
694#ifdef CONFIG_REISERFS_FS_SECURITY
695 &reiserfs_xattr_security_handler,
696#endif
697#ifdef CONFIG_REISERFS_FS_POSIX_ACL
698 &reiserfs_posix_acl_access_handler,
699 &reiserfs_posix_acl_default_handler,
700#endif
701 NULL
702};
703
704/* 690/*
705 * In order to implement different sets of xattr operations for each xattr 691 * In order to implement different sets of xattr operations for each xattr
706 * prefix with the generic xattr API, a filesystem should create a 692 * prefix with the generic xattr API, a filesystem should create a
@@ -843,7 +829,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
843 if (!dentry->d_inode) 829 if (!dentry->d_inode)
844 return -EINVAL; 830 return -EINVAL;
845 831
846 if (!reiserfs_xattrs(dentry->d_sb) || 832 if (!dentry->d_sb->s_xattr ||
847 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 833 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
848 return -EOPNOTSUPP; 834 return -EOPNOTSUPP;
849 835
@@ -885,42 +871,50 @@ static int reiserfs_check_acl(struct inode *inode, int mask)
885 return error; 871 return error;
886} 872}
887 873
888int reiserfs_permission(struct inode *inode, int mask)
889{
890 /*
891 * We don't do permission checks on the internal objects.
892 * Permissions are determined by the "owning" object.
893 */
894 if (IS_PRIVATE(inode))
895 return 0;
896 /*
897 * Stat data v1 doesn't support ACLs.
898 */
899 if (get_inode_sd_version(inode) == STAT_DATA_V1)
900 return generic_permission(inode, mask, NULL);
901 else
902 return generic_permission(inode, mask, reiserfs_check_acl);
903}
904
905static int create_privroot(struct dentry *dentry) 874static int create_privroot(struct dentry *dentry)
906{ 875{
907 int err; 876 int err;
908 struct inode *inode = dentry->d_parent->d_inode; 877 struct inode *inode = dentry->d_parent->d_inode;
909 mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); 878 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
879
910 err = xattr_mkdir(inode, dentry, 0700); 880 err = xattr_mkdir(inode, dentry, 0700);
911 mutex_unlock(&inode->i_mutex); 881 if (err || !dentry->d_inode) {
912 if (err) { 882 reiserfs_warning(dentry->d_sb, "jdm-20006",
913 dput(dentry); 883 "xattrs/ACLs enabled and couldn't "
914 dentry = NULL; 884 "find/create .reiserfs_priv. "
885 "Failing mount.");
886 return -EOPNOTSUPP;
915 } 887 }
916 888
917 if (dentry && dentry->d_inode) 889 dentry->d_inode->i_flags |= S_PRIVATE;
918 reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " 890 reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
919 "storage.\n", PRIVROOT_NAME); 891 "storage.\n", PRIVROOT_NAME);
920 892
921 return err; 893 return 0;
922} 894}
923 895
896#else
897int __init reiserfs_xattr_register_handlers(void) { return 0; }
898void reiserfs_xattr_unregister_handlers(void) {}
899static int create_privroot(struct dentry *dentry) { return 0; }
900#endif
901
902/* Actual operations that are exported to VFS-land */
903struct xattr_handler *reiserfs_xattr_handlers[] = {
904#ifdef CONFIG_REISERFS_FS_XATTR
905 &reiserfs_xattr_user_handler,
906 &reiserfs_xattr_trusted_handler,
907#endif
908#ifdef CONFIG_REISERFS_FS_SECURITY
909 &reiserfs_xattr_security_handler,
910#endif
911#ifdef CONFIG_REISERFS_FS_POSIX_ACL
912 &reiserfs_posix_acl_access_handler,
913 &reiserfs_posix_acl_default_handler,
914#endif
915 NULL
916};
917
924static int xattr_mount_check(struct super_block *s) 918static int xattr_mount_check(struct super_block *s)
925{ 919{
926 /* We need generation numbers to ensure that the oid mapping is correct 920 /* We need generation numbers to ensure that the oid mapping is correct
@@ -940,21 +934,33 @@ static int xattr_mount_check(struct super_block *s)
940 return 0; 934 return 0;
941} 935}
942 936
943#else 937int reiserfs_permission(struct inode *inode, int mask)
944int __init reiserfs_xattr_register_handlers(void) { return 0; } 938{
945void reiserfs_xattr_unregister_handlers(void) {} 939 /*
940 * We don't do permission checks on the internal objects.
941 * Permissions are determined by the "owning" object.
942 */
943 if (IS_PRIVATE(inode))
944 return 0;
945
946#ifdef CONFIG_REISERFS_FS_XATTR
947 /*
948 * Stat data v1 doesn't support ACLs.
949 */
950 if (get_inode_sd_version(inode) != STAT_DATA_V1)
951 return generic_permission(inode, mask, reiserfs_check_acl);
946#endif 952#endif
953 return generic_permission(inode, mask, NULL);
954}
947 955
948/* This will catch lookups from the fs root to .reiserfs_priv */ 956/* This will catch lookups from the fs root to .reiserfs_priv */
949static int 957static int
950xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) 958xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
951{ 959{
952 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; 960 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
953 if (name->len == priv_root->d_name.len && 961 if (container_of(q1, struct dentry, d_name) == priv_root)
954 name->hash == priv_root->d_name.hash &&
955 !memcmp(name->name, priv_root->d_name.name, name->len)) {
956 return -ENOENT; 962 return -ENOENT;
957 } else if (q1->len == name->len && 963 if (q1->len == name->len &&
958 !memcmp(q1->name, name->name, name->len)) 964 !memcmp(q1->name, name->name, name->len))
959 return 0; 965 return 0;
960 return 1; 966 return 1;
@@ -964,73 +970,71 @@ static const struct dentry_operations xattr_lookup_poison_ops = {
964 .d_compare = xattr_lookup_poison, 970 .d_compare = xattr_lookup_poison,
965}; 971};
966 972
973int reiserfs_lookup_privroot(struct super_block *s)
974{
975 struct dentry *dentry;
976 int err = 0;
977
978 /* If we don't have the privroot located yet - go find it */
979 mutex_lock(&s->s_root->d_inode->i_mutex);
980 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
981 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else
988 err = PTR_ERR(dentry);
989 mutex_unlock(&s->s_root->d_inode->i_mutex);
990
991 return err;
992}
993
967/* We need to take a copy of the mount flags since things like 994/* We need to take a copy of the mount flags since things like
968 * MS_RDONLY don't get set until *after* we're called. 995 * MS_RDONLY don't get set until *after* we're called.
969 * mount_flags != mount_options */ 996 * mount_flags != mount_options */
970int reiserfs_xattr_init(struct super_block *s, int mount_flags) 997int reiserfs_xattr_init(struct super_block *s, int mount_flags)
971{ 998{
972 int err = 0; 999 int err = 0;
1000 struct dentry *privroot = REISERFS_SB(s)->priv_root;
973 1001
974#ifdef CONFIG_REISERFS_FS_XATTR
975 err = xattr_mount_check(s); 1002 err = xattr_mount_check(s);
976 if (err) 1003 if (err)
977 goto error; 1004 goto error;
978#endif
979 1005
980 /* If we don't have the privroot located yet - go find it */ 1006 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
981 if (!REISERFS_SB(s)->priv_root) { 1007 mutex_lock(&s->s_root->d_inode->i_mutex);
982 struct dentry *dentry; 1008 err = create_privroot(REISERFS_SB(s)->priv_root);
983 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 1009 mutex_unlock(&s->s_root->d_inode->i_mutex);
984 strlen(PRIVROOT_NAME));
985 if (!IS_ERR(dentry)) {
986#ifdef CONFIG_REISERFS_FS_XATTR
987 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode)
988 err = create_privroot(dentry);
989#endif
990 if (!dentry->d_inode) {
991 dput(dentry);
992 dentry = NULL;
993 }
994 } else
995 err = PTR_ERR(dentry);
996
997 if (!err && dentry) {
998 s->s_root->d_op = &xattr_lookup_poison_ops;
999 dentry->d_inode->i_flags |= S_PRIVATE;
1000 REISERFS_SB(s)->priv_root = dentry;
1001#ifdef CONFIG_REISERFS_FS_XATTR
1002 /* xattrs are unavailable */
1003 } else if (!(mount_flags & MS_RDONLY)) {
1004 /* If we're read-only it just means that the dir
1005 * hasn't been created. Not an error -- just no
1006 * xattrs on the fs. We'll check again if we
1007 * go read-write */
1008 reiserfs_warning(s, "jdm-20006",
1009 "xattrs/ACLs enabled and couldn't "
1010 "find/create .reiserfs_priv. "
1011 "Failing mount.");
1012 err = -EOPNOTSUPP;
1013#endif
1014 }
1015 } 1010 }
1016 1011
1017#ifdef CONFIG_REISERFS_FS_XATTR 1012 if (privroot->d_inode) {
1018 if (!err)
1019 s->s_xattr = reiserfs_xattr_handlers; 1013 s->s_xattr = reiserfs_xattr_handlers;
1014 mutex_lock(&privroot->d_inode->i_mutex);
1015 if (!REISERFS_SB(s)->xattr_root) {
1016 struct dentry *dentry;
1017 dentry = lookup_one_len(XAROOT_NAME, privroot,
1018 strlen(XAROOT_NAME));
1019 if (!IS_ERR(dentry))
1020 REISERFS_SB(s)->xattr_root = dentry;
1021 else
1022 err = PTR_ERR(dentry);
1023 }
1024 mutex_unlock(&privroot->d_inode->i_mutex);
1025 }
1020 1026
1021error: 1027error:
1022 if (err) { 1028 if (err) {
1023 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); 1029 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
1024 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); 1030 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
1025 } 1031 }
1026#endif
1027 1032
1028 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ 1033 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
1029 s->s_flags = s->s_flags & ~MS_POSIXACL;
1030#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1031 if (reiserfs_posixacl(s)) 1034 if (reiserfs_posixacl(s))
1032 s->s_flags |= MS_POSIXACL; 1035 s->s_flags |= MS_POSIXACL;
1033#endif 1036 else
1037 s->s_flags &= ~MS_POSIXACL;
1034 1038
1035 return err; 1039 return err;
1036} 1040}
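
The rework above repeatedly applies one pattern: hold the parent directory's i_mutex (nested with I_MUTEX_XATTR) across lookup_one_len() and any subsequent xattr_mkdir()/xattr_create(), so the lookup and the create are atomic with respect to other xattr operations on the same tree. A condensed, kernel-context sketch of that pattern (lookup_or_create_locked is a hypothetical name; the real call sites are open_xa_root, open_xa_dir and xattr_lookup in the hunks above):

/* sketch: look up "name" under dir, creating it if absent and the flags
 * allow; returns a referenced dentry or an ERR_PTR */
static struct dentry *lookup_or_create_locked(struct dentry *dir,
					      const char *name, int flags)
{
	struct dentry *dentry;
	int err;

	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
	dentry = lookup_one_len(name, dir, strlen(name));
	if (!IS_ERR(dentry) && !dentry->d_inode) {
		err = -ENODATA;
		if (xattr_may_create(flags))
			err = xattr_mkdir(dir->d_inode, dentry, 0700);
		if (err) {
			dput(dentry);
			dentry = ERR_PTR(err);
		}
	}
	mutex_unlock(&dir->d_inode->i_mutex);
	return dentry;
}
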
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d1..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
428 } else { 428 } else {
429 apply_umask: 429 apply_umask:
430 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
431 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
432 } 432 }
433 433
434 return err; 434 return err;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 4d3c20e787c3..a92c8792c0f6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -55,8 +55,16 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
55 struct reiserfs_security_handle *sec) 55 struct reiserfs_security_handle *sec)
56{ 56{
57 int blocks = 0; 57 int blocks = 0;
58 int error = security_inode_init_security(inode, dir, &sec->name, 58 int error;
59 &sec->value, &sec->length); 59
60 sec->name = NULL;
61
62 /* Don't add selinux attributes on xattrs - they'll never get used */
63 if (IS_PRIVATE(dir))
64 return 0;
65
66 error = security_inode_init_security(inode, dir, &sec->name,
67 &sec->value, &sec->length);
60 if (error) { 68 if (error) {
61 if (error == -EOPNOTSUPP) 69 if (error == -EOPNOTSUPP)
62 error = 0; 70 error = 0;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..95217b830118
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
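
ROMFS_I() works because vfs_inode is embedded inside romfs_inode_info (now deliberately placed first, making the pointer subtraction a no-op). A tiny, runnable userspace demonstration of the container_of pattern it relies on, with a local stand-in for the kernel macro:

#include <stddef.h>
#include <stdio.h>

/* userspace stand-in for the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode { int dummy; };            /* stand-in VFS inode */

struct romfs_inode_info {
	struct inode vfs_inode;         /* embedded first, as above */
	unsigned long i_metasize;
	unsigned long i_dataoffset;
};

static struct romfs_inode_info *ROMFS_I(struct inode *inode)
{
	return container_of(inode, struct romfs_inode_info, vfs_inode);
}

int main(void)
{
	struct romfs_inode_info info = { .i_metasize = 32, .i_dataoffset = 96 };
	struct inode *vfs = &info.vfs_inode;   /* what the VFS hands back */

	printf("metasize=%lu dataoffset=%lu\n",
	       ROMFS_I(vfs)->i_metasize, ROMFS_I(vfs)->i_dataoffset);
	return 0;
}
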
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
18 * - only supported for NOMMU at the moment (MMU can't doesn't copy private
19 * mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit a R/O mapping to be made directly through onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
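
Worth noting in romfs_get_unmapped_area() above: the bounds test is written as "offset > isize || len > isize || offset > isize - len" precisely so that offset + len is never computed, since that sum can wrap for a hostile pgoff. A small runnable demonstration of the difference:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long isize  = 0x1000;             /* file size */
	unsigned long offset = ULONG_MAX - 0x10;   /* hostile pgoff << PAGE_SHIFT */
	unsigned long len    = 0x20;

	/* naive check: offset + len wraps to a small value and passes */
	printf("naive rejects: %d\n", offset + len > isize);

	/* the patch's form: no addition, so no wrap */
	printf("safe rejects:  %d\n",
	       offset > isize || len > isize || offset > isize - len);
	return 0;
}

The naive form prints 0 (mapping accepted) while the subtraction form prints 1 (mapping rejected), which is why the three-part test is used.
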
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..b3208adf8e71
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,293 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
25 * read data from an romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[17];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time, and attempt to grab the
78 * trailing NUL whilst we're at it */
79 buf[0] = 0xff;
80
81 while (size > 0) {
82 segment = min_t(size_t, size + 1, 17);
83 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
84 if (ret < 0)
85 return ret;
86 len--;
87 if (memcmp(buf, str, len) != 0)
88 return 0;
89 buf[0] = buf[len];
90 size -= len;
91 pos += len;
92 str += len;
93 }
94
95 /* check that the trailing NUL was present */
96 if (buf[0])
97 return 0;
98
99 return 1;
100}
101#endif /* CONFIG_ROMFS_ON_MTD */
102
103#ifdef CONFIG_ROMFS_ON_BLOCK
104/*
105 * read data from an romfs image on a block device
106 */
107static int romfs_blk_read(struct super_block *sb, unsigned long pos,
108 void *buf, size_t buflen)
109{
110 struct buffer_head *bh;
111 unsigned long offset;
112 size_t segment;
113
114 /* copy the string up to blocksize bytes at a time */
115 while (buflen > 0) {
116 offset = pos & (ROMBSIZE - 1);
117 segment = min_t(size_t, buflen, ROMBSIZE - offset);
118 bh = sb_bread(sb, pos >> ROMBSBITS);
119 if (!bh)
120 return -EIO;
121 memcpy(buf, bh->b_data + offset, segment);
122 brelse(bh);
123 buf += segment;
124 buflen -= segment;
125 pos += segment;
126 }
127
128 return 0;
129}
130
131/*
132 * determine the length of a string in romfs on a block device
133 */
134static ssize_t romfs_blk_strnlen(struct super_block *sb,
135 unsigned long pos, size_t limit)
136{
137 struct buffer_head *bh;
138 unsigned long offset;
139 ssize_t n = 0;
140 size_t segment;
141 u_char *buf, *p;
142
143 /* scan the string up to blocksize bytes at a time */
144 while (limit > 0) {
145 offset = pos & (ROMBSIZE - 1);
146 segment = min_t(size_t, limit, ROMBSIZE - offset);
147 bh = sb_bread(sb, pos >> ROMBSBITS);
148 if (!bh)
149 return -EIO;
150 buf = bh->b_data + offset;
151 p = memchr(buf, 0, segment);
152 brelse(bh);
153 if (p)
154 return n + (p - buf);
155 limit -= segment;
156 pos += segment;
157 n += segment;
158 }
159
160 return n;
161}
162
163/*
164 * compare a string to one in a romfs image on a block device
165 * - return 1 if matched, 0 if differ, -ve if error
166 */
167static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
168 const char *str, size_t size)
169{
170 struct buffer_head *bh;
171 unsigned long offset;
172 size_t segment;
173 bool matched, terminated = false;
174
175 /* compare string up to a block at a time */
176 while (size > 0) {
177 offset = pos & (ROMBSIZE - 1);
178 segment = min_t(size_t, size, ROMBSIZE - offset);
179 bh = sb_bread(sb, pos >> ROMBSBITS);
180 if (!bh)
181 return -EIO;
182 matched = (memcmp(bh->b_data + offset, str, segment) == 0);
183
184 size -= segment;
185 pos += segment;
186 str += segment;
187 if (matched && size == 0 && offset + segment < ROMBSIZE) {
188 if (!bh->b_data[offset + segment])
189 terminated = true;
190 else
191 matched = false;
192 }
193 brelse(bh);
194 if (!matched)
195 return 0;
196 }
197
198 if (!terminated) {
199 /* the terminating NUL must be on the first byte of the next
200 * block */
201 BUG_ON((pos & (ROMBSIZE - 1)) != 0);
202 bh = sb_bread(sb, pos >> ROMBSBITS);
203 if (!bh)
204 return -EIO;
205 matched = !bh->b_data[0];
206 brelse(bh);
207 if (!matched)
208 return 0;
209 }
210
211 return 1;
212}
213#endif /* CONFIG_ROMFS_ON_BLOCK */
214
215/*
216 * read data from the romfs image
217 */
218int romfs_dev_read(struct super_block *sb, unsigned long pos,
219 void *buf, size_t buflen)
220{
221 size_t limit;
222
223 limit = romfs_maxsize(sb);
224 if (pos >= limit)
225 return -EIO;
226 if (buflen > limit - pos)
227 buflen = limit - pos;
228
229#ifdef CONFIG_ROMFS_ON_MTD
230 if (sb->s_mtd)
231 return romfs_mtd_read(sb, pos, buf, buflen);
232#endif
233#ifdef CONFIG_ROMFS_ON_BLOCK
234 if (sb->s_bdev)
235 return romfs_blk_read(sb, pos, buf, buflen);
236#endif
237 return -EIO;
238}
239
240/*
241 * determine the length of a string in romfs
242 */
243ssize_t romfs_dev_strnlen(struct super_block *sb,
244 unsigned long pos, size_t maxlen)
245{
246 size_t limit;
247
248 limit = romfs_maxsize(sb);
249 if (pos >= limit)
250 return -EIO;
251 if (maxlen > limit - pos)
252 maxlen = limit - pos;
253
254#ifdef CONFIG_ROMFS_ON_MTD
255 if (sb->s_mtd)
256 return romfs_mtd_strnlen(sb, pos, maxlen);
257#endif
258#ifdef CONFIG_ROMFS_ON_BLOCK
259 if (sb->s_bdev)
260 return romfs_blk_strnlen(sb, pos, maxlen);
261#endif
262 return -EIO;
263}
264
265/*
266 * compare a string to one in romfs
267 * - the string to be compared to, str, may not be NUL-terminated; instead the
268 * string is of the specified size
269 * - return 1 if matched, 0 if differ, -ve if error
270 */
271int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
272 const char *str, size_t size)
273{
274 size_t limit;
275
276 limit = romfs_maxsize(sb);
277 if (pos >= limit)
278 return -EIO;
279 if (size > ROMFS_MAXFN)
280 return -ENAMETOOLONG;
281 if (size + 1 > limit - pos)
282 return -EIO;
283
284#ifdef CONFIG_ROMFS_ON_MTD
285 if (sb->s_mtd)
286 return romfs_mtd_strcmp(sb, pos, str, size);
287#endif
288#ifdef CONFIG_ROMFS_ON_BLOCK
289 if (sb->s_bdev)
290 return romfs_blk_strcmp(sb, pos, str, size);
291#endif
292 return -EIO;
293}
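
All three comparison helpers share a tri-state return convention: 1 for match, 0 for differ, negative errno on error, so a caller must not treat the result as a boolean. A hedged kernel-context sketch of a conforming caller (match_entry is a hypothetical name; the real consumer is the lookup loop in super.c below):

/* sketch: returns 0 on success with *found set, -ve errno on I/O error */
static int match_entry(struct super_block *sb, unsigned long pos,
		       const char *name, size_t len, bool *found)
{
	int ret = romfs_dev_strcmp(sb, pos + ROMFH_SIZE, name, len);

	if (ret < 0)		/* -EIO / -ENAMETOOLONG: propagate */
		return ret;
	*found = (ret == 1);	/* 1 == matched, 0 == differ */
	return 0;
}
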
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..4ab3c03d8f95
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,654 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
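The readpage implementation above reduces to one piece of arithmetic: clamp the page's file offset against i_size, copy at most a page from the image, and zero-fill the tail. A minimal user-space sketch of that computation (illustrative names, not kernel API; assumes the whole image is already in memory):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096u

/* Copy one page worth of file data from an in-memory image; returns the
 * number of bytes actually copied (the rest of buf is zeroed), mirroring
 * the offset/fillsize logic of romfs_readpage. */
static size_t fill_page(const uint8_t *image, size_t file_size,
                        size_t data_off, size_t page_index, uint8_t *buf)
{
        size_t offset = page_index * PAGE_SZ;
        size_t fill = 0;

        if (offset < file_size) {
                size_t remain = file_size - offset;
                fill = remain > PAGE_SZ ? PAGE_SZ : remain;
                memcpy(buf, image + data_off + offset, fill);
        }
        memset(buf + fill, 0, PAGE_SZ - fill);  /* zero the tail */
        return fill;
}

int main(void)
{
        static uint8_t image[2 * PAGE_SZ];
        uint8_t buf[PAGE_SZ];
        size_t file_size = PAGE_SZ + 100;

        memset(image, 0xAB, sizeof(image));
        printf("page 0: %zu bytes\n", fill_page(image, file_size, 0, 0, buf));
        printf("page 1: %zu bytes\n", fill_page(image, file_size, 0, 1, buf));
        return 0;
}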
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
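readdir above is a walk over a singly linked list of 16-byte-aligned file headers: the low four bits of the big-endian 'next' word carry the type, the remaining bits (ROMFH_MASK) give the offset of the next header, and zero terminates the list. A hedged user-space sketch of the same walk over an in-memory image (the 16-byte header size and 4-bit type field mirror the on-disk format; treat the constants as assumptions):

#include <arpa/inet.h>  /* ntohl/htonl: romfs is big-endian on disk */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FH_SIZE 16u        /* fixed part of a file header */
#define FH_MASK (~15u)     /* offset bits of the 'next' word */

struct romfh {             /* first two on-disk words only */
        uint32_t next;     /* type in bits 0-3, next offset above */
        uint32_t spec;
};

static void walk_dir(const uint8_t *img, uint32_t off, uint32_t maxoff)
{
        while (off && off < maxoff) {
                const struct romfh *fh = (const void *)(img + off);
                uint32_t next = ntohl(fh->next);

                /* the NUL-padded name starts right after the header */
                printf("%8u type=%u %s\n", (unsigned)off,
                       (unsigned)(next & 15),
                       (const char *)(img + off + FH_SIZE));
                off = next & FH_MASK;
        }
}

int main(void)
{
        static uint8_t img[4096];
        struct romfh *a = (struct romfh *)(img + 96);
        struct romfh *b = (struct romfh *)(img + 160);

        a->next = htonl(160 | 2);          /* regular file, next at 160 */
        strcpy((char *)(img + 96 + FH_SIZE), "hello.txt");
        b->next = htonl(0 | 1);            /* directory, end of list */
        strcpy((char *)(img + 160 + FH_SIZE), "subdir");
        walk_dir(img, 96, sizeof(img));
        return 0;
}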
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
 270 * (Although, as far as I can see, it only matters on writable file
 271 * systems.)
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301 unsigned nextfh;
302 int ret;
303 umode_t mode;
304
305 /* we might have to traverse a chain of "hard link" file entries to get
306 * to the actual file */
307 for (;;) {
308 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
309 if (ret < 0)
310 goto error;
311
312 /* XXX: do romfs_checksum here too (with name) */
313
314 nextfh = be32_to_cpu(ri.next);
315 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
316 break;
317
318 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
319 }
320
321 /* determine the length of the filename */
322 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
323 if (IS_ERR_VALUE(nlen))
324 goto eio;
325
326 /* get an inode for this image position */
327 i = iget_locked(sb, pos);
328 if (!i)
329 return ERR_PTR(-ENOMEM);
330
331 if (!(i->i_state & I_NEW))
332 return i;
333
334 /* precalculate the data offset */
335 inode = ROMFS_I(i);
336 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
337 inode->i_dataoffset = pos + inode->i_metasize;
338
339 i->i_nlink = 1; /* Hard to decide.. */
340 i->i_size = be32_to_cpu(ri.size);
341 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
342 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
343
344 /* set up mode and ops */
345 mode = romfs_modemap[nextfh & ROMFH_TYPE];
346
347 switch (nextfh & ROMFH_TYPE) {
348 case ROMFH_DIR:
349 i->i_size = ROMFS_I(i)->i_metasize;
350 i->i_op = &romfs_dir_inode_operations;
351 i->i_fop = &romfs_dir_operations;
352 if (nextfh & ROMFH_EXEC)
353 mode |= S_IXUGO;
354 break;
355 case ROMFH_REG:
356 i->i_fop = &romfs_ro_fops;
357 i->i_data.a_ops = &romfs_aops;
358 if (i->i_sb->s_mtd)
359 i->i_data.backing_dev_info =
360 i->i_sb->s_mtd->backing_dev_info;
361 if (nextfh & ROMFH_EXEC)
362 mode |= S_IXUGO;
363 break;
364 case ROMFH_SYM:
365 i->i_op = &page_symlink_inode_operations;
366 i->i_data.a_ops = &romfs_aops;
367 mode |= S_IRWXUGO;
368 break;
369 default:
370 /* depending on MBZ for sock/fifos */
371 nextfh = be32_to_cpu(ri.spec);
372 init_special_inode(i, mode, MKDEV(nextfh >> 16,
373 nextfh & 0xffff));
374 break;
375 }
376
377 i->i_mode = mode;
378
379 unlock_new_inode(i);
380 return i;
381
382eio:
383 ret = -EIO;
384error:
385 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
386 return ERR_PTR(ret);
387}
388
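The i_metasize computation in romfs_iget rounds "fixed header + name + NUL" up to the 16-byte header alignment by adding ROMFH_PAD and then masking with ROMFH_MASK. The trick generalizes to any power-of-two alignment; a quick stand-alone check of the arithmetic (constants mirror the on-disk format, labelled as assumptions):

#include <assert.h>
#include <stdio.h>

#define FH_SIZE 16u            /* fixed header size */
#define FH_PAD  (FH_SIZE - 1)  /* cf. ROMFH_PAD */
#define FH_MASK (~FH_PAD)      /* cf. ROMFH_MASK */

int main(void)
{
        for (unsigned nlen = 1; nlen <= 20; nlen++) {
                /* +1 for the terminating NUL, then round up to 16 */
                unsigned metasize = (FH_SIZE + nlen + 1 + FH_PAD) & FH_MASK;

                assert(metasize >= FH_SIZE + nlen + 1);
                assert((metasize & FH_PAD) == 0);
                printf("namelen %2u -> metasize %u\n", nlen, metasize);
        }
        return 0;
}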
389/*
390 * allocate a new inode
391 */
392static struct inode *romfs_alloc_inode(struct super_block *sb)
393{
394 struct romfs_inode_info *inode;
395 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
396 return inode ? &inode->vfs_inode : NULL;
397}
398
399/*
400 * return a spent inode to the slab cache
401 */
402static void romfs_destroy_inode(struct inode *inode)
403{
404 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
405}
406
407/*
408 * get filesystem statistics
409 */
410static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
411{
412 struct super_block *sb = dentry->d_sb;
 413 u64 id = huge_encode_dev(sb->s_bdev ? sb->s_bdev->bd_dev : sb->s_dev);
414
415 buf->f_type = ROMFS_MAGIC;
416 buf->f_namelen = ROMFS_MAXFN;
417 buf->f_bsize = ROMBSIZE;
 418 buf->f_bfree = buf->f_bavail = buf->f_ffree = 0;
419 buf->f_blocks =
420 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
421 buf->f_fsid.val[0] = (u32)id;
422 buf->f_fsid.val[1] = (u32)(id >> 32);
423 return 0;
424}
425
426/*
427 * remounting must involve read-only
428 */
429static int romfs_remount(struct super_block *sb, int *flags, char *data)
430{
431 *flags |= MS_RDONLY;
432 return 0;
433}
434
435static const struct super_operations romfs_super_ops = {
436 .alloc_inode = romfs_alloc_inode,
437 .destroy_inode = romfs_destroy_inode,
438 .statfs = romfs_statfs,
439 .remount_fs = romfs_remount,
440};
441
442/*
443 * checksum check on part of a romfs filesystem
444 */
445static __u32 romfs_checksum(const void *data, int size)
446{
447 const __be32 *ptr = data;
448 __u32 sum;
449
450 sum = 0;
451 size >>= 2;
452 while (size > 0) {
453 sum += be32_to_cpu(*ptr++);
454 size--;
455 }
456 return sum;
457}
458
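The checksum is a plain sum of big-endian 32-bit words; the on-disk checksum field is chosen so that the covered region (at most the first 512 bytes, as used by romfs_fill_super below) sums to zero. A user-space sketch of the verification, assuming the image is already in memory:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t sum_words(const void *data, size_t size)
{
        const uint32_t *p = data;
        uint32_t sum = 0;

        for (size >>= 2; size > 0; size--)
                sum += ntohl(*p++);
        return sum;
}

/* Nonzero if the leading checksummed region is consistent. */
static int image_checksum_ok(const void *img, size_t img_size)
{
        size_t n = img_size < 512 ? img_size : 512;

        return sum_words(img, n) == 0;
}

int main(void)
{
        uint32_t words[4] = { htonl(0x100), htonl(0x200), 0, 0 };

        /* pick the last word so the region sums to zero */
        words[3] = htonl((uint32_t)-(0x100 + 0x200));
        printf("ok = %d\n", image_checksum_ok(words, sizeof(words)));
        return 0;
}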
459/*
460 * fill in the superblock
461 */
462static int romfs_fill_super(struct super_block *sb, void *data, int silent)
463{
464 struct romfs_super_block *rsb;
465 struct inode *root;
466 unsigned long pos, img_size;
467 const char *storage;
468 size_t len;
469 int ret;
470
471#ifdef CONFIG_BLOCK
472 if (!sb->s_mtd) {
473 sb_set_blocksize(sb, ROMBSIZE);
474 } else {
475 sb->s_blocksize = ROMBSIZE;
476 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
477 }
478#endif
479
480 sb->s_maxbytes = 0xFFFFFFFF;
481 sb->s_magic = ROMFS_MAGIC;
482 sb->s_flags |= MS_RDONLY | MS_NOATIME;
483 sb->s_op = &romfs_super_ops;
484
485 /* read the image superblock and check it */
486 rsb = kmalloc(512, GFP_KERNEL);
487 if (!rsb)
488 return -ENOMEM;
489
490 sb->s_fs_info = (void *) 512;
491 ret = romfs_dev_read(sb, 0, rsb, 512);
492 if (ret < 0)
493 goto error_rsb;
494
495 img_size = be32_to_cpu(rsb->size);
496
497 if (sb->s_mtd && img_size > sb->s_mtd->size)
498 goto error_rsb_inval;
499
500 sb->s_fs_info = (void *) img_size;
501
502 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
503 img_size < ROMFH_SIZE) {
504 if (!silent)
505 printk(KERN_WARNING "VFS:"
506 " Can't find a romfs filesystem on dev %s.\n",
507 sb->s_id);
508 goto error_rsb_inval;
509 }
510
511 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
512 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
513 sb->s_id);
514 goto error_rsb_inval;
515 }
516
517 storage = sb->s_mtd ? "MTD" : "the block layer";
518
519 len = strnlen(rsb->name, ROMFS_MAXFN);
520 if (!silent)
521 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
522 (unsigned) len, (unsigned) len, rsb->name, storage);
523
524 kfree(rsb);
525 rsb = NULL;
526
527 /* find the root directory */
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529
530 root = romfs_iget(sb, pos);
 531 if (IS_ERR(root))
532 goto error;
533
534 sb->s_root = d_alloc_root(root);
535 if (!sb->s_root)
536 goto error_i;
537
538 return 0;
539
540error_i:
541 iput(root);
542error:
543 return -EINVAL;
544error_rsb_inval:
545 ret = -EINVAL;
546error_rsb:
547 return ret;
548}
549
550/*
551 * get a superblock for mounting
552 */
553static int romfs_get_sb(struct file_system_type *fs_type,
554 int flags, const char *dev_name,
555 void *data, struct vfsmount *mnt)
556{
557 int ret = -EINVAL;
558
559#ifdef CONFIG_ROMFS_ON_MTD
560 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
561 mnt);
562#endif
563#ifdef CONFIG_ROMFS_ON_BLOCK
564 if (ret == -EINVAL)
565 ret = get_sb_bdev(fs_type, flags, dev_name, data,
566 romfs_fill_super, mnt);
567#endif
568 return ret;
569}
570
571/*
572 * destroy a romfs superblock in the appropriate manner
573 */
574static void romfs_kill_sb(struct super_block *sb)
575{
576#ifdef CONFIG_ROMFS_ON_MTD
577 if (sb->s_mtd) {
578 kill_mtd_super(sb);
579 return;
580 }
581#endif
582#ifdef CONFIG_ROMFS_ON_BLOCK
583 if (sb->s_bdev) {
584 kill_block_super(sb);
585 return;
586 }
587#endif
588}
589
590static struct file_system_type romfs_fs_type = {
591 .owner = THIS_MODULE,
592 .name = "romfs",
593 .get_sb = romfs_get_sb,
594 .kill_sb = romfs_kill_sb,
595 .fs_flags = FS_REQUIRES_DEV,
596};
597
598/*
599 * inode storage initialiser
600 */
601static void romfs_i_init_once(void *_inode)
602{
603 struct romfs_inode_info *inode = _inode;
604
605 inode_init_once(&inode->vfs_inode);
606}
607
608/*
609 * romfs module initialisation
610 */
611static int __init init_romfs_fs(void)
612{
613 int ret;
614
615 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
616
617 romfs_inode_cachep =
618 kmem_cache_create("romfs_i",
619 sizeof(struct romfs_inode_info), 0,
620 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
621 romfs_i_init_once);
622
623 if (!romfs_inode_cachep) {
624 printk(KERN_ERR
625 "ROMFS error: Failed to initialise inode cache\n");
626 return -ENOMEM;
627 }
628 ret = register_filesystem(&romfs_fs_type);
629 if (ret) {
630 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
631 goto error_register;
632 }
633 return 0;
634
635error_register:
636 kmem_cache_destroy(romfs_inode_cachep);
637 return ret;
638}
639
640/*
641 * romfs module removal
642 */
643static void __exit exit_romfs_fs(void)
644{
645 unregister_filesystem(&romfs_fs_type);
646 kmem_cache_destroy(romfs_inode_cachep);
647}
648
649module_init(init_romfs_fs);
650module_exit(exit_romfs_fs);
651
652MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
653MODULE_AUTHOR("Red Hat, Inc.");
654MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..666953d59a35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
@@ -181,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
181 do_wakeup = 0; 182 do_wakeup = 0;
182 page_nr = 0; 183 page_nr = 0;
183 184
184 if (pipe->inode) 185 pipe_lock(pipe);
185 mutex_lock(&pipe->inode->i_mutex);
186 186
187 for (;;) { 187 for (;;) {
188 if (!pipe->readers) { 188 if (!pipe->readers) {
@@ -244,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
244 pipe->waiting_writers--; 244 pipe->waiting_writers--;
245 } 245 }
246 246
247 if (pipe->inode) { 247 pipe_unlock(pipe);
248 mutex_unlock(&pipe->inode->i_mutex);
249 248
250 if (do_wakeup) { 249 if (do_wakeup) {
251 smp_mb(); 250 smp_mb();
252 if (waitqueue_active(&pipe->wait)) 251 if (waitqueue_active(&pipe->wait))
253 wake_up_interruptible(&pipe->wait); 252 wake_up_interruptible(&pipe->wait);
254 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 253 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
255 }
256 } 254 }
257 255
258 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
@@ -554,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
554 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
555 * a new page in the output file page cache and fill/dirty that. 553 * a new page in the output file page cache and fill/dirty that.
556 */ 554 */
557static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
558 struct splice_desc *sd) 556 struct splice_desc *sd)
559{ 557{
560 struct file *file = sd->u.file; 558 struct file *file = sd->u.file;
561 struct address_space *mapping = file->f_mapping; 559 struct address_space *mapping = file->f_mapping;
@@ -599,108 +597,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
599out: 597out:
600 return ret; 598 return ret;
601} 599}
600EXPORT_SYMBOL(pipe_to_file);
601
602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
603{
604 smp_mb();
605 if (waitqueue_active(&pipe->wait))
606 wake_up_interruptible(&pipe->wait);
607 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
608}
602 609
603/** 610/**
604 * __splice_from_pipe - splice data from a pipe to given actor 611 * splice_from_pipe_feed - feed available data from a pipe to a file
605 * @pipe: pipe to splice from 612 * @pipe: pipe to splice from
606 * @sd: information to @actor 613 * @sd: information to @actor
607 * @actor: handler that splices the data 614 * @actor: handler that splices the data
608 * 615 *
609 * Description: 616 * Description:
610 * This function does little more than loop over the pipe and call 617 * This function loops over the pipe and calls @actor to do the
611 * @actor to do the actual moving of a single struct pipe_buffer to 618 * actual moving of a single struct pipe_buffer to the desired
 612 the desired destination. See pipe_to_file, pipe_to_sendpage, or 619 destination. It returns when there are no more buffers left in
613 * pipe_to_user. 620 * the pipe or if the requested number of bytes (@sd->total_len)
621 * have been copied. It returns a positive number (one) if the
622 * pipe needs to be filled with more data, zero if the required
623 * number of bytes have been copied and -errno on error.
614 * 624 *
625 * This, together with splice_from_pipe_{begin,end,next}, may be
626 * used to implement the functionality of __splice_from_pipe() when
627 * locking is required around copying the pipe buffers to the
628 * destination.
615 */ 629 */
616ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 630int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
617 splice_actor *actor) 631 splice_actor *actor)
618{ 632{
619 int ret, do_wakeup, err; 633 int ret;
620
621 ret = 0;
622 do_wakeup = 0;
623
624 for (;;) {
625 if (pipe->nrbufs) {
626 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
627 const struct pipe_buf_operations *ops = buf->ops;
628 634
629 sd->len = buf->len; 635 while (pipe->nrbufs) {
630 if (sd->len > sd->total_len) 636 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
631 sd->len = sd->total_len; 637 const struct pipe_buf_operations *ops = buf->ops;
632 638
633 err = actor(pipe, buf, sd); 639 sd->len = buf->len;
634 if (err <= 0) { 640 if (sd->len > sd->total_len)
635 if (!ret && err != -ENODATA) 641 sd->len = sd->total_len;
636 ret = err;
637 642
638 break; 643 ret = actor(pipe, buf, sd);
639 } 644 if (ret <= 0) {
645 if (ret == -ENODATA)
646 ret = 0;
647 return ret;
648 }
649 buf->offset += ret;
650 buf->len -= ret;
640 651
641 ret += err; 652 sd->num_spliced += ret;
642 buf->offset += err; 653 sd->len -= ret;
643 buf->len -= err; 654 sd->pos += ret;
655 sd->total_len -= ret;
644 656
645 sd->len -= err; 657 if (!buf->len) {
646 sd->pos += err; 658 buf->ops = NULL;
647 sd->total_len -= err; 659 ops->release(pipe, buf);
648 if (sd->len) 660 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
649 continue; 661 pipe->nrbufs--;
662 if (pipe->inode)
663 sd->need_wakeup = true;
664 }
650 665
651 if (!buf->len) { 666 if (!sd->total_len)
652 buf->ops = NULL; 667 return 0;
653 ops->release(pipe, buf); 668 }
654 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
655 pipe->nrbufs--;
656 if (pipe->inode)
657 do_wakeup = 1;
658 }
659 669
660 if (!sd->total_len) 670 return 1;
661 break; 671}
662 } 672EXPORT_SYMBOL(splice_from_pipe_feed);
663 673
664 if (pipe->nrbufs) 674/**
665 continue; 675 * splice_from_pipe_next - wait for some data to splice from
676 * @pipe: pipe to splice from
677 * @sd: information about the splice operation
678 *
679 * Description:
680 * This function will wait for some data and return a positive
681 * value (one) if pipe buffers are available. It will return zero
682 * or -errno if no more data needs to be spliced.
683 */
684int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
685{
686 while (!pipe->nrbufs) {
666 if (!pipe->writers) 687 if (!pipe->writers)
667 break; 688 return 0;
668 if (!pipe->waiting_writers) {
669 if (ret)
670 break;
671 }
672 689
673 if (sd->flags & SPLICE_F_NONBLOCK) { 690 if (!pipe->waiting_writers && sd->num_spliced)
674 if (!ret) 691 return 0;
675 ret = -EAGAIN;
676 break;
677 }
678 692
679 if (signal_pending(current)) { 693 if (sd->flags & SPLICE_F_NONBLOCK)
680 if (!ret) 694 return -EAGAIN;
681 ret = -ERESTARTSYS;
682 break;
683 }
684 695
685 if (do_wakeup) { 696 if (signal_pending(current))
686 smp_mb(); 697 return -ERESTARTSYS;
687 if (waitqueue_active(&pipe->wait)) 698
688 wake_up_interruptible_sync(&pipe->wait); 699 if (sd->need_wakeup) {
689 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 700 wakeup_pipe_writers(pipe);
690 do_wakeup = 0; 701 sd->need_wakeup = false;
691 } 702 }
692 703
693 pipe_wait(pipe); 704 pipe_wait(pipe);
694 } 705 }
695 706
696 if (do_wakeup) { 707 return 1;
697 smp_mb(); 708}
698 if (waitqueue_active(&pipe->wait)) 709EXPORT_SYMBOL(splice_from_pipe_next);
699 wake_up_interruptible(&pipe->wait);
700 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
701 }
702 710
703 return ret; 711/**
712 * splice_from_pipe_begin - start splicing from pipe
713 * @sd: information about the splice operation
714 *
715 * Description:
716 * This function should be called before a loop containing
717 * splice_from_pipe_next() and splice_from_pipe_feed() to
718 * initialize the necessary fields of @sd.
719 */
720void splice_from_pipe_begin(struct splice_desc *sd)
721{
722 sd->num_spliced = 0;
723 sd->need_wakeup = false;
724}
725EXPORT_SYMBOL(splice_from_pipe_begin);
726
727/**
728 * splice_from_pipe_end - finish splicing from pipe
729 * @pipe: pipe to splice from
730 * @sd: information about the splice operation
731 *
732 * Description:
733 * This function will wake up pipe writers if necessary. It should
734 * be called after a loop containing splice_from_pipe_next() and
735 * splice_from_pipe_feed().
736 */
737void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
738{
739 if (sd->need_wakeup)
740 wakeup_pipe_writers(pipe);
741}
742EXPORT_SYMBOL(splice_from_pipe_end);
743
744/**
745 * __splice_from_pipe - splice data from a pipe to given actor
746 * @pipe: pipe to splice from
747 * @sd: information to @actor
748 * @actor: handler that splices the data
749 *
750 * Description:
751 * This function does little more than loop over the pipe and call
752 * @actor to do the actual moving of a single struct pipe_buffer to
753 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
754 * pipe_to_user.
755 *
756 */
757ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
758 splice_actor *actor)
759{
760 int ret;
761
762 splice_from_pipe_begin(sd);
763 do {
764 ret = splice_from_pipe_next(pipe, sd);
765 if (ret > 0)
766 ret = splice_from_pipe_feed(pipe, sd, actor);
767 } while (ret > 0);
768 splice_from_pipe_end(pipe, sd);
769
770 return sd->num_spliced ? sd->num_spliced : ret;
704} 771}
705EXPORT_SYMBOL(__splice_from_pipe); 772EXPORT_SYMBOL(__splice_from_pipe);
706 773
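The quartet of exported helpers above exists so a filesystem can run the splice loop while taking and dropping its own locks around each batch of buffers; __splice_from_pipe is simply the no-extra-locking instance. A hedged sketch of a caller that wants i_mutex held only across the actual copy (kernel context of this patch assumed, not a stand-alone program; my_splice_write and my_actor are illustrative names -- compare generic_file_splice_write below):

/* Sketch only: uses the APIs introduced in this patch (2.6.30-era). */
static ssize_t my_splice_write(struct pipe_inode_info *pipe,
                               struct splice_desc *sd,
                               struct inode *inode, splice_actor *my_actor)
{
        ssize_t ret;

        pipe_lock(pipe);
        splice_from_pipe_begin(sd);
        do {
                /* sleeps for pipe data without holding i_mutex */
                ret = splice_from_pipe_next(pipe, sd);
                if (ret <= 0)
                        break;

                mutex_lock(&inode->i_mutex);
                ret = splice_from_pipe_feed(pipe, sd, my_actor);
                mutex_unlock(&inode->i_mutex);
        } while (ret > 0);
        splice_from_pipe_end(pipe, sd);   /* wakes writers if needed */
        pipe_unlock(pipe);

        return sd->num_spliced ? sd->num_spliced : ret;
}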
@@ -714,7 +781,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
714 * @actor: handler that splices the data 781 * @actor: handler that splices the data
715 * 782 *
716 * Description: 783 * Description:
717 * See __splice_from_pipe. This function locks the input and output inodes, 784 * See __splice_from_pipe. This function locks the pipe inode,
718 * otherwise it's identical to __splice_from_pipe(). 785 * otherwise it's identical to __splice_from_pipe().
719 * 786 *
720 */ 787 */
@@ -723,7 +790,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
723 splice_actor *actor) 790 splice_actor *actor)
724{ 791{
725 ssize_t ret; 792 ssize_t ret;
726 struct inode *inode = out->f_mapping->host;
727 struct splice_desc sd = { 793 struct splice_desc sd = {
728 .total_len = len, 794 .total_len = len,
729 .flags = flags, 795 .flags = flags,
@@ -731,21 +797,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
731 .u.file = out, 797 .u.file = out,
732 }; 798 };
733 799
734 /* 800 pipe_lock(pipe);
735 * The actor worker might be calling ->write_begin and
736 * ->write_end. Most of the time, these expect i_mutex to
737 * be held. Since this may result in an ABBA deadlock with
738 * pipe->inode, we have to order lock acquiry here.
739 */
740 inode_double_lock(inode, pipe->inode);
741 ret = __splice_from_pipe(pipe, &sd, actor); 801 ret = __splice_from_pipe(pipe, &sd, actor);
742 inode_double_unlock(inode, pipe->inode); 802 pipe_unlock(pipe);
743 803
744 return ret; 804 return ret;
745} 805}
746 806
747/** 807/**
748 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 808 * generic_file_splice_write - splice data from a pipe to a file
749 * @pipe: pipe info 809 * @pipe: pipe info
750 * @out: file to write to 810 * @out: file to write to
751 * @ppos: position in @out 811 * @ppos: position in @out
@@ -754,13 +814,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
754 * 814 *
755 * Description: 815 * Description:
756 * Will either move or copy pages (determined by @flags options) from 816 * Will either move or copy pages (determined by @flags options) from
757 * the given pipe inode to the given file. The caller is responsible 817 * the given pipe inode to the given file.
758 * for acquiring i_mutex on both inodes.
759 * 818 *
760 */ 819 */
761ssize_t 820ssize_t
762generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 821generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
763 loff_t *ppos, size_t len, unsigned int flags) 822 loff_t *ppos, size_t len, unsigned int flags)
764{ 823{
765 struct address_space *mapping = out->f_mapping; 824 struct address_space *mapping = out->f_mapping;
766 struct inode *inode = mapping->host; 825 struct inode *inode = mapping->host;
@@ -771,70 +830,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
771 .u.file = out, 830 .u.file = out,
772 }; 831 };
773 ssize_t ret; 832 ssize_t ret;
774 int err;
775
776 err = file_remove_suid(out);
777 if (unlikely(err))
778 return err;
779
780 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
781 if (ret > 0) {
782 unsigned long nr_pages;
783 833
784 *ppos += ret; 834 pipe_lock(pipe);
785 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
786 835
787 /* 836 splice_from_pipe_begin(&sd);
788 * If file or inode is SYNC and we actually wrote some data, 837 do {
789 * sync it. 838 ret = splice_from_pipe_next(pipe, &sd);
790 */ 839 if (ret <= 0)
791 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 840 break;
792 err = generic_osync_inode(inode, mapping,
793 OSYNC_METADATA|OSYNC_DATA);
794
795 if (err)
796 ret = err;
797 }
798 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
799 }
800 841
801 return ret; 842 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
802} 843 ret = file_remove_suid(out);
844 if (!ret)
845 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
846 mutex_unlock(&inode->i_mutex);
847 } while (ret > 0);
848 splice_from_pipe_end(pipe, &sd);
803 849
804EXPORT_SYMBOL(generic_file_splice_write_nolock); 850 pipe_unlock(pipe);
805 851
806/** 852 if (sd.num_spliced)
807 * generic_file_splice_write - splice data from a pipe to a file 853 ret = sd.num_spliced;
808 * @pipe: pipe info
809 * @out: file to write to
810 * @ppos: position in @out
811 * @len: number of bytes to splice
812 * @flags: splice modifier flags
813 *
814 * Description:
815 * Will either move or copy pages (determined by @flags options) from
816 * the given pipe inode to the given file.
817 *
818 */
819ssize_t
820generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
821 loff_t *ppos, size_t len, unsigned int flags)
822{
823 struct address_space *mapping = out->f_mapping;
824 struct inode *inode = mapping->host;
825 struct splice_desc sd = {
826 .total_len = len,
827 .flags = flags,
828 .pos = *ppos,
829 .u.file = out,
830 };
831 ssize_t ret;
832 854
833 inode_double_lock(inode, pipe->inode);
834 ret = file_remove_suid(out);
835 if (likely(!ret))
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 inode_double_unlock(inode, pipe->inode);
838 if (ret > 0) { 855 if (ret > 0) {
839 unsigned long nr_pages; 856 unsigned long nr_pages;
840 857
@@ -1323,8 +1340,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1323 if (!pipe) 1340 if (!pipe)
1324 return -EBADF; 1341 return -EBADF;
1325 1342
1326 if (pipe->inode) 1343 pipe_lock(pipe);
1327 mutex_lock(&pipe->inode->i_mutex);
1328 1344
1329 error = ret = 0; 1345 error = ret = 0;
1330 while (nr_segs) { 1346 while (nr_segs) {
@@ -1379,8 +1395,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1379 iov++; 1395 iov++;
1380 } 1396 }
1381 1397
1382 if (pipe->inode) 1398 pipe_unlock(pipe);
1383 mutex_unlock(&pipe->inode->i_mutex);
1384 1399
1385 if (!ret) 1400 if (!ret)
1386 ret = error; 1401 ret = error;
@@ -1508,7 +1523,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1508 return 0; 1523 return 0;
1509 1524
1510 ret = 0; 1525 ret = 0;
1511 mutex_lock(&pipe->inode->i_mutex); 1526 pipe_lock(pipe);
1512 1527
1513 while (!pipe->nrbufs) { 1528 while (!pipe->nrbufs) {
1514 if (signal_pending(current)) { 1529 if (signal_pending(current)) {
@@ -1526,7 +1541,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1526 pipe_wait(pipe); 1541 pipe_wait(pipe);
1527 } 1542 }
1528 1543
1529 mutex_unlock(&pipe->inode->i_mutex); 1544 pipe_unlock(pipe);
1530 return ret; 1545 return ret;
1531} 1546}
1532 1547
@@ -1546,7 +1561,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1546 return 0; 1561 return 0;
1547 1562
1548 ret = 0; 1563 ret = 0;
1549 mutex_lock(&pipe->inode->i_mutex); 1564 pipe_lock(pipe);
1550 1565
1551 while (pipe->nrbufs >= PIPE_BUFFERS) { 1566 while (pipe->nrbufs >= PIPE_BUFFERS) {
1552 if (!pipe->readers) { 1567 if (!pipe->readers) {
@@ -1567,7 +1582,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1567 pipe->waiting_writers--; 1582 pipe->waiting_writers--;
1568 } 1583 }
1569 1584
1570 mutex_unlock(&pipe->inode->i_mutex); 1585 pipe_unlock(pipe);
1571 return ret; 1586 return ret;
1572} 1587}
1573 1588
@@ -1583,10 +1598,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1583 1598
1584 /* 1599 /*
1585 * Potential ABBA deadlock, work around it by ordering lock 1600 * Potential ABBA deadlock, work around it by ordering lock
1586 * grabbing by inode address. Otherwise two different processes 1601 * grabbing by pipe info address. Otherwise two different processes
1587 * could deadlock (one doing tee from A -> B, the other from B -> A). 1602 * could deadlock (one doing tee from A -> B, the other from B -> A).
1588 */ 1603 */
1589 inode_double_lock(ipipe->inode, opipe->inode); 1604 pipe_double_lock(ipipe, opipe);
1590 1605
1591 do { 1606 do {
1592 if (!opipe->readers) { 1607 if (!opipe->readers) {
@@ -1637,7 +1652,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1637 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1652 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1638 ret = -EAGAIN; 1653 ret = -EAGAIN;
1639 1654
1640 inode_double_unlock(ipipe->inode, opipe->inode); 1655 pipe_unlock(ipipe);
1656 pipe_unlock(opipe);
1641 1657
1642 /* 1658 /*
1643 * If we put data in the output pipe, wakeup any potential readers. 1659 * If we put data in the output pipe, wakeup any potential readers.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 8258cf9a0317..70e3244fa30f 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,4 +5,3 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1c4739e33af6..40c98fa6b5d6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -252,6 +252,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
252 cache->entries = entries; 252 cache->entries = entries;
253 cache->block_size = block_size; 253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT; 254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->pages = cache->pages ? cache->pages : 1;
255 cache->name = name; 256 cache->name = name;
256 cache->num_waiters = 0; 257 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock); 258 spin_lock_init(&cache->lock);
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d83799..0adc624c956f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -157,6 +157,16 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
157 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) 157 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
158 goto failed_mount; 158 goto failed_mount;
159 159
160 /*
161 * Check the system page size is not larger than the filesystem
162 * block size (by default 128K). This is currently not supported.
163 */
164 if (PAGE_CACHE_SIZE > msblk->block_size) {
165 ERROR("Page size > filesystem block size (%d). This is "
166 "currently not supported!\n", msblk->block_size);
167 goto failed_mount;
168 }
169
160 msblk->block_log = le16_to_cpu(sblk->block_log); 170 msblk->block_log = le16_to_cpu(sblk->block_log);
161 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) 171 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
162 goto failed_mount; 172 goto failed_mount;
@@ -301,6 +311,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 311static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 312{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 313 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
314 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 315
305 TRACE("Entered squashfs_statfs\n"); 316 TRACE("Entered squashfs_statfs\n");
306 317
@@ -311,6 +322,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 322 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 323 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 324 buf->f_namelen = SQUASHFS_NAME_LEN;
325 buf->f_fsid.val[0] = (u32)id;
326 buf->f_fsid.val[1] = (u32)(id >> 32);
314 327
315 return 0; 328 return 0;
316} 329}
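Several statfs implementations in this merge gain the same two lines: a 64-bit encoded device number split across the two 32-bit words of f_fsid. The packing and its inverse, in isolation (plain C outside the kernel; the constant stands in for huge_encode_dev()):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t id = 0x1122334455667788ULL; /* stand-in for huge_encode_dev() */
        uint32_t val[2];

        val[0] = (uint32_t)id;               /* low 32 bits  */
        val[1] = (uint32_t)(id >> 32);       /* high 32 bits */

        /* recombining the two words must give back the original id */
        assert((((uint64_t)val[1] << 32) | val[0]) == id);
        return 0;
}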
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..075694e31d8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
55 55
56EXPORT_SYMBOL(vfs_getattr); 56EXPORT_SYMBOL(vfs_getattr);
57 57
58int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) 58int vfs_fstat(unsigned int fd, struct kstat *stat)
59{ 59{
60 struct path path; 60 struct file *f = fget(fd);
61 int error; 61 int error = -EBADF;
62 62
63 error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); 63 if (f) {
64 if (!error) { 64 error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
65 error = vfs_getattr(path.mnt, path.dentry, stat); 65 fput(f);
66 path_put(&path);
67 } 66 }
68 return error; 67 return error;
69} 68}
69EXPORT_SYMBOL(vfs_fstat);
70 70
71int vfs_stat(char __user *name, struct kstat *stat) 71int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
72{ 72{
73 return vfs_stat_fd(AT_FDCWD, name, stat); 73 struct path path;
74} 74 int error = -EINVAL;
75 int lookup_flags = 0;
75 76
76EXPORT_SYMBOL(vfs_stat); 77 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
78 goto out;
77 79
78int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) 80 if (!(flag & AT_SYMLINK_NOFOLLOW))
79{ 81 lookup_flags |= LOOKUP_FOLLOW;
80 struct path path;
81 int error;
82 82
83 error = user_path_at(dfd, name, 0, &path); 83 error = user_path_at(dfd, filename, lookup_flags, &path);
84 if (!error) { 84 if (error)
85 error = vfs_getattr(path.mnt, path.dentry, stat); 85 goto out;
86 path_put(&path); 86
87 } 87 error = vfs_getattr(path.mnt, path.dentry, stat);
88 path_put(&path);
89out:
88 return error; 90 return error;
89} 91}
92EXPORT_SYMBOL(vfs_fstatat);
90 93
91int vfs_lstat(char __user *name, struct kstat *stat) 94int vfs_stat(char __user *name, struct kstat *stat)
92{ 95{
93 return vfs_lstat_fd(AT_FDCWD, name, stat); 96 return vfs_fstatat(AT_FDCWD, name, stat, 0);
94} 97}
98EXPORT_SYMBOL(vfs_stat);
95 99
96EXPORT_SYMBOL(vfs_lstat); 100int vfs_lstat(char __user *name, struct kstat *stat)
97
98int vfs_fstat(unsigned int fd, struct kstat *stat)
99{ 101{
100 struct file *f = fget(fd); 102 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
101 int error = -EBADF;
102
103 if (f) {
104 error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
105 fput(f);
106 }
107 return error;
108} 103}
104EXPORT_SYMBOL(vfs_lstat);
109 105
110EXPORT_SYMBOL(vfs_fstat);
111 106
112#ifdef __ARCH_WANT_OLD_STAT 107#ifdef __ARCH_WANT_OLD_STAT
113 108
@@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
155SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 150SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
156{ 151{
157 struct kstat stat; 152 struct kstat stat;
158 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 153 int error;
159 154
160 if (!error) 155 error = vfs_stat(filename, &stat);
161 error = cp_old_stat(&stat, statbuf); 156 if (error)
157 return error;
162 158
163 return error; 159 return cp_old_stat(&stat, statbuf);
164} 160}
165 161
166SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 162SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
167{ 163{
168 struct kstat stat; 164 struct kstat stat;
169 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 165 int error;
170 166
171 if (!error) 167 error = vfs_lstat(filename, &stat);
172 error = cp_old_stat(&stat, statbuf); 168 if (error)
169 return error;
173 170
174 return error; 171 return cp_old_stat(&stat, statbuf);
175} 172}
176 173
177SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) 174SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
240SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) 237SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
241{ 238{
242 struct kstat stat; 239 struct kstat stat;
243 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 240 int error = vfs_stat(filename, &stat);
244
245 if (!error)
246 error = cp_new_stat(&stat, statbuf);
247 241
248 return error; 242 if (error)
243 return error;
244 return cp_new_stat(&stat, statbuf);
249} 245}
250 246
251SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) 247SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
252{ 248{
253 struct kstat stat; 249 struct kstat stat;
254 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 250 int error;
255 251
256 if (!error) 252 error = vfs_lstat(filename, &stat);
257 error = cp_new_stat(&stat, statbuf); 253 if (error)
254 return error;
258 255
259 return error; 256 return cp_new_stat(&stat, statbuf);
260} 257}
261 258
262#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) 259#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
@@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
264 struct stat __user *, statbuf, int, flag) 261 struct stat __user *, statbuf, int, flag)
265{ 262{
266 struct kstat stat; 263 struct kstat stat;
267 int error = -EINVAL; 264 int error;
268
269 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
270 goto out;
271
272 if (flag & AT_SYMLINK_NOFOLLOW)
273 error = vfs_lstat_fd(dfd, filename, &stat);
274 else
275 error = vfs_stat_fd(dfd, filename, &stat);
276
277 if (!error)
278 error = cp_new_stat(&stat, statbuf);
279 265
280out: 266 error = vfs_fstatat(dfd, filename, &stat, flag);
281 return error; 267 if (error)
268 return error;
269 return cp_new_stat(&stat, statbuf);
282} 270}
283#endif 271#endif
284 272
@@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
404 struct stat64 __user *, statbuf, int, flag) 392 struct stat64 __user *, statbuf, int, flag)
405{ 393{
406 struct kstat stat; 394 struct kstat stat;
407 int error = -EINVAL; 395 int error;
408
409 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
410 goto out;
411
412 if (flag & AT_SYMLINK_NOFOLLOW)
413 error = vfs_lstat_fd(dfd, filename, &stat);
414 else
415 error = vfs_stat_fd(dfd, filename, &stat);
416
417 if (!error)
418 error = cp_new_stat64(&stat, statbuf);
419 396
420out: 397 error = vfs_fstatat(dfd, filename, &stat, flag);
421 return error; 398 if (error)
399 return error;
400 return cp_new_stat64(&stat, statbuf);
422} 401}
423#endif /* __ARCH_WANT_STAT64 */ 402#endif /* __ARCH_WANT_STAT64 */
424 403
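After this rework every path-based stat variant is a thin wrapper around vfs_fstatat(), with AT_SYMLINK_NOFOLLOW the only switch between stat and lstat semantics. The same contract is what fstatat(2) exposes to user space, so the wrapper relationship can be demonstrated outside the kernel:

#define _XOPEN_SOURCE 700
#include <fcntl.h>      /* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <stdio.h>
#include <sys/stat.h>

/* user-space mirrors of the new vfs_stat()/vfs_lstat() wrappers */
static int my_stat(const char *name, struct stat *st)
{
        return fstatat(AT_FDCWD, name, st, 0);
}

static int my_lstat(const char *name, struct stat *st)
{
        return fstatat(AT_FDCWD, name, st, AT_SYMLINK_NOFOLLOW);
}

int main(void)
{
        struct stat st;

        if (my_lstat("/tmp", &st) == 0)
                printf("mode %o\n", (unsigned)st.st_mode);
        return my_stat("/tmp", &st) ? 1 : 0;
}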
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba7..1943fdf655fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -208,6 +208,34 @@ void deactivate_super(struct super_block *s)
208EXPORT_SYMBOL(deactivate_super); 208EXPORT_SYMBOL(deactivate_super);
209 209
210/** 210/**
211 * deactivate_locked_super - drop an active reference to superblock
212 * @s: superblock to deactivate
213 *
214 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
215 * it does not unlock it until it's all over. As the result, it's safe to
216 * use to dispose of new superblock on ->get_sb() failure exits - nobody
217 * will see the sucker until it's all over. Equivalent using up_write +
218 * deactivate_super is safe for that purpose only if superblock is either
219 * safe to use or has NULL ->s_root when we unlock.
220 */
221void deactivate_locked_super(struct super_block *s)
222{
223 struct file_system_type *fs = s->s_type;
224 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
225 s->s_count -= S_BIAS-1;
226 spin_unlock(&sb_lock);
227 vfs_dq_off(s, 0);
228 fs->kill_sb(s);
229 put_filesystem(fs);
230 put_super(s);
231 } else {
232 up_write(&s->s_umount);
233 }
234}
235
236EXPORT_SYMBOL(deactivate_locked_super);
237
238/**
211 * grab_super - acquire an active reference 239 * grab_super - acquire an active reference
212 * @s: reference we are trying to make active 240 * @s: reference we are trying to make active
213 * 241 *
@@ -287,6 +315,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 315 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 316 return sync_blockdev(sb->s_bdev);
289} 317}
318EXPORT_SYMBOL_GPL(fsync_super);
290 319
291/** 320/**
292 * generic_shutdown_super - common helper for ->kill_sb() 321 * generic_shutdown_super - common helper for ->kill_sb()
@@ -770,6 +799,45 @@ void kill_litter_super(struct super_block *sb)
770 799
771EXPORT_SYMBOL(kill_litter_super); 800EXPORT_SYMBOL(kill_litter_super);
772 801
802static int ns_test_super(struct super_block *sb, void *data)
803{
804 return sb->s_fs_info == data;
805}
806
807static int ns_set_super(struct super_block *sb, void *data)
808{
809 sb->s_fs_info = data;
810 return set_anon_super(sb, NULL);
811}
812
813int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
814 int (*fill_super)(struct super_block *, void *, int),
815 struct vfsmount *mnt)
816{
817 struct super_block *sb;
818
819 sb = sget(fs_type, ns_test_super, ns_set_super, data);
820 if (IS_ERR(sb))
821 return PTR_ERR(sb);
822
823 if (!sb->s_root) {
824 int err;
825 sb->s_flags = flags;
826 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
827 if (err) {
828 deactivate_locked_super(sb);
829 return err;
830 }
831
832 sb->s_flags |= MS_ACTIVE;
833 }
834
835 simple_set_mnt(mnt, sb);
836 return 0;
837}
838
839EXPORT_SYMBOL(get_sb_ns);
840
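get_sb_ns() keys superblocks on an opaque namespace pointer stored in s_fs_info, so repeated mounts from the same namespace share one superblock. A hedged sketch of a pseudo-filesystem built on it (kernel context assumed; myfs_fill_super and get_my_ns are hypothetical placeholders, only get_sb_ns() is real here):

static int myfs_fill_super(struct super_block *sb, void *data, int silent);
static void *get_my_ns(void);   /* placeholder namespace lookup */

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data, struct vfsmount *mnt)
{
        /* an existing sb is reused when its s_fs_info matches the cookie */
        return get_sb_ns(fs_type, flags, get_my_ns(), myfs_fill_super, mnt);
}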
773#ifdef CONFIG_BLOCK 841#ifdef CONFIG_BLOCK
774static int set_bdev_super(struct super_block *s, void *data) 842static int set_bdev_super(struct super_block *s, void *data)
775{ 843{
@@ -813,8 +881,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
813 881
814 if (s->s_root) { 882 if (s->s_root) {
815 if ((flags ^ s->s_flags) & MS_RDONLY) { 883 if ((flags ^ s->s_flags) & MS_RDONLY) {
816 up_write(&s->s_umount); 884 deactivate_locked_super(s);
817 deactivate_super(s);
818 error = -EBUSY; 885 error = -EBUSY;
819 goto error_bdev; 886 goto error_bdev;
820 } 887 }
@@ -829,8 +896,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
829 sb_set_blocksize(s, block_size(bdev)); 896 sb_set_blocksize(s, block_size(bdev));
830 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 897 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
831 if (error) { 898 if (error) {
832 up_write(&s->s_umount); 899 deactivate_locked_super(s);
833 deactivate_super(s);
834 goto error; 900 goto error;
835 } 901 }
836 902
@@ -856,7 +922,7 @@ void kill_block_super(struct super_block *sb)
856 struct block_device *bdev = sb->s_bdev; 922 struct block_device *bdev = sb->s_bdev;
857 fmode_t mode = sb->s_mode; 923 fmode_t mode = sb->s_mode;
858 924
859 bdev->bd_super = 0; 925 bdev->bd_super = NULL;
860 generic_shutdown_super(sb); 926 generic_shutdown_super(sb);
861 sync_blockdev(bdev); 927 sync_blockdev(bdev);
862 close_bdev_exclusive(bdev, mode); 928 close_bdev_exclusive(bdev, mode);
@@ -880,8 +946,7 @@ int get_sb_nodev(struct file_system_type *fs_type,
880 946
881 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 947 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
882 if (error) { 948 if (error) {
883 up_write(&s->s_umount); 949 deactivate_locked_super(s);
884 deactivate_super(s);
885 return error; 950 return error;
886 } 951 }
887 s->s_flags |= MS_ACTIVE; 952 s->s_flags |= MS_ACTIVE;
@@ -911,8 +976,7 @@ int get_sb_single(struct file_system_type *fs_type,
911 s->s_flags = flags; 976 s->s_flags = flags;
912 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 977 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
913 if (error) { 978 if (error) {
914 up_write(&s->s_umount); 979 deactivate_locked_super(s);
915 deactivate_super(s);
916 return error; 980 return error;
917 } 981 }
918 s->s_flags |= MS_ACTIVE; 982 s->s_flags |= MS_ACTIVE;
@@ -965,8 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
965 return mnt; 1029 return mnt;
966out_sb: 1030out_sb:
967 dput(mnt->mnt_root); 1031 dput(mnt->mnt_root);
968 up_write(&mnt->mnt_sb->s_umount); 1032 deactivate_locked_super(mnt->mnt_sb);
969 deactivate_super(mnt->mnt_sb);
970out_free_secdata: 1033out_free_secdata:
971 free_secdata(secdata); 1034 free_secdata(secdata);
972out_mnt: 1035out_mnt:
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a1..9345806c8853 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
157 count = size - offs; 157 count = size - offs;
158 } 158 }
159 159
160 temp = kmalloc(count, GFP_KERNEL); 160 temp = memdup_user(userbuf, count);
161 if (!temp) 161 if (IS_ERR(temp))
162 return -ENOMEM; 162 return PTR_ERR(temp);
163
164 if (copy_from_user(temp, userbuf, count)) {
165 count = -EFAULT;
166 goto out_free;
167 }
168 163
169 mutex_lock(&bb->mutex); 164 mutex_lock(&bb->mutex);
170 165
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
176 if (count > 0) 171 if (count > 0)
177 *off = offs + count; 172 *off = offs + count;
178 173
179out_free:
180 kfree(temp);
181 return count; 174 return count;
182} 175}
183 176
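The write path above is one instance of a tree-wide cleanup: an open-coded kmalloc + copy_from_user + error-unwind sequence becomes a single memdup_user() call that returns either the filled buffer or an ERR_PTR. The resulting calling convention, in isolation (kernel context assumed; copy_in is an illustrative name):

/* memdup_user() folds allocation, copy and error unwinding into one call. */
static ssize_t copy_in(const char __user *userbuf, size_t count)
{
        char *temp = memdup_user(userbuf, count);

        if (IS_ERR(temp))
                return PTR_ERR(temp);   /* -ENOMEM or -EFAULT */

        /* ... consume temp ... */
        kfree(temp);
        return count;
}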
@@ -234,7 +227,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
234 return ret; 227 return ret;
235} 228}
236 229
237static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) 230static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
238{ 231{
239 struct file *file = vma->vm_file; 232 struct file *file = vma->vm_file;
240 struct bin_buffer *bb = file->private_data; 233 struct bin_buffer *bb = file->private_data;
@@ -242,15 +235,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
242 int ret; 235 int ret;
243 236
244 if (!bb->vm_ops) 237 if (!bb->vm_ops)
245 return -EINVAL; 238 return VM_FAULT_SIGBUS;
246 239
247 if (!bb->vm_ops->page_mkwrite) 240 if (!bb->vm_ops->page_mkwrite)
248 return 0; 241 return 0;
249 242
250 if (!sysfs_get_active_two(attr_sd)) 243 if (!sysfs_get_active_two(attr_sd))
251 return -EINVAL; 244 return VM_FAULT_SIGBUS;
252 245
253 ret = bb->vm_ops->page_mkwrite(vma, page); 246 ret = bb->vm_ops->page_mkwrite(vma, vmf);
254 247
255 sysfs_put_active_two(attr_sd); 248 sysfs_put_active_two(attr_sd);
256 return ret; 249 return ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 289c43a47263..561a9c050cef 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
446 if (buffer->event != atomic_read(&od->event)) 446 if (buffer->event != atomic_read(&od->event))
447 goto trigger; 447 goto trigger;
448 448
449 return 0; 449 return DEFAULT_POLLMASK;
450 450
451 trigger: 451 trigger:
452 buffer->needs_read_fill = 1; 452 buffer->needs_read_fill = 1;
453 return POLLERR|POLLPRI; 453 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
454} 454}
455 455
456void sysfs_notify_dirent(struct sysfs_dirent *sd) 456void sysfs_notify_dirent(struct sysfs_dirent *sd)
@@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct {
667 struct work_struct work; 667 struct work_struct work;
668}; 668};
669 669
670static struct workqueue_struct *sysfs_workqueue;
670static DEFINE_MUTEX(sysfs_workq_mutex); 671static DEFINE_MUTEX(sysfs_workq_mutex);
671static LIST_HEAD(sysfs_workq); 672static LIST_HEAD(sysfs_workq);
672static void sysfs_schedule_callback_work(struct work_struct *work) 673static void sysfs_schedule_callback_work(struct work_struct *work)
@@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
715 mutex_lock(&sysfs_workq_mutex); 716 mutex_lock(&sysfs_workq_mutex);
716 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) 717 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
717 if (ss->kobj == kobj) { 718 if (ss->kobj == kobj) {
719 module_put(owner);
718 mutex_unlock(&sysfs_workq_mutex); 720 mutex_unlock(&sysfs_workq_mutex);
719 return -EAGAIN; 721 return -EAGAIN;
720 } 722 }
721 mutex_unlock(&sysfs_workq_mutex); 723 mutex_unlock(&sysfs_workq_mutex);
722 724
725 if (sysfs_workqueue == NULL) {
726 sysfs_workqueue = create_singlethread_workqueue("sysfsd");
727 if (sysfs_workqueue == NULL) {
728 module_put(owner);
729 return -ENOMEM;
730 }
731 }
732
723 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 733 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
724 if (!ss) { 734 if (!ss) {
725 module_put(owner); 735 module_put(owner);
@@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
735 mutex_lock(&sysfs_workq_mutex); 745 mutex_lock(&sysfs_workq_mutex);
736 list_add_tail(&ss->workq_list, &sysfs_workq); 746 list_add_tail(&ss->workq_list, &sysfs_workq);
737 mutex_unlock(&sysfs_workq_mutex); 747 mutex_unlock(&sysfs_workq_mutex);
738 schedule_work(&ss->work); 748 queue_work(sysfs_workqueue, &ss->work);
739 return 0; 749 return 0;
740} 750}
741EXPORT_SYMBOL_GPL(sysfs_schedule_callback); 751EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
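The sysfs fix above has two parts: the module reference is now dropped on every early return (the duplicate-kobject path leaked it before), and callbacks run on a dedicated single-threaded workqueue created on first use rather than on the shared keventd queue. The lazy-creation shape in isolation (kernel context assumed; caller-side serialization is presumed, as in the patch):

/* Lazily created single-threaded queue, as in sysfs_schedule_callback(). */
static struct workqueue_struct *my_wq;

static int my_queue_work(struct work_struct *work)
{
        if (!my_wq) {
                my_wq = create_singlethread_workqueue("my_wq");
                if (!my_wq)
                        return -ENOMEM;
        }
        queue_work(my_wq, work);
        return 0;
}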
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
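The budgeting change folds the old add-then-divide pair into one div_u64() call; both forms compute the ceiling of idx_size / idx_leb_size. The rounded-up division in isolation (plain C; div_u64 itself is just 64-bit division with a 32-bit divisor):

#include <stdint.h>
#include <stdio.h>

/* ceiling division, the div_u64(n + d - 1, d) pattern from the patch */
static uint64_t ceil_div(uint64_t n, uint32_t d)
{
        return (n + d - 1) / d;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)ceil_div(10, 4));  /* 3 */
        printf("%llu\n", (unsigned long long)ceil_div(8, 4));   /* 2 */
        return 0;
}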
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
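As a rough illustration of the check described above, the standalone model below captures only the core condition; the real do_budget_space() additionally accounts for the reserved pool, dirty space and per-LEB overheads.

#include <stdio.h>

/* Simplified model: budgeting fails once the index reservation plus the
 * requested data space no longer fit into the free LEBs. */
static int budget_ok(long long min_idx_lebs, long long data_lebs,
		     long long free_lebs)
{
	return min_idx_lebs + data_lebs <= free_lebs;	/* else -ENOSPC */
}

int main(void)
{
	printf("%d\n", budget_ok(74, 10, 100));	/* 1: fits */
	printf("%d\n", budget_ok(74, 40, 100));	/* 0: would be -ENOSPC */
	return 0;
}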
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would break user expectations about what free space is. Users seem to 698 * break user expectations about what free space is. Users seem to be
702 * be accustomed to assume that if the file-system reports N bytes of free space, 699 * accustomed to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
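For context on the KERN_CONT fixes in this hunk: a printk() without an explicit level may be treated as the start of a new message with a default log level, so output assembled piecewise must mark its continuation lines. A minimal kernel-side fragment (not a standalone program; i, nlen and name stand in for the dump loop's variables):

	printk(KERN_DEBUG "name: ");		/* starts the message */
	for (i = 0; i < nlen; i++)
		printk(KERN_CONT "%c", name[i]);	/* continues the same line */
	printk(KERN_CONT "\n");			/* terminates it */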
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less than or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent to the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f261..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long and unacceptable. 959 * whole index and correct all inode sizes, which is long and unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
@@ -1434,8 +1444,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1434 * mmap()d file has taken write protection fault and is being made 1444 * mmap()d file has taken write protection fault and is being made
1435 * writable. UBIFS must ensure page is budgeted for. 1445 * writable. UBIFS must ensure page is budgeted for.
1436 */ 1446 */
1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1447static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1438{ 1448{
1449 struct page *page = vmf->page;
1439 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1450 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1440 struct ubifs_info *c = inode->i_sb->s_fs_info; 1451 struct ubifs_info *c = inode->i_sb->s_fs_info;
1441 struct timespec now = ubifs_current_time(inode); 1452 struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1458,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1447 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1458 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1448 1459
1449 if (unlikely(c->ro_media)) 1460 if (unlikely(c->ro_media))
1450 return -EROFS; 1461 return VM_FAULT_SIGBUS; /* -EROFS */
1451 1462
1452 /* 1463 /*
1453 * We have not locked @page so far so we may budget for changing the 1464 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1491,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1480 if (err == -ENOSPC) 1491 if (err == -ENOSPC)
1481 ubifs_warn("out of space for mmapped file " 1492 ubifs_warn("out of space for mmapped file "
1482 "(inode number %lu)", inode->i_ino); 1493 "(inode number %lu)", inode->i_ino);
1483 return err; 1494 return VM_FAULT_SIGBUS;
1484 } 1495 }
1485 1496
1486 lock_page(page); 1497 lock_page(page);
@@ -1520,6 +1531,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1520out_unlock: 1531out_unlock:
1521 unlock_page(page); 1532 unlock_page(page);
1522 ubifs_release_budget(c, &req); 1533 ubifs_release_budget(c, &req);
1534 if (err)
1535 err = VM_FAULT_SIGBUS;
1523 return err; 1536 return err;
1524} 1537}
1525 1538
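The file.c changes above track a VFS interface change: ->page_mkwrite() now receives a struct vm_fault instead of the page, and must return VM_FAULT_* codes rather than -errno values. A hedged sketch of the new shape; do_budget() is a made-up placeholder for the filesystem's own work:

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;	/* formerly a direct argument */

	if (do_budget(page))		/* hypothetical helper */
		return VM_FAULT_SIGBUS;	/* e.g. instead of -EROFS/-ENOSPC */
	return 0;
}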
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other a negative 490 * failed to find a LEB with @min_space bytes of free space and other a negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
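The interface change above replaces the returned free-byte count with the offset where free space starts, so callers no longer compute 'leb_size - free' themselves. A small standalone check of the equivalence, with an arbitrary example size:

#include <stdio.h>
#include <assert.h>

int main(void)
{
	int leb_size = 128 * 1024;	/* hypothetical LEB size */
	int free = leb_size;		/* a completely empty LEB */
	int offs = leb_size - free;	/* what the function now returns */

	/* the old and new emptiness tests agree */
	assert((free == leb_size) == (offs == 0));
	printf("free=%d offs=%d\n", free, offs);
	return 0;
}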
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
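A hedged usage sketch for the list_sort() helper added above, assuming <linux/list.h> and a made-up element type. The callback only needs to be consistent; equal elements may be reported either way:

struct item {
	struct list_head list;
	int key;
};

static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct item *ia = list_entry(a, struct item, list);
	struct item *ib = list_entry(b, struct item, list);

	return ia->key <= ib->key ? -1 : 1;
}

/* list_sort(NULL, &items, item_cmp) then leaves 'items' in ascending
 * 'key' order. */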
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/**
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and sorted by length in descending order. Directory entry nodes go
249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
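The two comparison rules can be modeled in user space; the standalone sketch below mirrors only the data-node rule (by inode, then block), with invented numbers:

#include <stdio.h>

static int data_cmp(unsigned inuma, unsigned blka,
		    unsigned inumb, unsigned blkb)
{
	if (inuma == inumb)
		return blka <= blkb ? -1 : 1;
	return inuma <= inumb ? -1 : 1;
}

int main(void)
{
	printf("%d\n", data_cmp(7, 0, 7, 1));	/* -1: same inode, lower block */
	printf("%d\n", data_cmp(8, 0, 7, 9));	/*  1: higher inode goes later */
	return 0;
}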
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of nodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number.
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139 * Write buffer wasn't seek'ed or there is not enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
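The log.c hunk above (and the shrinker.c hunk later in this series) switches to the list_move()/list_move_tail() helpers, which fold the common delete-and-reinsert pattern into one call. A kernel-side fragment, reusing the names from the hunk:

	/* these two calls... */
	list_del(&bud->list);
	list_add(&bud->list, &c->old_buds);
	/* ...are equivalent to: */
	list_move(&bud->list, &c->old_buds);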
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes;
1766 * o %4 - wasted @len bytes.
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if no more data is found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
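The rewritten check relies on erased flash reading back as 0xFF. The standalone model below uses that assumption plus a 24-byte stand-in for UBIFS_CH_SZ and arbitrary sizes; it reports whether anything non-empty follows the corrupt node's common header within the min_io_size-aligned window:

#include <stdio.h>
#include <string.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static int is_empty(const unsigned char *buf, int len)
{
	for (int i = 0; i < len; i++)
		if (buf[i] != 0xff)
			return 0;
	return 1;
}

int main(void)
{
	unsigned char leb[4096];
	int min_io_size = 512, offs = 1000, len = (int)sizeof(leb) - offs;
	int skip;

	memset(leb, 0xff, sizeof(leb));		/* erased flash */
	memset(leb + offs, 0xab, 40);		/* a 40-byte "corrupt node" */

	/* check for empty space after the node's common header */
	skip = ALIGN_UP(offs + 24, min_io_size) - offs;	/* 24 ~ UBIFS_CH_SZ */
	/* prints 0 here: the node body extends past the header window,
	 * i.e. "more data is found" and the header must be validated */
	printf("empty after header: %d\n",
	       is_empty(leb + offs + skip, len - skip));
	return 0;
}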
483/** 459/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
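The new mount-time decision can be summarized in a small standalone model; the constants are copied from the patch, and the return values stand in for success, the R/O-compatible case, -EROFS and -EINVAL:

#include <stdio.h>

#define UBIFS_FORMAT_VERSION	4
#define UBIFS_RO_COMPAT_VERSION	0

/* 0: fully supported; 1: R/O mount only (rw_incompat in the patch);
 * -30/-22 stand in for -EROFS/-EINVAL. */
static int check_format(int fmt, int ro_compat, int ro_mount)
{
	if (fmt <= UBIFS_FORMAT_VERSION)
		return 0;
	if (ro_mount && ro_compat <= UBIFS_RO_COMPAT_VERSION)
		return 1;
	return ro_compat <= UBIFS_RO_COMPAT_VERSION ? -30 : -22;
}

int main(void)
{
	printf("%d\n", check_format(5, 0, 1));	/* 1: newer format, R/O mount */
	printf("%d\n", check_format(5, 0, 0));	/* -30: R/W mount refused */
	printf("%d\n", check_format(5, 1, 1));	/* -22: even R/O is too new */
	return 0;
}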
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..e9f7a754c4f7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
@@ -2038,8 +2055,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2038 return 0; 2055 return 0;
2039 2056
2040out_deact: 2057out_deact:
2041 up_write(&sb->s_umount); 2058 deactivate_locked_super(sb);
2042 deactivate_super(sb);
2043out_close: 2059out_close:
2044 ubi_close_volume(ubi); 2060 ubi_close_volume(ubi);
2045 return err; 2061 return err;
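Two of the super.c hunks above are format-string hardening: a variable string must never be passed where a printf-style format is expected, both for seq_printf() and for kthread_create()'s thread name. The fixed pattern, as kernel-side fragments:

	seq_printf(s, ",compr=%s", name);		/* not: seq_printf(s, name) */
	kthread_create(fn, data, "%s", c->bgt_name);	/* not: ..., c->bgt_name) */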
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and happen only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into the mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders, which only need R/O mounting. With
59 * this version field it is possible to change the UBIFS format without
60 * needing to update boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
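A sketch of the mount-time policy the two version constants encode
(field names follow the ubifs_info additions later in this diff; the
real error handling in fs/ubifs/sb.c may differ in detail):

	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
		/* Newer image: older code must never write it... */
		if (c->ro_compat_version > UBIFS_RO_COMPAT_VERSION)
			return -EINVAL;	/* not even R/O-mountable */
		if (!(c->vfs_sb->s_flags & MS_RDONLY))
			return -EROFS;	/* must be mounted read-only */
		/* ...but this R/O mount is fine; block later R/W remounts. */
		c->rw_incompat = 1;
	}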
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
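A sketch of how this threshold is typically applied when writing a data
node (compare ubifs_compress(); variable names here are illustrative):

	/* Compression saved fewer than UBIFS_MIN_COMPRESS_DIFF bytes, so
	 * store the node uncompressed: the faster read outweighs the
	 * negligible space saving. */
	if (in_len - out_len < UBIFS_MIN_COMPRESS_DIFF)
		compr_type = UBIFS_COMPR_NONE;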
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
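The new __le32 is carved out of the trailing padding (3972 = 3968 + 4),
so sizeof(struct ubifs_sb_node) stays the same and existing superblocks
remain parseable. A hypothetical compile-time guard for that invariant
(4096 being the node's expected total size, common header included):

	BUILD_BUG_ON(sizeof(struct ubifs_sb_node) != 4096);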
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
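The parenthesized formula in the @idx_leb_size comment is presumably
computed once at mount time and cached in the new field, along the
lines of:

	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;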
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb1..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
87{ 87{
88 struct buffer_head *bh = NULL; 88 struct buffer_head *bh = NULL;
89 int retval = 0; 89 int retval = 0;
90 kernel_lb_addr loc; 90 struct kernel_lb_addr loc;
91 91
92 loc.logicalBlockNum = bitmap->s_extPosition; 92 loc.logicalBlockNum = bitmap->s_extPosition;
93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
94 94
95 bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); 95 bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
96 if (!bh) 96 if (!bh)
97 retval = -EIO; 97 retval = -EIO;
98 98
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
140 return slot; 140 return slot;
141} 141}
142 142
143static bool udf_add_free_space(struct udf_sb_info *sbi, 143static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
144 u16 partition, u32 cnt)
145{ 144{
145 struct udf_sb_info *sbi = UDF_SB(sb);
146 struct logicalVolIntegrityDesc *lvid; 146 struct logicalVolIntegrityDesc *lvid;
147 147
148 if (sbi->s_lvid_bh == NULL) 148 if (!sbi->s_lvid_bh)
149 return false; 149 return;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 return true; 153 udf_updated_lvid(sb);
154} 154}
155 155
156static void udf_bitmap_free_blocks(struct super_block *sb, 156static void udf_bitmap_free_blocks(struct super_block *sb,
157 struct inode *inode, 157 struct inode *inode,
158 struct udf_bitmap *bitmap, 158 struct udf_bitmap *bitmap,
159 kernel_lb_addr bloc, uint32_t offset, 159 struct kernel_lb_addr *bloc,
160 uint32_t offset,
160 uint32_t count) 161 uint32_t count)
161{ 162{
162 struct udf_sb_info *sbi = UDF_SB(sb); 163 struct udf_sb_info *sbi = UDF_SB(sb);
163 struct buffer_head *bh = NULL; 164 struct buffer_head *bh = NULL;
165 struct udf_part_map *partmap;
164 unsigned long block; 166 unsigned long block;
165 unsigned long block_group; 167 unsigned long block_group;
166 unsigned long bit; 168 unsigned long bit;
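After this refactor the helper owns all of the LVID bookkeeping:
callers no longer test a return value, dirty the LVID buffer and set
sb->s_dirt themselves; udf_updated_lvid() records the change instead.
The caller-side simplification, as seen in the later hunks of this file:

	/* Before */
	if (udf_add_free_space(sbi, partition, -alloc_count))
		mark_buffer_dirty(sbi->s_lvid_bh);
	sb->s_dirt = 1;

	/* After */
	udf_add_free_space(sb, partition, -alloc_count);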
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
169 unsigned long overflow; 171 unsigned long overflow;
170 172
171 mutex_lock(&sbi->s_alloc_mutex); 173 mutex_lock(&sbi->s_alloc_mutex);
172 if (bloc.logicalBlockNum < 0 || 174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
173 (bloc.logicalBlockNum + count) > 175 if (bloc->logicalBlockNum < 0 ||
174 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 176 (bloc->logicalBlockNum + count) >
177 partmap->s_partition_len) {
175 udf_debug("%d < %d || %d + %d > %d\n", 178 udf_debug("%d < %d || %d + %d > %d\n",
176 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
177 sbi->s_partmaps[bloc.partitionReferenceNum]. 180 count, partmap->s_partition_len);
178 s_partition_len);
179 goto error_return; 181 goto error_return;
180 } 182 }
181 183
182 block = bloc.logicalBlockNum + offset + 184 block = bloc->logicalBlockNum + offset +
183 (sizeof(struct spaceBitmapDesc) << 3); 185 (sizeof(struct spaceBitmapDesc) << 3);
184 186
185 do { 187 do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
207 } else { 209 } else {
208 if (inode) 210 if (inode)
209 vfs_dq_free_block(inode, 1); 211 vfs_dq_free_block(inode, 1);
210 udf_add_free_space(sbi, sbi->s_partition, 1); 212 udf_add_free_space(sb, sbi->s_partition, 1);
211 } 213 }
212 } 214 }
213 mark_buffer_dirty(bh); 215 mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
218 } while (overflow); 220 } while (overflow);
219 221
220error_return: 222error_return:
221 sb->s_dirt = 1;
222 if (sbi->s_lvid_bh)
223 mark_buffer_dirty(sbi->s_lvid_bh);
224 mutex_unlock(&sbi->s_alloc_mutex); 223 mutex_unlock(&sbi->s_alloc_mutex);
225} 224}
226 225
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
277 } while (block_count > 0); 276 } while (block_count > 0);
278 277
279out: 278out:
280 if (udf_add_free_space(sbi, partition, -alloc_count)) 279 udf_add_free_space(sb, partition, -alloc_count);
281 mark_buffer_dirty(sbi->s_lvid_bh);
282 sb->s_dirt = 1;
283 mutex_unlock(&sbi->s_alloc_mutex); 280 mutex_unlock(&sbi->s_alloc_mutex);
284 return alloc_count; 281 return alloc_count;
285} 282}
@@ -409,9 +406,7 @@ got_block:
409 406
410 mark_buffer_dirty(bh); 407 mark_buffer_dirty(bh);
411 408
412 if (udf_add_free_space(sbi, partition, -1)) 409 udf_add_free_space(sb, partition, -1);
413 mark_buffer_dirty(sbi->s_lvid_bh);
414 sb->s_dirt = 1;
415 mutex_unlock(&sbi->s_alloc_mutex); 410 mutex_unlock(&sbi->s_alloc_mutex);
416 *err = 0; 411 *err = 0;
417 return newblock; 412 return newblock;
@@ -425,26 +420,28 @@ error_return:
425static void udf_table_free_blocks(struct super_block *sb, 420static void udf_table_free_blocks(struct super_block *sb,
426 struct inode *inode, 421 struct inode *inode,
427 struct inode *table, 422 struct inode *table,
428 kernel_lb_addr bloc, uint32_t offset, 423 struct kernel_lb_addr *bloc,
424 uint32_t offset,
429 uint32_t count) 425 uint32_t count)
430{ 426{
431 struct udf_sb_info *sbi = UDF_SB(sb); 427 struct udf_sb_info *sbi = UDF_SB(sb);
428 struct udf_part_map *partmap;
432 uint32_t start, end; 429 uint32_t start, end;
433 uint32_t elen; 430 uint32_t elen;
434 kernel_lb_addr eloc; 431 struct kernel_lb_addr eloc;
435 struct extent_position oepos, epos; 432 struct extent_position oepos, epos;
436 int8_t etype; 433 int8_t etype;
437 int i; 434 int i;
438 struct udf_inode_info *iinfo; 435 struct udf_inode_info *iinfo;
439 436
440 mutex_lock(&sbi->s_alloc_mutex); 437 mutex_lock(&sbi->s_alloc_mutex);
441 if (bloc.logicalBlockNum < 0 || 438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
442 (bloc.logicalBlockNum + count) > 439 if (bloc->logicalBlockNum < 0 ||
443 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 440 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) {
444 udf_debug("%d < %d || %d + %d > %d\n", 442 udf_debug("%d < %d || %d + %d > %d\n",
445 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 443 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
446 sbi->s_partmaps[bloc.partitionReferenceNum]. 444 partmap->s_partition_len);
447 s_partition_len);
448 goto error_return; 445 goto error_return;
449 } 446 }
450 447
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
453 could occur, but.. oh well */ 450 could occur, but.. oh well */
454 if (inode) 451 if (inode)
455 vfs_dq_free_block(inode, count); 452 vfs_dq_free_block(inode, count);
456 if (udf_add_free_space(sbi, sbi->s_partition, count)) 453 udf_add_free_space(sb, sbi->s_partition, count);
457 mark_buffer_dirty(sbi->s_lvid_bh);
458 454
459 start = bloc.logicalBlockNum + offset; 455 start = bloc->logicalBlockNum + offset;
460 end = bloc.logicalBlockNum + offset + count - 1; 456 end = bloc->logicalBlockNum + offset + count - 1;
461 457
462 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); 458 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
463 elen = 0; 459 elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
483 start += count; 479 start += count;
484 count = 0; 480 count = 0;
485 } 481 }
486 udf_write_aext(table, &oepos, eloc, elen, 1); 482 udf_write_aext(table, &oepos, &eloc, elen, 1);
487 } else if (eloc.logicalBlockNum == (end + 1)) { 483 } else if (eloc.logicalBlockNum == (end + 1)) {
488 if ((0x3FFFFFFF - elen) < 484 if ((0x3FFFFFFF - elen) <
489 (count << sb->s_blocksize_bits)) { 485 (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
502 end -= count; 498 end -= count;
503 count = 0; 499 count = 0;
504 } 500 }
505 udf_write_aext(table, &oepos, eloc, elen, 1); 501 udf_write_aext(table, &oepos, &eloc, elen, 1);
506 } 502 }
507 503
508 if (epos.bh != oepos.bh) { 504 if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
532 */ 528 */
533 529
534 int adsize; 530 int adsize;
535 short_ad *sad = NULL; 531 struct short_ad *sad = NULL;
536 long_ad *lad = NULL; 532 struct long_ad *lad = NULL;
537 struct allocExtDesc *aed; 533 struct allocExtDesc *aed;
538 534
539 eloc.logicalBlockNum = start; 535 eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
541 (count << sb->s_blocksize_bits); 537 (count << sb->s_blocksize_bits);
542 538
543 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 539 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
544 adsize = sizeof(short_ad); 540 adsize = sizeof(struct short_ad);
545 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 541 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
546 adsize = sizeof(long_ad); 542 adsize = sizeof(struct long_ad);
547 else { 543 else {
548 brelse(oepos.bh); 544 brelse(oepos.bh);
549 brelse(epos.bh); 545 brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
563 elen -= sb->s_blocksize; 559 elen -= sb->s_blocksize;
564 560
565 epos.bh = udf_tread(sb, 561 epos.bh = udf_tread(sb,
566 udf_get_lb_pblock(sb, epos.block, 0)); 562 udf_get_lb_pblock(sb, &epos.block, 0));
567 if (!epos.bh) { 563 if (!epos.bh) {
568 brelse(oepos.bh); 564 brelse(oepos.bh);
569 goto error_return; 565 goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
601 if (sbi->s_udfrev >= 0x0200) 597 if (sbi->s_udfrev >= 0x0200)
602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 598 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
603 3, 1, epos.block.logicalBlockNum, 599 3, 1, epos.block.logicalBlockNum,
604 sizeof(tag)); 600 sizeof(struct tag));
605 else 601 else
606 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
607 2, 1, epos.block.logicalBlockNum, 603 2, 1, epos.block.logicalBlockNum,
608 sizeof(tag)); 604 sizeof(struct tag));
609 605
610 switch (iinfo->i_alloc_type) { 606 switch (iinfo->i_alloc_type) {
611 case ICBTAG_FLAG_AD_SHORT: 607 case ICBTAG_FLAG_AD_SHORT:
612 sad = (short_ad *)sptr; 608 sad = (struct short_ad *)sptr;
613 sad->extLength = cpu_to_le32( 609 sad->extLength = cpu_to_le32(
614 EXT_NEXT_EXTENT_ALLOCDECS | 610 EXT_NEXT_EXTENT_ALLOCDECS |
615 sb->s_blocksize); 611 sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
617 cpu_to_le32(epos.block.logicalBlockNum); 613 cpu_to_le32(epos.block.logicalBlockNum);
618 break; 614 break;
619 case ICBTAG_FLAG_AD_LONG: 615 case ICBTAG_FLAG_AD_LONG:
620 lad = (long_ad *)sptr; 616 lad = (struct long_ad *)sptr;
621 lad->extLength = cpu_to_le32( 617 lad->extLength = cpu_to_le32(
622 EXT_NEXT_EXTENT_ALLOCDECS | 618 EXT_NEXT_EXTENT_ALLOCDECS |
623 sb->s_blocksize); 619 sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
635 631
636 /* It's possible that stealing the block emptied the extent */ 632 /* It's possible that stealing the block emptied the extent */
637 if (elen) { 633 if (elen) {
638 udf_write_aext(table, &epos, eloc, elen, 1); 634 udf_write_aext(table, &epos, &eloc, elen, 1);
639 635
640 if (!epos.bh) { 636 if (!epos.bh) {
641 iinfo->i_lenAlloc += adsize; 637 iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
653 brelse(oepos.bh); 649 brelse(oepos.bh);
654 650
655error_return: 651error_return:
656 sb->s_dirt = 1;
657 mutex_unlock(&sbi->s_alloc_mutex); 652 mutex_unlock(&sbi->s_alloc_mutex);
658 return; 653 return;
659} 654}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 struct udf_sb_info *sbi = UDF_SB(sb); 661 struct udf_sb_info *sbi = UDF_SB(sb);
667 int alloc_count = 0; 662 int alloc_count = 0;
668 uint32_t elen, adsize; 663 uint32_t elen, adsize;
669 kernel_lb_addr eloc; 664 struct kernel_lb_addr eloc;
670 struct extent_position epos; 665 struct extent_position epos;
671 int8_t etype = -1; 666 int8_t etype = -1;
672 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
677 672
678 iinfo = UDF_I(table); 673 iinfo = UDF_I(table);
679 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 674 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
680 adsize = sizeof(short_ad); 675 adsize = sizeof(struct short_ad);
681 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 676 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
682 adsize = sizeof(long_ad); 677 adsize = sizeof(struct long_ad);
683 else 678 else
684 return 0; 679 return 0;
685 680
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
707 alloc_count = block_count; 702 alloc_count = block_count;
708 eloc.logicalBlockNum += alloc_count; 703 eloc.logicalBlockNum += alloc_count;
709 elen -= (alloc_count << sb->s_blocksize_bits); 704 elen -= (alloc_count << sb->s_blocksize_bits);
710 udf_write_aext(table, &epos, eloc, 705 udf_write_aext(table, &epos, &eloc,
711 (etype << 30) | elen, 1); 706 (etype << 30) | elen, 1);
712 } else 707 } else
713 udf_delete_aext(table, epos, eloc, 708 udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
718 713
719 brelse(epos.bh); 714 brelse(epos.bh);
720 715
721 if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { 716 if (alloc_count)
722 mark_buffer_dirty(sbi->s_lvid_bh); 717 udf_add_free_space(sb, partition, -alloc_count);
723 sb->s_dirt = 1;
724 }
725 mutex_unlock(&sbi->s_alloc_mutex); 718 mutex_unlock(&sbi->s_alloc_mutex);
726 return alloc_count; 719 return alloc_count;
727} 720}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
735 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; 728 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
736 uint32_t newblock = 0, adsize; 729 uint32_t newblock = 0, adsize;
737 uint32_t elen, goal_elen = 0; 730 uint32_t elen, goal_elen = 0;
738 kernel_lb_addr eloc, uninitialized_var(goal_eloc); 731 struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
739 struct extent_position epos, goal_epos; 732 struct extent_position epos, goal_epos;
740 int8_t etype; 733 int8_t etype;
741 struct udf_inode_info *iinfo = UDF_I(table); 734 struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
743 *err = -ENOSPC; 736 *err = -ENOSPC;
744 737
745 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 738 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
746 adsize = sizeof(short_ad); 739 adsize = sizeof(struct short_ad);
747 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 740 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
748 adsize = sizeof(long_ad); 741 adsize = sizeof(struct long_ad);
749 else 742 else
750 return newblock; 743 return newblock;
751 744
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
814 } 807 }
815 808
816 if (goal_elen) 809 if (goal_elen)
817 udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); 810 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
818 else 811 else
819 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); 812 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
820 brelse(goal_epos.bh); 813 brelse(goal_epos.bh);
821 814
822 if (udf_add_free_space(sbi, partition, -1)) 815 udf_add_free_space(sb, partition, -1);
823 mark_buffer_dirty(sbi->s_lvid_bh);
824 816
825 sb->s_dirt = 1;
826 mutex_unlock(&sbi->s_alloc_mutex); 817 mutex_unlock(&sbi->s_alloc_mutex);
827 *err = 0; 818 *err = 0;
828 return newblock; 819 return newblock;
829} 820}
830 821
831inline void udf_free_blocks(struct super_block *sb, 822void udf_free_blocks(struct super_block *sb, struct inode *inode,
832 struct inode *inode, 823 struct kernel_lb_addr *bloc, uint32_t offset,
833 kernel_lb_addr bloc, uint32_t offset, 824 uint32_t count)
834 uint32_t count)
835{ 825{
836 uint16_t partition = bloc.partitionReferenceNum; 826 uint16_t partition = bloc->partitionReferenceNum;
837 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 827 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
838 828
839 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 829 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
840 return udf_bitmap_free_blocks(sb, inode, 830 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
841 map->s_uspace.s_bitmap, 831 bloc, offset, count);
842 bloc, offset, count);
843 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 832 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
844 return udf_table_free_blocks(sb, inode, 833 udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
845 map->s_uspace.s_table, 834 bloc, offset, count);
846 bloc, offset, count);
847 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 835 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
848 return udf_bitmap_free_blocks(sb, inode, 836 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
849 map->s_fspace.s_bitmap, 837 bloc, offset, count);
850 bloc, offset, count);
851 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 838 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
852 return udf_table_free_blocks(sb, inode, 839 udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
853 map->s_fspace.s_table, 840 bloc, offset, count);
854 bloc, offset, count);
855 } else {
856 return;
857 } 841 }
858} 842}
859 843
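With bloc now passed by pointer and the void return made explicit, a
typical call site looks like the ialloc.c hunk later in this diff:

	/* Free the single block holding the inode's ICB. */
	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);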
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
53 struct buffer_head *tmp, *bha[16]; 53 struct buffer_head *tmp, *bha[16];
54 kernel_lb_addr eloc; 54 struct kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num, ret = 0; 57 int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
80 ret = -ENOENT; 80 ret = -ENOENT;
81 goto out; 81 goto out;
82 } 82 }
83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 epos.offset -= sizeof(short_ad); 86 epos.offset -= sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == 87 else if (iinfo->i_alloc_type ==
88 ICBTAG_FLAG_AD_LONG) 88 ICBTAG_FLAG_AD_LONG)
89 epos.offset -= sizeof(long_ad); 89 epos.offset -= sizeof(struct long_ad);
90 } else { 90 } else {
91 offset = 0; 91 offset = 0;
92 } 92 }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
103 for (num = 0; i > 0; i--) { 103 for (num = 0; i > 0; i--) {
104 block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); 104 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
105 tmp = udf_tgetblk(dir->i_sb, block); 105 tmp = udf_tgetblk(dir->i_sb, block);
106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
107 bha[num++] = tmp; 107 bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
161 memcpy(fname, "..", flen); 161 memcpy(fname, "..", flen);
162 dt_type = DT_DIR; 162 dt_type = DT_DIR;
163 } else { 163 } else {
164 kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); 164 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
165 165
166 iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); 166 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
168 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
169 } 169 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
20 20
21#if 0 21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, 22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, kernel_lb_addr fe_loc, 23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh, 24 int *pos, int *offset, struct buffer_head **bh,
25 int *error) 25 int *error)
26{ 26{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 75 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 76 struct fileIdentDesc *cfi,
77 struct extent_position *epos, 77 struct extent_position *epos,
78 kernel_lb_addr *eloc, uint32_t *elen, 78 struct kernel_lb_addr *eloc, uint32_t *elen,
79 sector_t *offset) 79 sector_t *offset)
80{ 80{
81 struct fileIdentDesc *fi; 81 struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
111 (EXT_RECORDED_ALLOCATED >> 30)) 111 (EXT_RECORDED_ALLOCATED >> 30))
112 return NULL; 112 return NULL;
113 113
114 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 114 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
115 115
116 (*offset)++; 116 (*offset)++;
117 117
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
131 if (i + *offset > (*elen >> blocksize_bits)) 131 if (i + *offset > (*elen >> blocksize_bits))
132 i = (*elen >> blocksize_bits)-*offset; 132 i = (*elen >> blocksize_bits)-*offset;
133 for (num = 0; i > 0; i--) { 133 for (num = 0; i > 0; i--) {
134 block = udf_get_lb_pblock(dir->i_sb, *eloc, 134 block = udf_get_lb_pblock(dir->i_sb, eloc,
135 *offset + i); 135 *offset + i);
136 tmp = udf_tgetblk(dir->i_sb, block); 136 tmp = udf_tgetblk(dir->i_sb, block);
137 if (tmp && !buffer_uptodate(tmp) && 137 if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
169 (EXT_RECORDED_ALLOCATED >> 30)) 169 (EXT_RECORDED_ALLOCATED >> 30))
170 return NULL; 170 return NULL;
171 171
172 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 172 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
173 173
174 (*offset)++; 174 (*offset)++;
175 175
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
249} 249}
250 250
251#if 0 251#if 0
252static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) 252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{ 253{
254 extent_ad *ext; 254 struct extent_ad *ext;
255 struct fileEntry *fe; 255 struct fileEntry *fe;
256 uint8_t *ptr; 256 uint8_t *ptr;
257 257
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) 274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset; 275 ptr += *offset;
276 276
277 ext = (extent_ad *)ptr; 277 ext = (struct extent_ad *)ptr;
278 278
279 *offset = *offset + sizeof(extent_ad); 279 *offset = *offset + sizeof(struct extent_ad);
280 return ext; 280 return ext;
281} 281}
282#endif 282#endif
283 283
284short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 285 int inc)
286{ 286{
287 short_ad *sa; 287 struct short_ad *sa;
288 288
289 if ((!ptr) || (!offset)) { 289 if ((!ptr) || (!offset)) {
290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
291 return NULL; 291 return NULL;
292 } 292 }
293 293
294 if ((*offset + sizeof(short_ad)) > maxoffset) 294 if ((*offset + sizeof(struct short_ad)) > maxoffset)
295 return NULL; 295 return NULL;
296 else { 296 else {
297 sa = (short_ad *)ptr; 297 sa = (struct short_ad *)ptr;
298 if (sa->extLength == 0) 298 if (sa->extLength == 0)
299 return NULL; 299 return NULL;
300 } 300 }
301 301
302 if (inc) 302 if (inc)
303 *offset += sizeof(short_ad); 303 *offset += sizeof(struct short_ad);
304 return sa; 304 return sa;
305} 305}
306 306
307long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) 307struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
308{ 308{
309 long_ad *la; 309 struct long_ad *la;
310 310
311 if ((!ptr) || (!offset)) { 311 if ((!ptr) || (!offset)) {
312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); 312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
313 return NULL; 313 return NULL;
314 } 314 }
315 315
316 if ((*offset + sizeof(long_ad)) > maxoffset) 316 if ((*offset + sizeof(struct long_ad)) > maxoffset)
317 return NULL; 317 return NULL;
318 else { 318 else {
319 la = (long_ad *)ptr; 319 la = (struct long_ad *)ptr;
320 if (la->extLength == 0) 320 if (la->extLength == 0)
321 return NULL; 321 return NULL;
322 } 322 }
323 323
324 if (inc) 324 if (inc)
325 *offset += sizeof(long_ad); 325 *offset += sizeof(struct long_ad);
326 return la; 326 return la;
327} 327}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
38#define _ECMA_167_H 1 38#define _ECMA_167_H 1
39 39
40/* Character set specification (ECMA 167r3 1/7.2.1) */ 40/* Character set specification (ECMA 167r3 1/7.2.1) */
41typedef struct { 41struct charspec {
42 uint8_t charSetType; 42 uint8_t charSetType;
43 uint8_t charSetInfo[63]; 43 uint8_t charSetInfo[63];
44} __attribute__ ((packed)) charspec; 44} __attribute__ ((packed));
45 45
46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ 46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ 47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
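The rest of this header repeats the same mechanical conversion, in line
with kernel style's dislike of typedefs for plain structures;
schematically:

	/* Before: an anonymous struct hidden behind a typedef. */
	typedef struct {
		uint8_t charSetType;
		uint8_t charSetInfo[63];
	} __attribute__ ((packed)) charspec;

	/* After: a named struct, referenced as 'struct charspec' and
	 * measured as 'sizeof(struct charspec)'. */
	struct charspec {
		uint8_t charSetType;
		uint8_t charSetInfo[63];
	} __attribute__ ((packed));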
@@ -57,7 +57,7 @@ typedef struct {
57typedef uint8_t dstring; 57typedef uint8_t dstring;
58 58
59/* Timestamp (ECMA 167r3 1/7.3) */ 59/* Timestamp (ECMA 167r3 1/7.3) */
60typedef struct { 60struct timestamp {
61 __le16 typeAndTimezone; 61 __le16 typeAndTimezone;
62 __le16 year; 62 __le16 year;
63 uint8_t month; 63 uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
68 uint8_t centiseconds; 68 uint8_t centiseconds;
69 uint8_t hundredsOfMicroseconds; 69 uint8_t hundredsOfMicroseconds;
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed));
72 72
73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
74#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF 78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
79 79
80/* Entity identifier (ECMA 167r3 1/7.4) */ 80/* Entity identifier (ECMA 167r3 1/7.4) */
81typedef struct { 81struct regid {
82 uint8_t flags; 82 uint8_t flags;
83 uint8_t ident[23]; 83 uint8_t ident[23];
84 uint8_t identSuffix[8]; 84 uint8_t identSuffix[8];
85} __attribute__ ((packed)) regid; 85} __attribute__ ((packed));
86 86
87/* Flags (ECMA 167r3 1/7.4.1) */ 87/* Flags (ECMA 167r3 1/7.4.1) */
88#define ENTITYID_FLAGS_DIRTY 0x00 88#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
126 126
127/* Boot Descriptor (ECMA 167r3 2/9.4) */ 127/* Boot Descriptor (ECMA 167r3 2/9.4) */
128struct bootDesc { 128struct bootDesc {
129 uint8_t structType; 129 uint8_t structType;
130 uint8_t stdIdent[VSD_STD_ID_LEN]; 130 uint8_t stdIdent[VSD_STD_ID_LEN];
131 uint8_t structVersion; 131 uint8_t structVersion;
132 uint8_t reserved1; 132 uint8_t reserved1;
133 regid archType; 133 struct regid archType;
134 regid bootIdent; 134 struct regid bootIdent;
135 __le32 bootExtLocation; 135 __le32 bootExtLocation;
136 __le32 bootExtLength; 136 __le32 bootExtLength;
137 __le64 loadAddress; 137 __le64 loadAddress;
138 __le64 startAddress; 138 __le64 startAddress;
139 timestamp descCreationDateAndTime; 139 struct timestamp descCreationDateAndTime;
140 __le16 flags; 140 __le16 flags;
141 uint8_t reserved2[32]; 141 uint8_t reserved2[32];
142 uint8_t bootUse[1906]; 142 uint8_t bootUse[1906];
143} __attribute__ ((packed)); 143} __attribute__ ((packed));
144 144
145/* Flags (ECMA 167r3 2/9.4.12) */ 145/* Flags (ECMA 167r3 2/9.4.12) */
146#define BOOT_FLAGS_ERASE 0x01 146#define BOOT_FLAGS_ERASE 0x01
147 147
148/* Extent Descriptor (ECMA 167r3 3/7.1) */ 148/* Extent Descriptor (ECMA 167r3 3/7.1) */
149typedef struct { 149struct extent_ad {
150 __le32 extLength; 150 __le32 extLength;
151 __le32 extLocation; 151 __le32 extLocation;
152} __attribute__ ((packed)) extent_ad; 152} __attribute__ ((packed));
153 153
154typedef struct { 154struct kernel_extent_ad {
155 uint32_t extLength; 155 uint32_t extLength;
156 uint32_t extLocation; 156 uint32_t extLocation;
157} kernel_extent_ad; 157};
158 158
159/* Descriptor Tag (ECMA 167r3 3/7.2) */ 159/* Descriptor Tag (ECMA 167r3 3/7.2) */
160typedef struct { 160struct tag {
161 __le16 tagIdent; 161 __le16 tagIdent;
162 __le16 descVersion; 162 __le16 descVersion;
163 uint8_t tagChecksum; 163 uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
166 __le16 descCRC; 166 __le16 descCRC;
167 __le16 descCRCLength; 167 __le16 descCRCLength;
168 __le32 tagLocation; 168 __le32 tagLocation;
169} __attribute__ ((packed)) tag; 169} __attribute__ ((packed));
170 170
171/* Tag Identifier (ECMA 167r3 3/7.2.1) */ 171/* Tag Identifier (ECMA 167r3 3/7.2.1) */
172#define TAG_IDENT_PVD 0x0001 172#define TAG_IDENT_PVD 0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
190 190
191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ 191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
192struct primaryVolDesc { 192struct primaryVolDesc {
193 tag descTag; 193 struct tag descTag;
194 __le32 volDescSeqNum; 194 __le32 volDescSeqNum;
195 __le32 primaryVolDescNum; 195 __le32 primaryVolDescNum;
196 dstring volIdent[32]; 196 dstring volIdent[32];
197 __le16 volSeqNum; 197 __le16 volSeqNum;
198 __le16 maxVolSeqNum; 198 __le16 maxVolSeqNum;
199 __le16 interchangeLvl; 199 __le16 interchangeLvl;
200 __le16 maxInterchangeLvl; 200 __le16 maxInterchangeLvl;
201 __le32 charSetList; 201 __le32 charSetList;
202 __le32 maxCharSetList; 202 __le32 maxCharSetList;
203 dstring volSetIdent[128]; 203 dstring volSetIdent[128];
204 charspec descCharSet; 204 struct charspec descCharSet;
205 charspec explanatoryCharSet; 205 struct charspec explanatoryCharSet;
206 extent_ad volAbstract; 206 struct extent_ad volAbstract;
207 extent_ad volCopyright; 207 struct extent_ad volCopyright;
208 regid appIdent; 208 struct regid appIdent;
209 timestamp recordingDateAndTime; 209 struct timestamp recordingDateAndTime;
210 regid impIdent; 210 struct regid impIdent;
211 uint8_t impUse[64]; 211 uint8_t impUse[64];
212 __le32 predecessorVolDescSeqLocation; 212 __le32 predecessorVolDescSeqLocation;
213 __le16 flags; 213 __le16 flags;
214 uint8_t reserved[22]; 214 uint8_t reserved[22];
215} __attribute__ ((packed)); 215} __attribute__ ((packed));
216 216
217/* Flags (ECMA 167r3 3/10.1.21) */ 217/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
219 219
220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ 220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
221struct anchorVolDescPtr { 221struct anchorVolDescPtr {
222 tag descTag; 222 struct tag descTag;
223 extent_ad mainVolDescSeqExt; 223 struct extent_ad mainVolDescSeqExt;
224 extent_ad reserveVolDescSeqExt; 224 struct extent_ad reserveVolDescSeqExt;
225 uint8_t reserved[480]; 225 uint8_t reserved[480];
226} __attribute__ ((packed)); 226} __attribute__ ((packed));
227 227
228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ 228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
229struct volDescPtr { 229struct volDescPtr {
230 tag descTag; 230 struct tag descTag;
231 __le32 volDescSeqNum; 231 __le32 volDescSeqNum;
232 extent_ad nextVolDescSeqExt; 232 struct extent_ad nextVolDescSeqExt;
233 uint8_t reserved[484]; 233 uint8_t reserved[484];
234} __attribute__ ((packed)); 234} __attribute__ ((packed));
235 235
236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ 236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
237struct impUseVolDesc { 237struct impUseVolDesc {
238 tag descTag; 238 struct tag descTag;
239 __le32 volDescSeqNum; 239 __le32 volDescSeqNum;
240 regid impIdent; 240 struct regid impIdent;
241 uint8_t impUse[460]; 241 uint8_t impUse[460];
242} __attribute__ ((packed)); 242} __attribute__ ((packed));
243 243
244/* Partition Descriptor (ECMA 167r3 3/10.5) */ 244/* Partition Descriptor (ECMA 167r3 3/10.5) */
245struct partitionDesc { 245struct partitionDesc {
246 tag descTag; 246 struct tag descTag;
247 __le32 volDescSeqNum; 247 __le32 volDescSeqNum;
248 __le16 partitionFlags; 248 __le16 partitionFlags;
249 __le16 partitionNumber; 249 __le16 partitionNumber;
250 regid partitionContents; 250 struct regid partitionContents;
251 uint8_t partitionContentsUse[128]; 251 uint8_t partitionContentsUse[128];
252 __le32 accessType; 252 __le32 accessType;
253 __le32 partitionStartingLocation; 253 __le32 partitionStartingLocation;
254 __le32 partitionLength; 254 __le32 partitionLength;
255 regid impIdent; 255 struct regid impIdent;
256 uint8_t impUse[128]; 256 uint8_t impUse[128];
257 uint8_t reserved[156]; 257 uint8_t reserved[156];
258} __attribute__ ((packed)); 258} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
278 278
279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ 279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
280struct logicalVolDesc { 280struct logicalVolDesc {
281 tag descTag; 281 struct tag descTag;
282 __le32 volDescSeqNum; 282 __le32 volDescSeqNum;
283 charspec descCharSet; 283 struct charspec descCharSet;
284 dstring logicalVolIdent[128]; 284 dstring logicalVolIdent[128];
285 __le32 logicalBlockSize; 285 __le32 logicalBlockSize;
286 regid domainIdent; 286 struct regid domainIdent;
287 uint8_t logicalVolContentsUse[16]; 287 uint8_t logicalVolContentsUse[16];
288 __le32 mapTableLength; 288 __le32 mapTableLength;
289 __le32 numPartitionMaps; 289 __le32 numPartitionMaps;
290 regid impIdent; 290 struct regid impIdent;
291 uint8_t impUse[128]; 291 uint8_t impUse[128];
292 extent_ad integritySeqExt; 292 struct extent_ad integritySeqExt;
293 uint8_t partitionMaps[0]; 293 uint8_t partitionMaps[0];
294} __attribute__ ((packed)); 294} __attribute__ ((packed));
295 295
296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ 296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
322 322
323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ 323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
324struct unallocSpaceDesc { 324struct unallocSpaceDesc {
325 tag descTag; 325 struct tag descTag;
326 __le32 volDescSeqNum; 326 __le32 volDescSeqNum;
327 __le32 numAllocDescs; 327 __le32 numAllocDescs;
328 extent_ad allocDescs[0]; 328 struct extent_ad allocDescs[0];
329} __attribute__ ((packed)); 329} __attribute__ ((packed));
330 330
331/* Terminating Descriptor (ECMA 167r3 3/10.9) */ 331/* Terminating Descriptor (ECMA 167r3 3/10.9) */
332struct terminatingDesc { 332struct terminatingDesc {
333 tag descTag; 333 struct tag descTag;
334 uint8_t reserved[496]; 334 uint8_t reserved[496];
335} __attribute__ ((packed)); 335} __attribute__ ((packed));
336 336
337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ 337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
338struct logicalVolIntegrityDesc { 338struct logicalVolIntegrityDesc {
339 tag descTag; 339 struct tag descTag;
340 timestamp recordingDateAndTime; 340 struct timestamp recordingDateAndTime;
341 __le32 integrityType; 341 __le32 integrityType;
342 extent_ad nextIntegrityExt; 342 struct extent_ad nextIntegrityExt;
343 uint8_t logicalVolContentsUse[32]; 343 uint8_t logicalVolContentsUse[32];
344 __le32 numOfPartitions; 344 __le32 numOfPartitions;
345 __le32 lengthOfImpUse; 345 __le32 lengthOfImpUse;
346 __le32 freeSpaceTable[0]; 346 __le32 freeSpaceTable[0];
347 __le32 sizeTable[0]; 347 __le32 sizeTable[0];
348 uint8_t impUse[0]; 348 uint8_t impUse[0];
349} __attribute__ ((packed)); 349} __attribute__ ((packed));
350 350
351/* Integrity Type (ECMA 167r3 3/10.10.3) */ 351/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001
354 354
355/* Recorded Address (ECMA 167r3 4/7.1) */ 355/* Recorded Address (ECMA 167r3 4/7.1) */
356typedef struct { 356struct lb_addr {
357 __le32 logicalBlockNum; 357 __le32 logicalBlockNum;
358 __le16 partitionReferenceNum; 358 __le16 partitionReferenceNum;
359} __attribute__ ((packed)) lb_addr; 359} __attribute__ ((packed));
360 360
361/* ... and its in-core analog */ 361/* ... and its in-core analog */
362typedef struct { 362struct kernel_lb_addr {
363 uint32_t logicalBlockNum; 363 uint32_t logicalBlockNum;
364 uint16_t partitionReferenceNum; 364 uint16_t partitionReferenceNum;
365} kernel_lb_addr; 365};
366 366
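The on-disk lb_addr keeps packed little-endian fields while its in-core
analog uses native types, so a conversion helper sits between them; a
sketch matching the lelb_to_cpu() call in the dir.c hunk earlier in
this diff (the body here is assumed, cf. fs/udf/udfend.h):

	static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
	{
		struct kernel_lb_addr out;

		out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
		out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
		return out;
	}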
367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ 367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
368typedef struct { 368struct short_ad {
369 __le32 extLength; 369 __le32 extLength;
370 __le32 extPosition; 370 __le32 extPosition;
371} __attribute__ ((packed)) short_ad; 371} __attribute__ ((packed));
372 372
373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ 373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
374typedef struct { 374struct long_ad {
375 __le32 extLength; 375 __le32 extLength;
376 lb_addr extLocation; 376 struct lb_addr extLocation;
377 uint8_t impUse[6]; 377 uint8_t impUse[6];
378} __attribute__ ((packed)) long_ad; 378} __attribute__ ((packed));
379 379
380typedef struct { 380struct kernel_long_ad {
381 uint32_t extLength; 381 uint32_t extLength;
382 kernel_lb_addr extLocation; 382 struct kernel_lb_addr extLocation;
383 uint8_t impUse[6]; 383 uint8_t impUse[6];
384} kernel_long_ad; 384};
385 385
386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ 386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
387typedef struct { 387struct ext_ad {
388 __le32 extLength; 388 __le32 extLength;
389 __le32 recordedLength; 389 __le32 recordedLength;
390 __le32 informationLength; 390 __le32 informationLength;
391 lb_addr extLocation; 391 struct lb_addr extLocation;
392} __attribute__ ((packed)) ext_ad; 392} __attribute__ ((packed));
393 393
394typedef struct { 394struct kernel_ext_ad {
395 uint32_t extLength; 395 uint32_t extLength;
396 uint32_t recordedLength; 396 uint32_t recordedLength;
397 uint32_t informationLength; 397 uint32_t informationLength;
398 kernel_lb_addr extLocation; 398 struct kernel_lb_addr extLocation;
399} kernel_ext_ad; 399};
400 400
401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ 401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
402 402
@@ -415,44 +415,44 @@ typedef struct {
415 415
416/* File Set Descriptor (ECMA 167r3 4/14.1) */ 416/* File Set Descriptor (ECMA 167r3 4/14.1) */
417struct fileSetDesc { 417struct fileSetDesc {
418 tag descTag; 418 struct tag descTag;
419 timestamp recordingDateAndTime; 419 struct timestamp recordingDateAndTime;
420 __le16 interchangeLvl; 420 __le16 interchangeLvl;
421 __le16 maxInterchangeLvl; 421 __le16 maxInterchangeLvl;
422 __le32 charSetList; 422 __le32 charSetList;
423 __le32 maxCharSetList; 423 __le32 maxCharSetList;
424 __le32 fileSetNum; 424 __le32 fileSetNum;
425 __le32 fileSetDescNum; 425 __le32 fileSetDescNum;
426 charspec logicalVolIdentCharSet; 426 struct charspec logicalVolIdentCharSet;
427 dstring logicalVolIdent[128]; 427 dstring logicalVolIdent[128];
428 charspec fileSetCharSet; 428 struct charspec fileSetCharSet;
429 dstring fileSetIdent[32]; 429 dstring fileSetIdent[32];
430 dstring copyrightFileIdent[32]; 430 dstring copyrightFileIdent[32];
431 dstring abstractFileIdent[32]; 431 dstring abstractFileIdent[32];
432 long_ad rootDirectoryICB; 432 struct long_ad rootDirectoryICB;
433 regid domainIdent; 433 struct regid domainIdent;
434 long_ad nextExt; 434 struct long_ad nextExt;
435 long_ad streamDirectoryICB; 435 struct long_ad streamDirectoryICB;
436 uint8_t reserved[32]; 436 uint8_t reserved[32];
437} __attribute__ ((packed)); 437} __attribute__ ((packed));
438 438
439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ 439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */
440struct partitionHeaderDesc { 440struct partitionHeaderDesc {
441 short_ad unallocSpaceTable; 441 struct short_ad unallocSpaceTable;
442 short_ad unallocSpaceBitmap; 442 struct short_ad unallocSpaceBitmap;
443 short_ad partitionIntegrityTable; 443 struct short_ad partitionIntegrityTable;
444 short_ad freedSpaceTable; 444 struct short_ad freedSpaceTable;
445 short_ad freedSpaceBitmap; 445 struct short_ad freedSpaceBitmap;
446 uint8_t reserved[88]; 446 uint8_t reserved[88];
447} __attribute__ ((packed)); 447} __attribute__ ((packed));
448 448
449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ 449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */
450struct fileIdentDesc { 450struct fileIdentDesc {
451 tag descTag; 451 struct tag descTag;
452 __le16 fileVersionNum; 452 __le16 fileVersionNum;
453 uint8_t fileCharacteristics; 453 uint8_t fileCharacteristics;
454 uint8_t lengthFileIdent; 454 uint8_t lengthFileIdent;
455 long_ad icb; 455 struct long_ad icb;
456 __le16 lengthOfImpUse; 456 __le16 lengthOfImpUse;
457 uint8_t impUse[0]; 457 uint8_t impUse[0];
458 uint8_t fileIdent[0]; 458 uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
468 468
469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ 469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
470struct allocExtDesc { 470struct allocExtDesc {
471 tag descTag; 471 struct tag descTag;
472 __le32 previousAllocExtLocation; 472 __le32 previousAllocExtLocation;
473 __le32 lengthAllocDescs; 473 __le32 lengthAllocDescs;
474} __attribute__ ((packed)); 474} __attribute__ ((packed));
475 475
476/* ICB Tag (ECMA 167r3 4/14.6) */ 476/* ICB Tag (ECMA 167r3 4/14.6) */
477typedef struct { 477struct icbtag {
478 __le32 priorRecordedNumDirectEntries; 478 __le32 priorRecordedNumDirectEntries;
479 __le16 strategyType; 479 __le16 strategyType;
480 __le16 strategyParameter; 480 __le16 strategyParameter;
481 __le16 numEntries; 481 __le16 numEntries;
482 uint8_t reserved; 482 uint8_t reserved;
483 uint8_t fileType; 483 uint8_t fileType;
484 lb_addr parentICBLocation; 484 struct lb_addr parentICBLocation;
485 __le16 flags; 485 __le16 flags;
486} __attribute__ ((packed)) icbtag; 486} __attribute__ ((packed));
487 487
488/* Strategy Type (ECMA 167r3 4/14.6.2) */ 488/* Strategy Type (ECMA 167r3 4/14.6.2) */
489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000
@@ -528,41 +528,41 @@ typedef struct {
528 528
529/* Indirect Entry (ECMA 167r3 4/14.7) */ 529/* Indirect Entry (ECMA 167r3 4/14.7) */
530struct indirectEntry { 530struct indirectEntry {
531 tag descTag; 531 struct tag descTag;
532 icbtag icbTag; 532 struct icbtag icbTag;
533 long_ad indirectICB; 533 struct long_ad indirectICB;
534} __attribute__ ((packed)); 534} __attribute__ ((packed));
535 535
536/* Terminal Entry (ECMA 167r3 4/14.8) */ 536/* Terminal Entry (ECMA 167r3 4/14.8) */
537struct terminalEntry { 537struct terminalEntry {
538 tag descTag; 538 struct tag descTag;
539 icbtag icbTag; 539 struct icbtag icbTag;
540} __attribute__ ((packed)); 540} __attribute__ ((packed));
541 541
542/* File Entry (ECMA 167r3 4/14.9) */ 542/* File Entry (ECMA 167r3 4/14.9) */
543struct fileEntry { 543struct fileEntry {
544 tag descTag; 544 struct tag descTag;
545 icbtag icbTag; 545 struct icbtag icbTag;
546 __le32 uid; 546 __le32 uid;
547 __le32 gid; 547 __le32 gid;
548 __le32 permissions; 548 __le32 permissions;
549 __le16 fileLinkCount; 549 __le16 fileLinkCount;
550 uint8_t recordFormat; 550 uint8_t recordFormat;
551 uint8_t recordDisplayAttr; 551 uint8_t recordDisplayAttr;
552 __le32 recordLength; 552 __le32 recordLength;
553 __le64 informationLength; 553 __le64 informationLength;
554 __le64 logicalBlocksRecorded; 554 __le64 logicalBlocksRecorded;
555 timestamp accessTime; 555 struct timestamp accessTime;
556 timestamp modificationTime; 556 struct timestamp modificationTime;
557 timestamp attrTime; 557 struct timestamp attrTime;
558 __le32 checkpoint; 558 __le32 checkpoint;
559 long_ad extendedAttrICB; 559 struct long_ad extendedAttrICB;
560 regid impIdent; 560 struct regid impIdent;
561 __le64 uniqueID; 561 __le64 uniqueID;
562 __le32 lengthExtendedAttr; 562 __le32 lengthExtendedAttr;
563 __le32 lengthAllocDescs; 563 __le32 lengthAllocDescs;
564 uint8_t extendedAttr[0]; 564 uint8_t extendedAttr[0];
565 uint8_t allocDescs[0]; 565 uint8_t allocDescs[0];
566} __attribute__ ((packed)); 566} __attribute__ ((packed));
567 567
568/* Permissions (ECMA 167r3 4/14.9.5) */ 568/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
604 604
605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ 605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 impAttrLocation;
 	__le32 appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 impUseLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 appUseLength;
-	regid appIdent;
+	struct regid appIdent;
 	uint8_t appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 lengthAllocDescs;
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 numOfBits;
 	__le32 numOfBytes;
 	uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag descTag;
-	icbtag icbTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct timestamp recordingDateAndTime;
 	uint8_t integrityType;
 	uint8_t reserved[175];
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 objectSize;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp createTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp createTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
 	__le32 reserved;
-	long_ad extendedAttrICB;
-	long_ad streamDirectoryICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct long_ad streamDirectoryICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
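
The ecma_167.h hunks above are one mechanical transformation: the bare typedefs for the on-disk ECMA 167 records (tag, icbtag, regid, timestamp, short_ad, long_ad) become plainly tagged structs, so every use site now reads "struct tag", "struct regid", and so on. A minimal standalone sketch of the pattern -- illustrative only, not part of the patch; the 16-byte tag layout follows ECMA 167r3 3/7.2, and the kernel's __le16/__le32 types are shown as plain fixed-width integers so the sketch compiles on its own:

	#include <stdint.h>

	/* Before: an anonymous struct hidden behind a bare typedef. At a
	 * call site, "tag" reads like a scalar type and the struct cannot
	 * be forward-declared without pulling in the full definition. */
	typedef struct {
		uint16_t tagIdent;
		/* ... remaining fields ... */
	} tag;

	/* After: a named struct. Use sites spell out "struct tag", and
	 * other headers may forward-declare it. */
	struct tag {
		uint16_t tagIdent;	/* Tag Identifier */
		uint16_t descVersion;	/* Descriptor Version */
		uint8_t  tagChecksum;	/* checksum over the tag itself */
		uint8_t  reserved;
		uint16_t tagSerialNum;
		uint16_t descCRC;	/* CRC over the descriptor body */
		uint16_t descCRCLength;
		uint32_t tagLocation;	/* block holding this tag */
	} __attribute__ ((packed));
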
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f90..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
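
The ialloc.c hunks introduce the second pattern of this series: address helpers such as udf_get_lb_pblock() and udf_free_blocks() now take a struct kernel_lb_addr * rather than a by-value copy of the struct. A minimal sketch of the calling-convention change -- the helper names here are hypothetical, and the address-to-block mapping is a placeholder (the real udf_get_lb_pblock() resolves through the partition maps):

	#include <stdint.h>

	/* Logical block address within a UDF partition. */
	struct kernel_lb_addr {
		uint32_t logicalBlockNum;
		uint16_t partitionReferenceNum;
	};

	/* Old convention: the whole struct is copied for every call. */
	static uint32_t pblock_byval(struct kernel_lb_addr loc, uint32_t off)
	{
		return loc.logicalBlockNum + off;	/* placeholder mapping */
	}

	/* New convention: callers pass a pointer; no copy is made and one
	 * address can be handed down through a chain of helpers. */
	static uint32_t pblock_byref(const struct kernel_lb_addr *loc,
				     uint32_t off)
	{
		return loc->logicalBlockNum + off;	/* placeholder mapping */
	}

	int main(void)
	{
		struct kernel_lb_addr loc = { 64, 0 };

		/* Both forms compute the same value; only the argument
		 * passing differs, as in the hunks above and below. */
		return pblock_byval(loc, 1) == pblock_byref(&loc, 1) ? 0 : 1;
	}
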
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 			       last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, prealloc_loc,
+		if (udf_add_aext(inode, last_pos, &prealloc_loc,
 				 prealloc_len, 1) == -1)
 			return -1;
 		last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
 	/* last_pos should point to the last written extent... */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		last_pos->offset -= sizeof(short_ad);
+		last_pos->offset -= sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		last_pos->offset -= sizeof(long_ad);
+		last_pos->offset -= sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
 	static sector_t last_block;
 	struct buffer_head *result = NULL;
-	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
 	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
 	uint32_t elen = 0, tmpelen;
-	kernel_lb_addr eloc, tmpeloc;
+	struct kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
 	loff_t lbcount = 0, b_off = 0;
 	uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		elen = EXT_RECORDED_ALLOCATED |
 			((elen + inode->i_sb->s_blocksize - 1) &
 			 ~(inode->i_sb->s_blocksize - 1));
-		etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+		etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 	}
 	brelse(prev_epos.bh);
 	brelse(cur_epos.bh);
 	brelse(next_epos.bh);
-	newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+	newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	*phys = newblock;
 	return NULL;
 	}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	} else {
 		/* Create a fake extent when there's not one */
 		memset(&laarr[0].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 		/* Will udf_extend_file() create real extent from
 		   a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			inode->i_sb->s_blocksize;
 		memset(&laarr[c].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		count++;
 		endnum++;
 	}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
 			      int newblocknum,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 	if (offset) {
 		if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					laarr[curr].extLocation,
+					&laarr[curr].extLocation,
 					0, offset);
 			laarr[curr].extLength =
 				EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 				inode->i_sb->s_blocksize_bits);
 		else {
 			memmove(&laarr[c + 2], &laarr[c + 1],
-				sizeof(long_ad) * (*endnum - (c + 1)));
+				sizeof(struct long_ad) * (*endnum - (c + 1)));
 			(*endnum)++;
 			laarr[c + 1].extLocation.logicalBlockNum = next;
 			laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 			if (*endnum > (i + 1))
 				memmove(&laarr[i],
 					&laarr[i + 1],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 1)));
 			i--;
 			(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
 	for (i = 0; i < (*endnum - 1); i++) {
-		kernel_long_ad *li /*l[i]*/ = &laarr[i];
-		kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
 		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
 		    (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
 		      (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
 		    ((lip1->extLength >> 30) ==
 		      (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-			udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
 					((li->extLength &
 					  UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
 		} else if ((li->extLength >> 30) ==
 			   (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					li->extLocation, 0,
+					&li->extLocation, 0,
 					((li->extLength &
 					  UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-			       kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
 	int start = 0, i;
-	kernel_lb_addr tmploc;
+	struct kernel_lb_addr tmploc;
 	uint32_t tmplen;
 
 	if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
 	for (i = start; i < endnum; i++) {
 		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-		udf_write_aext(inode, epos, laarr[i].extLocation,
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
 			       laarr[i].extLength, 1);
 	}
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
 			inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 					&ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct buffer_head *nbh = NULL;
-			kernel_lb_addr loc;
+			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
 						    &ident))) {
 				if (ident == TAG_IDENT_FE ||
 					ident == TAG_IDENT_EFE) {
 					memcpy(&iinfo->i_location,
 						&loc,
-						sizeof(kernel_lb_addr));
+						sizeof(struct kernel_lb_addr));
 					brelse(bh);
 					brelse(ibh);
 					brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	inode->i_size = le64_to_cpu(fe->informationLength);
 	iinfo->i_lenExtents = inode->i_size;
 
-	inode->i_mode = udf_convert_permissions(fe);
-	inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+	    sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+		 sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
 
 	if (iinfo->i_efe == 0) {
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	bh = udf_tread(inode->i_sb,
 			udf_get_lb_pblock(inode->i_sb,
-					  iinfo->i_location, 0));
+					  &iinfo->i_location, 0));
 	if (!bh) {
 		udf_debug("bread failure\n");
 		return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(tag);
+				iinfo->i_lenAlloc - sizeof(struct tag);
 		use->descTag.tagLocation = cpu_to_le32(
						iinfo->i_location.
							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							sizeof(tag),
+							sizeof(struct tag),
 							crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->informationLength = cpu_to_le64(inode->i_size);
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		regid *eid;
+		struct regid *eid;
 		struct deviceSpec *dsea =
 			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
 		if (!dsea) {
 			dsea = (struct deviceSpec *)
 				udf_add_extendedattr(inode,
 						     sizeof(struct deviceSpec) +
-						     sizeof(regid), 12, 0x3);
+						     sizeof(struct regid), 12, 0x3);
 			dsea->attrType = cpu_to_le32(12);
 			dsea->attrSubtype = 1;
 			dsea->attrLength = cpu_to_le32(
						sizeof(struct deviceSpec) +
-						sizeof(regid));
-			dsea->impUseLength = cpu_to_le32(sizeof(regid));
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
 		}
-		eid = (regid *)dsea->impUse;
-		memset(eid, 0, sizeof(regid));
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
 		strcpy(eid->ident, UDF_ID_DEVELOPER);
 		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
 		eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 	udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
 	udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-	memset(&(fe->impIdent), 0, sizeof(regid));
+	memset(&(fe->impIdent), 0, sizeof(struct regid));
 	strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 	fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 		udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-		memset(&(efe->impIdent), 0, sizeof(regid));
+		memset(&(efe->impIdent), 0, sizeof(struct regid));
 		strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
 		efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagLocation = cpu_to_le32(
					iinfo->i_location.logicalBlockNum);
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-		  sizeof(tag);
+		  sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						     crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 		return NULL;
 
 	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
 		__udf_read_inode(inode);
 		unlock_new_inode(inode);
 	}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	if (is_bad_inode(inode))
 		goto out_iput;
 
-	if (ino.logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+	if (ino->logicalBlockNum >= UDF_SB(sb)->
+			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
-			  ino.logicalBlockNum, ino.partitionReferenceNum);
+			  ino->logicalBlockNum, ino->partitionReferenceNum);
 		make_bad_inode(inode);
 		goto out_iput;
 	}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    kernel_lb_addr eloc, uint32_t elen, int inc)
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
-	short_ad *sad = NULL;
-	long_ad *lad = NULL;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
 	int8_t etype;
 	uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		ptr = epos->bh->b_data + epos->offset;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = epos->block;
+		struct kernel_lb_addr obloc = epos->block;
 
 		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 						obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		if (!epos->block.logicalBlockNum)
 			return -1;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 epos->block,
+								 &epos->block,
 								 0));
 		if (!nbh)
 			return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 	}
 	if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
 		udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-			    epos->block.logicalBlockNum, sizeof(tag));
+			    epos->block.logicalBlockNum, sizeof(struct tag));
 	else
 		udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-			    epos->block.logicalBlockNum, sizeof(tag));
+			    epos->block.logicalBlockNum, sizeof(struct tag));
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)sptr;
+		sad = (struct short_ad *)sptr;
 		sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 					     inode->i_sb->s_blocksize);
 		sad->extPosition =
 			cpu_to_le32(epos->block.logicalBlockNum);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)sptr;
+		lad = (struct long_ad *)sptr;
 		lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 					     inode->i_sb->s_blocksize);
 		lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      kernel_lb_addr eloc, uint32_t elen, int inc)
+		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)ptr;
+		sad = (struct short_ad *)ptr;
 		sad->extLength = cpu_to_le32(elen);
-		sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-		adsize = sizeof(short_ad);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)ptr;
+		lad = (struct long_ad *)ptr;
 		lad->extLength = cpu_to_le32(elen);
-		lad->extLocation = cpu_to_lelb(eloc);
+		lad->extLocation = cpu_to_lelb(*eloc);
 		memset(lad->impUse, 0x00, sizeof(lad->impUse));
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 		break;
 	default:
 		return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-		     kernel_lb_addr *eloc, uint32_t *elen, int inc)
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 		epos->block = *eloc;
 		epos->offset = sizeof(struct allocExtDesc);
 		brelse(epos->bh);
-		block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
 		epos->bh = udf_tread(inode->i_sb, block);
 		if (!epos->bh) {
 			udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-			kernel_lb_addr *eloc, uint32_t *elen, int inc)
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-			      kernel_lb_addr neloc, uint32_t nelen)
+			      struct kernel_lb_addr neloc, uint32_t nelen)
 {
-	kernel_lb_addr oeloc;
+	struct kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
 		get_bh(epos.bh);
 
 	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-		udf_write_aext(inode, &epos, neloc, nelen, 1);
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
 	brelse(epos.bh);
 
 	return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-		       kernel_lb_addr eloc, uint32_t elen)
+		       struct kernel_lb_addr eloc, uint32_t elen)
 {
 	struct extent_position oepos;
 	int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
 	iinfo = UDF_I(inode);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 		return -1;
 
 	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
 		if (oepos.bh != epos.bh) {
 			oepos.block = epos.block;
 			brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			oepos.offset = epos.offset - adsize;
 		}
 	}
-	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
 	elen = 0;
 
 	if (epos.bh != oepos.bh) {
-		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= (adsize * 2);
 			mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	} else {
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= adsize;
 			mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-		  struct extent_position *pos, kernel_lb_addr *eloc,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
 	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
 			(EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	else
 		ret = 0;
 
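
One pattern repeats throughout the inode.c hunks above: select the on-disk allocation-descriptor size from the inode's allocation type, then step epos.offset back (or forward) by exactly one descriptor. A compact, self-contained sketch of that size selection -- descriptor layouts per ECMA 167r3 4/14.14.1-2; the flag values are illustrative stand-ins, not the kernel's definitions:

	#include <stdint.h>

	/* Short allocation descriptor: length + position, 8 bytes. */
	struct short_ad {
		uint32_t extLength;
		uint32_t extPosition;
	} __attribute__ ((packed));

	/* Long allocation descriptor: length + full logical block
	 * address + implementation use, 16 bytes. */
	struct long_ad {
		uint32_t extLength;
		struct {
			uint32_t logicalBlockNum;
			uint16_t partitionReferenceNum;
		} __attribute__ ((packed)) extLocation;
		uint8_t  impUse[6];
	} __attribute__ ((packed));

	enum { AD_SHORT, AD_LONG };	/* stand-ins for ICBTAG_FLAG_AD_* */

	/* Mirrors the adsize selection in udf_add_aext() and friends:
	 * a negative value signals an unsupported allocation type. */
	static int adsize_for(int alloc_type)
	{
		if (alloc_type == AD_SHORT)
			return sizeof(struct short_ad);	/* 8 */
		if (alloc_type == AD_LONG)
			return sizeof(struct long_ad);	/* 16 */
		return -1;
	}
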
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		}
 	}
 	/* rewrite CRC + checksum of eahd */
-	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 	eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 	eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-						sizeof(tag), crclen));
+						sizeof(struct tag), crclen));
 	eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 	iinfo->i_lenEAttr += size;
 	return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
 struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 				    uint32_t location, uint16_t *ident)
 {
-	tag *tag_p;
+	struct tag *tag_p;
 	struct buffer_head *bh = NULL;
 
 	/* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 		return NULL;
 	}
 
-	tag_p = (tag *)(bh->b_data);
+	tag_p = (struct tag *)(bh->b_data);
 
 	*ident = le16_to_cpu(tag_p->tagIdent);
 
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 	}
 
 	/* Verify the descriptor CRC */
-	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
 	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
-					bh->b_data + sizeof(tag),
+					bh->b_data + sizeof(struct tag),
 					le16_to_cpu(tag_p->descCRCLength)))
 		return bh;
 
@@ -255,27 +255,28 @@ error_out:
 	return NULL;
 }
 
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+				     struct kernel_lb_addr *loc,
 				     uint32_t offset, uint16_t *ident)
 {
 	return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
-			       loc.logicalBlockNum + offset, ident);
+			       loc->logicalBlockNum + offset, ident);
 }
 
 void udf_update_tag(char *data, int length)
 {
-	tag *tptr = (tag *)data;
-	length -= sizeof(tag);
+	struct tag *tptr = (struct tag *)data;
+	length -= sizeof(struct tag);
 
 	tptr->descCRCLength = cpu_to_le16(length);
-	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
 	tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
 void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 		 uint32_t loc, int length)
 {
-	tag *tptr = (tag *)data;
+	struct tag *tptr = (struct tag *)data;
 	tptr->tagIdent = cpu_to_le16(ident);
 	tptr->descVersion = cpu_to_le16(version);
 	tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 	udf_update_tag(data, length);
 }
 
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
 {
 	u8 *data = (u8 *)t;
 	u8 checksum = 0;
 	int i;
-	for (i = 0; i < sizeof(tag); ++i)
+	for (i = 0; i < sizeof(struct tag); ++i)
 		if (i != 4) /* position of checksum */
 			checksum += data[i];
 	return checksum;
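
udf_tag_checksum() above shows how a descriptor tag protects itself: the one-byte tagChecksum is the modulo-256 sum of the other 15 tag bytes (byte 4, the checksum itself, is skipped), while descCRC covers the descriptor body that follows the 16-byte tag. A standalone restatement of the checksum rule -- the tag layout is the same sketch given earlier for ecma_167.h, and the loop mirrors the kernel code above:

	#include <stdint.h>

	struct tag {
		uint16_t tagIdent;
		uint16_t descVersion;
		uint8_t  tagChecksum;	/* byte 4: excluded from its own sum */
		uint8_t  reserved;
		uint16_t tagSerialNum;
		uint16_t descCRC;
		uint16_t descCRCLength;
		uint32_t tagLocation;
	} __attribute__ ((packed));

	static uint8_t tag_checksum(const struct tag *t)
	{
		const uint8_t *data = (const uint8_t *)t;
		uint8_t checksum = 0;	/* uint8_t addition wraps mod 256 */
		int i;

		for (i = 0; i < (int)sizeof(struct tag); ++i)
			if (i != 4)	/* position of the checksum */
				checksum += data[i];
		return checksum;
	}
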
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
 		 uint8_t *impuse, uint8_t *fileident)
 {
-	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
 	uint16_t crc;
 	int offset;
 	uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		memset(fibh->ebh->b_data, 0x00, padlen + offset);
 	}
 
-	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
-			sizeof(struct fileIdentDesc) - sizeof(tag));
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+			sizeof(struct fileIdentDesc) - sizeof(struct tag));
 
 	if (fibh->sbh == fibh->ebh) {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
 		crc = crc_itu_t(crc, fibh->ebh->b_data +
					sizeof(struct fileIdentDesc) +
					fibh->soffset,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
 			goto out_err;
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #ifdef UDF_RECOVERY
 	/* temporary shorthand for specifying files by inode number */
 	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-		kernel_lb_addr lb = {
+		struct kernel_lb_addr lb = {
 			.logicalBlockNum = 0,
 			.partitionReferenceNum =
 				simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #endif /* UDF_RECOVERY */
 
 	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+		struct kernel_lb_addr loc;
+
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
 
-		inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+		loc = lelb_to_cpu(cfi.icb.extLocation);
+		inode = udf_iget(dir->i_sb, &loc);
 		if (!inode) {
 			unlock_kernel();
 			return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen = 0;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
 			block = udf_get_lb_pblock(dir->i_sb,
-						  dinfo->i_location, 0);
+						  &dinfo->i_location, 0);
 			fibh->soffset = fibh->eoffset = sb->s_blocksize;
 			goto add;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -409,10 +412,10 @@ add:
 	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
 		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			epos.offset -= sizeof(short_ad);
+			epos.offset -= sizeof(struct short_ad);
 		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			epos.offset -= sizeof(long_ad);
-		udf_write_aext(dir, &epos, eloc, elen, 1);
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
 	}
 	f_pos += nfidlen;
 
@@ -494,10 +497,10 @@ add:
 	memset(cfi, 0, sizeof(struct fileIdentDesc));
 	if (UDF_SB(sb)->s_udfrev >= 0x0200)
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	else
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	cfi->fileVersionNum = cpu_to_le16(1);
 	cfi->lengthFileIdent = namelen;
 	cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-		memset(&(cfi->icb), 0x00, sizeof(long_ad));
+		memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
 
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
 	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
 			    &epos, &eloc, &elen, &offset) ==
 			(EXT_RECORDED_ALLOCATED >> 30)) {
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi, cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_rmdir;
 	retval = -ENOTEMPTY;
 	if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &page_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
-		kernel_lb_addr eloc;
+		struct kernel_lb_addr eloc;
 		uint32_t bsize;
 
 		block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 			iinfo->i_location.partitionReferenceNum;
914 bsize = inode->i_sb->s_blocksize; 917 bsize = inode->i_sb->s_blocksize;
915 iinfo->i_lenExtents = bsize; 918 iinfo->i_lenExtents = bsize;
916 udf_add_aext(inode, &epos, eloc, bsize, 0); 919 udf_add_aext(inode, &epos, &eloc, bsize, 0);
917 brelse(epos.bh); 920 brelse(epos.bh);
918 921
919 block = udf_get_pblock(inode->i_sb, block, 922 block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1108 struct fileIdentDesc ocfi, ncfi; 1111 struct fileIdentDesc ocfi, ncfi;
1109 struct buffer_head *dir_bh = NULL; 1112 struct buffer_head *dir_bh = NULL;
1110 int retval = -ENOENT; 1113 int retval = -ENOENT;
1111 kernel_lb_addr tloc; 1114 struct kernel_lb_addr tloc;
1112 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1115 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1113 1116
1114 lock_kernel(); 1117 lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1119 brelse(ofibh.sbh); 1122 brelse(ofibh.sbh);
1120 } 1123 }
1121 tloc = lelb_to_cpu(ocfi.icb.extLocation); 1124 tloc = lelb_to_cpu(ocfi.icb.extLocation);
1122 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) 1125 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
1123 != old_inode->i_ino) 1126 != old_inode->i_ino)
1124 goto end_rename; 1127 goto end_rename;
1125 1128
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1158 if (!dir_fi) 1161 if (!dir_fi)
1159 goto end_rename; 1162 goto end_rename;
1160 tloc = lelb_to_cpu(dir_fi->icb.extLocation); 1163 tloc = lelb_to_cpu(dir_fi->icb.extLocation);
1161 if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != 1164 if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
1162 old_dir->i_ino) 1165 old_dir->i_ino)
1163 goto end_rename; 1166 goto end_rename;
1164 1167
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1187 */ 1190 */
1188 ncfi.fileVersionNum = ocfi.fileVersionNum; 1191 ncfi.fileVersionNum = ocfi.fileVersionNum;
1189 ncfi.fileCharacteristics = ocfi.fileCharacteristics; 1192 ncfi.fileCharacteristics = ocfi.fileCharacteristics;
1190 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); 1193 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
1191 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); 1194 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
1192 1195
1193 /* The old fid may have moved - find it again */ 1196 /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
1242 1245
1243static struct dentry *udf_get_parent(struct dentry *child) 1246static struct dentry *udf_get_parent(struct dentry *child)
1244{ 1247{
1248 struct kernel_lb_addr tloc;
1245 struct inode *inode = NULL; 1249 struct inode *inode = NULL;
1246 struct qstr dotdot = {.name = "..", .len = 2}; 1250 struct qstr dotdot = {.name = "..", .len = 2};
1247 struct fileIdentDesc cfi; 1251 struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
1255 brelse(fibh.ebh); 1259 brelse(fibh.ebh);
1256 brelse(fibh.sbh); 1260 brelse(fibh.sbh);
1257 1261
1258 inode = udf_iget(child->d_inode->i_sb, 1262 tloc = lelb_to_cpu(cfi.icb.extLocation);
1259 lelb_to_cpu(cfi.icb.extLocation)); 1263 inode = udf_iget(child->d_inode->i_sb, &tloc);
1260 if (!inode) 1264 if (!inode)
1261 goto out_unlock; 1265 goto out_unlock;
1262 unlock_kernel(); 1266 unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
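
A note on the pattern in the hunks above (udf_rmdir, udf_unlink, udf_rename, udf_get_parent and friends): every kernel_lb_addr argument moves from pass-by-value to pass-by-pointer, and the bare typedefs (kernel_lb_addr, long_ad, tag) become explicit struct tags. Below is a minimal userspace sketch of the calling-convention change; the type layout and function names are stand-ins for illustration, not the kernel's real definitions.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the on-disk logical block address; illustrative only. */
struct kernel_lb_addr {
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
};

/* Old style: the whole struct is copied on every call. */
static uint32_t get_pblock_byval(struct kernel_lb_addr loc, uint32_t offset)
{
	return loc.logicalBlockNum + offset;
}

/* New style: callers pass a pointer, so nothing is copied and the
 * signature makes the aggregate nature of the argument explicit. */
static uint32_t get_pblock_byref(const struct kernel_lb_addr *loc,
				 uint32_t offset)
{
	return loc->logicalBlockNum + offset;
}

int main(void)
{
	struct kernel_lb_addr tloc = { .logicalBlockNum = 42,
				       .partitionReferenceNum = 0 };

	printf("%u\n", (unsigned)get_pblock_byval(tloc, 1));  /* old call style */
	printf("%u\n", (unsigned)get_pblock_byref(&tloc, 1)); /* new call style */
	return 0;
}
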
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
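
The osta_udf.h hunks are mechanical: every typedef'd on-disk type (regid, charspec, tag) is now referenced through its struct tag, matching the kernel's general preference for struct tags over typedefs. A hedged sketch of the pattern follows; the descriptor is invented for illustration and its field layout should not be read as the real OSTA definition.

#include <stdint.h>
#include <stdio.h>

/* Invented on-disk identifier, loosely modelled on a regid. */
struct regid {
	uint8_t flags;
	uint8_t ident[23];
	uint8_t identSuffix[8];
} __attribute__ ((packed));

/* Before: "regid partIdent;" leaned on "typedef struct { ... } regid;".
 * After: "struct regid partIdent;" makes it obvious at the use site that
 * this member is an aggregate with an on-disk layout. */
struct example_partition_map {
	uint8_t partitionMapType;
	uint8_t partitionMapLength;
	uint8_t reserved1[2];
	struct regid partIdent;		/* was: regid partIdent; */
	uint16_t volSeqNum;
	uint16_t partitionNum;
} __attribute__ ((packed));

int main(void)
{
	/* 40 bytes when packed: 1 + 1 + 2 + 32 + 2 + 2. */
	printf("%zu\n", sizeof(struct example_partition_map));
	return 0;
}
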
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
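
The super.c diff below retires ->write_super in favour of ->sync_fs plus an s_lvid_dirty flag, so a sync only re-marks the LVID buffer dirty under s_alloc_mutex and leaves the actual write-out to the block device sync. Here is a userspace analogue of that deferred-flush pattern; the names are invented and a pthread mutex stands in for the superblock mutex.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_sb {
	pthread_mutex_t alloc_mutex;	/* stands in for sbi->s_alloc_mutex */
	bool lvid_dirty;	/* set whenever the in-memory LVID changes */
};

/* Analogue of udf_sync_fs(): clear the dirty flag under the lock and
 * let whoever syncs the backing store perform the real write. */
static int fake_sync_fs(struct fake_sb *sb)
{
	pthread_mutex_lock(&sb->alloc_mutex);
	if (sb->lvid_dirty) {
		printf("re-marking LVID buffer dirty for later write-out\n");
		sb->lvid_dirty = false;
	}
	pthread_mutex_unlock(&sb->alloc_mutex);
	return 0;
}

int main(void)
{
	struct fake_sb sb = { PTHREAD_MUTEX_INITIALIZER, true };
	return fake_sync_fs(&sb);
}
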
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this, so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_lastblock to be the last
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_last_block to be the last
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try without converting the number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with the converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sbi->s_uid = uopt.uid;
 	sbi->s_gid = uopt.gid;
 	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
 	sbi->s_nls_map = uopt.nls_map;
 
-	/* Set the block size for all transfers */
-	if (!sb_min_blocksize(sb, uopt.blocksize)) {
-		udf_debug("Bad block size (%d)\n", uopt.blocksize);
-		printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
-		goto error_out;
-	}
-
 	if (uopt.session == 0xFFFFFFFF)
 		sbi->s_session = udf_get_last_session(sb);
 	else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 	udf_debug("Multi-session=%d\n", sbi->s_session);
 
-	sbi->s_last_block = uopt.lastblock;
-	sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
-	sbi->s_anchor[2] = uopt.anchor;
-
-	if (udf_check_valid(sb, uopt.novrs, silent)) {
-		/* read volume recognition sequences */
-		printk(KERN_WARNING "UDF-fs: No VRS found\n");
-		goto error_out;
-	}
-
-	udf_find_anchor(sb);
-
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
 
-	if (udf_load_sequence(sb, &fileset)) {
+	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+	} else {
+		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+			if (!silent)
+				printk(KERN_NOTICE
+				       "UDF-fs: Rescanning with blocksize "
+				       "%d\n", UDF_DEFAULT_BLOCKSIZE);
+			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		}
+	}
+	if (!ret) {
 		printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
 		goto error_out;
 	}
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	}
 
 	if (!silent) {
-		timestamp ts;
+		struct timestamp ts;
 		udf_time_to_disk_stamp(&ts, sbi->s_record_time);
 		udf_info("UDF: Mounting volume '%s', "
 			 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Assign the root inode */
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
-	inode = udf_iget(sb, rootdir);
+	inode = udf_iget(sb, &rootdir);
 	if (!inode) {
 		printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
 		       "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	if (sbi->s_lvid_dirty) {
+		/*
+		 * Blockdevice will be synced later so we don't have to submit
+		 * the buffer for IO
+		 */
+		mark_buffer_dirty(sbi->s_lvid_bh);
+		sb->s_dirt = 0;
+		sbi->s_lvid_dirty = 0;
+	}
+	mutex_unlock(&sbi->s_alloc_mutex);
+
+	return 0;
+}
+
 static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	if (sbi->s_lvid_bh != NULL)
 		lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 					  le32_to_cpu(lvidiu->numDirs)) : 0)
 			+ buf->f_bfree;
 	buf->f_ffree = buf->f_bfree;
-	/* __kernel_fsid_t f_fsid */
 	buf->f_namelen = UDF_NAME_LEN - 2;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 	unsigned int accum = 0;
 	int index;
 	int block = 0, newblock;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 	uint32_t bytes;
 	uint8_t *ptr;
 	uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 
 	loc.logicalBlockNum = bitmap->s_extPosition;
 	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
-	bh = udf_read_ptagged(sb, loc, 0, &ident);
+	bh = udf_read_ptagged(sb, &loc, 0, &ident);
 
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 		bytes -= cur_bytes;
 		if (bytes) {
 			brelse(bh);
-			newblock = udf_get_lb_pblock(sb, loc, ++block);
+			newblock = udf_get_lb_pblock(sb, &loc, ++block);
 			bh = udf_tread(sb, newblock);
 			if (!bh) {
 				udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
 {
 	unsigned int accum = 0;
 	uint32_t elen;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	int8_t etype;
 	struct extent_position epos;
 
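The udf_fill_super() rework above replaces the fixed-blocksize probe with udf_load_vrs(): the user-requested blocksize is used when UDF_FLAG_BLOCKSIZE_SET is given, otherwise the device hard-sector size is tried with a fallback rescan at UDF_DEFAULT_BLOCKSIZE. The new udf_sync_fs() pairs with the s_lvid_dirty field added to udf_sb.h further down: updaters merely flag the cached LVID buffer, and sync writes it out once. A minimal sketch of that deferred-write pattern, with hypothetical names (my_sb_info, my_update, my_sync_fs), not the patch's own:

	/* Sketch only; assumes kernel context (linux/fs.h, linux/buffer_head.h). */
	struct my_sb_info {
		struct mutex		lock;	/* protects dirty and bh */
		struct buffer_head	*bh;	/* cached on-disk record */
		unsigned int		dirty;	/* record modified in memory */
	};

	static void my_update(struct super_block *sb, struct my_sb_info *si)
	{
		/* modify si->bh->b_data under si->lock, then: */
		sb->s_dirt = 1;		/* have the VFS call sync_fs later */
		si->dirty = 1;		/* remember which buffer needs writing */
	}

	static int my_sync_fs(struct super_block *sb, int wait)
	{
		struct my_sb_info *si = sb->s_fs_info;

		mutex_lock(&si->lock);
		if (si->dirty) {
			/* the blockdev sync that follows will submit the I/O */
			mark_buffer_dirty(si->bh);
			sb->s_dirt = 0;
			si->dirty = 0;
		}
		mutex_unlock(&si->lock);
		return 0;
	}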
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
 #include "udf_sb.h"
 
 static void extent_trunc(struct inode *inode, struct extent_position *epos,
-			 kernel_lb_addr eloc, int8_t etype, uint32_t elen,
+			 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
 			 uint32_t nelen)
 {
-	kernel_lb_addr neloc = {};
+	struct kernel_lb_addr neloc = {};
 	int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 				       last_block);
 			etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
 		} else
-			neloc = eloc;
+			neloc = *eloc;
 		nelen = (etype << 30) | nelen;
 	}
 
 	if (elen != nelen) {
-		udf_write_aext(inode, epos, neloc, nelen, 0);
+		udf_write_aext(inode, epos, &neloc, nelen, 0);
 		if (last_block - first_block > 0) {
 			if (etype == (EXT_RECORDED_ALLOCATED >> 30))
 				mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 void udf_truncate_tail_extent(struct inode *inode)
 {
 	struct extent_position epos = {};
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen, nelen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 				       (unsigned)elen);
 			nelen = elen - (lbcount - inode->i_size);
 			epos.offset -= adsize;
-			extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+			extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
 			epos.offset += adsize;
 			if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
 				printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 void udf_discard_prealloc(struct inode *inode)
 {
 	struct extent_position epos = { NULL, 0, {0, 0} };
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
 	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 		epos.offset -= adsize;
 		lbcount -= elen;
-		extent_trunc(inode, &epos, eloc, etype, elen, 0);
+		extent_trunc(inode, &epos, &eloc, etype, elen, 0);
 		if (!epos.bh) {
 			iinfo->i_lenAlloc =
 				epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
 void udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
-	kernel_lb_addr eloc, neloc = {};
+	struct kernel_lb_addr eloc, neloc = {};
 	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
 	int8_t etype;
 	struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
 			(inode->i_size & (sb->s_blocksize - 1));
 	if (etype != -1) {
 		epos.offset -= adsize;
-		extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+		extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
 		epos.offset += adsize;
 		if (byte_offset)
 			lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
 	while ((etype = udf_current_aext(inode, &epos, &eloc,
 					 &elen, 0)) != -1) {
 		if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
-			udf_write_aext(inode, &epos, neloc, nelen, 0);
+			udf_write_aext(inode, &epos, &neloc, nelen, 0);
 			if (indirect_ext_len) {
 				/* We managed to free all extents in the
 				 * indirect extent - free it too */
 				BUG_ON(!epos.bh);
-				udf_free_blocks(sb, inode, epos.block,
+				udf_free_blocks(sb, inode, &epos.block,
 						0, indirect_ext_len);
 			} else if (!epos.bh) {
 				iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
 			epos.offset = sizeof(struct allocExtDesc);
 			epos.block = eloc;
 			epos.bh = udf_tread(sb,
-					udf_get_lb_pblock(sb, eloc, 0));
+					udf_get_lb_pblock(sb, &eloc, 0));
 			if (elen)
 				indirect_ext_len =
 					(elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
 			else
 				indirect_ext_len = 1;
 		} else {
-			extent_trunc(inode, &epos, eloc, etype,
+			extent_trunc(inode, &epos, &eloc, etype,
 				     elen, 0);
 			epos.offset += adsize;
 		}
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
 
 	if (indirect_ext_len) {
 		BUG_ON(!epos.bh);
-		udf_free_blocks(sb, inode, epos.block, 0,
+		udf_free_blocks(sb, inode, &epos.block, 0,
 				indirect_ext_len);
 	} else if (!epos.bh) {
 		iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
 		udf_update_alloc_ext_desc(inode, &epos, lenalloc);
 	} else if (inode->i_size) {
 		if (byte_offset) {
-			kernel_long_ad extent;
+			struct kernel_long_ad extent;
 
 			/*
 			 * OK, there is not extent covering inode->i_size and
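Every truncate.c hunk is the same mechanical conversion: the kernel_lb_addr typedef becomes struct kernel_lb_addr, and extent addresses are passed by pointer instead of by value, so call sites take an address and value uses dereference. In miniature (the "lb" type and trunc() are illustrative stand-ins, not the real definitions):

	struct lb { uint32_t block; uint16_t part; };	/* stand-in type */

	/* old: void trunc(struct lb eloc);   copies the struct on every call
	 * new: void trunc(struct lb *eloc);  passes only a pointer          */
	static void trunc(const struct lb *eloc)
	{
		struct lb neloc = {};

		neloc = *eloc;	/* value use now dereferences */
		(void)neloc;	/* sketch only; real code writes it out */
	}

	/* caller side changes from trunc(eloc) to trunc(&eloc) */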
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
 struct udf_inode_info {
 	struct timespec		i_crtime;
 	/* Physical address of inode */
-	kernel_lb_addr		i_location;
+	struct kernel_lb_addr	i_location;
 	__u64			i_unique;
 	__u32			i_lenEAttr;
 	__u32			i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
 	unsigned		i_strat4096 : 1;
 	unsigned		reserved : 26;
 	union {
-		short_ad	*i_sad;
-		long_ad		*i_lad;
+		struct short_ad	*i_sad;
+		struct long_ad	*i_lad;
 		__u8		*i_data;
 	} i_ext;
 	struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
 #define UDF_FLAG_GID_SET	16
 #define UDF_FLAG_SESSION_SET	17
 #define UDF_FLAG_LASTBLOCK_SET	18
+#define UDF_FLAG_BLOCKSIZE_SET	19
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
@@ -48,6 +49,8 @@
 #define UDF_SPARABLE_MAP15		0x1522U
 #define UDF_METADATA_MAP25		0x2511U
 
+#define UDF_INVALID_MODE		((mode_t)-1)
+
 #pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
 
 struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
 
 	/* Sector headers */
 	__s32			s_session;
-	__u32			s_anchor[3];
+	__u32			s_anchor;
 	__u32			s_last_block;
 
 	struct buffer_head	*s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
 	mode_t			s_umask;
 	gid_t			s_gid;
 	uid_t			s_uid;
+	mode_t			s_fmode;
+	mode_t			s_dmode;
 
 	/* Root Info */
 	struct timespec		s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
 	struct inode		*s_vat_inode;
 
 	struct mutex		s_alloc_mutex;
+	/* Protected by s_alloc_mutex */
+	unsigned int		s_lvid_dirty;
 };
 
 static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
 	return 0;
 }
 
-#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
-
 /* computes tag checksum */
-u8 udf_tag_checksum(const tag *t);
+u8 udf_tag_checksum(const struct tag *t);
 
 struct dentry;
 struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
 };
 
 struct generic_desc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		volDescSeqNum;
 };
 
@@ -108,11 +106,22 @@ struct ustr {
 struct extent_position {
 	struct buffer_head *bh;
 	uint32_t offset;
-	kernel_lb_addr block;
+	struct kernel_lb_addr block;
 };
 
 /* super.c */
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+	struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+	BUG_ON(!bh);
+	WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+		     bh->b_data)->integrityType !=
+		     cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+	sb->s_dirt = 1;
+	UDF_SB(sb)->s_lvid_dirty = 1;
+}
 
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
 		     unsigned long);
 
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, kernel_lb_addr);
+extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
 extern int udf_write_inode(struct inode *, int);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
-			   kernel_long_ad *, sector_t);
+			   struct kernel_long_ad *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
-			 kernel_lb_addr *, uint32_t *, sector_t *);
+			 struct kernel_lb_addr *, uint32_t *, sector_t *);
 extern int8_t udf_add_aext(struct inode *, struct extent_position *,
-			   kernel_lb_addr, uint32_t, int);
+			   struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_write_aext(struct inode *, struct extent_position *,
-			     kernel_lb_addr, uint32_t, int);
+			     struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_delete_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 extern int8_t udf_next_aext(struct inode *, struct extent_position *,
-			    kernel_lb_addr *, uint32_t *, int);
+			    struct kernel_lb_addr *, uint32_t *, int);
 extern int8_t udf_current_aext(struct inode *, struct extent_position *,
-			       kernel_lb_addr *, uint32_t *, int);
+			       struct kernel_lb_addr *, uint32_t *, int);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
 extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
 					   uint32_t, uint16_t *);
 extern struct buffer_head *udf_read_ptagged(struct super_block *,
-					    kernel_lb_addr, uint32_t,
+					    struct kernel_lb_addr *, uint32_t,
 					    uint16_t *);
 extern void udf_update_tag(char *, int);
 extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
 				      uint32_t);
 extern int udf_relocate_blocks(struct super_block *, long, long *);
 
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+		  uint32_t offset)
+{
+	return udf_get_pblock(sb, loc->logicalBlockNum,
+			      loc->partitionReferenceNum, offset);
+}
+
 /* unicode.c */
 extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
 
 /* balloc.c */
 extern void udf_free_blocks(struct super_block *, struct inode *,
-			    kernel_lb_addr, uint32_t, uint32_t);
+			    struct kernel_lb_addr *, uint32_t, uint32_t);
 extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 			       uint32_t, uint32_t);
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
 						struct udf_fileident_bh *,
 						struct fileIdentDesc *,
 						struct extent_position *,
-						kernel_lb_addr *, uint32_t *,
+						struct kernel_lb_addr *, uint32_t *,
 						sector_t *);
 extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
 					       int *offset);
-extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
-extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
 /* udftime.c */
 extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
-					       timestamp src);
-extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
+					       struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
 
 #endif				/* __UDF_DECL_H */
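udf_get_lb_pblock() moves from a #define to a static inline at the same time it switches to a pointer argument. Besides prototype checking, the inline evaluates its argument exactly once; the old macro expanded loc twice, so an argument with side effects would have misbehaved. A hedged illustration of the hazard (locp is a hypothetical caller variable):

	/* old macro: 'loc' is textually pasted twice, so */
	udf_get_lb_pblock(sb, *locp++, 0);
	/* expanded to udf_get_pblock(sb, (*locp++).logicalBlockNum,
	 *                   (*locp++).partitionReferenceNum, 0);
	 * i.e. two increments and mismatched fields */

	/* new inline: 'loc' evaluated once, types checked */
	uint32_t pblk = udf_get_lb_pblock(sb, &loc, 0);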
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
 #include <asm/byteorder.h>
 #include <linux/string.h>
 
-static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
 {
-	kernel_lb_addr out;
+	struct kernel_lb_addr out;
 
 	out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
 	out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
 	return out;
 }
 
-static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
 {
-	lb_addr out;
+	struct lb_addr out;
 
 	out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
 	out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
 	return out;
 }
 
-static inline short_ad lesa_to_cpu(short_ad in)
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
 	return out;
 }
 
-static inline short_ad cpu_to_lesa(short_ad in)
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
 	return out;
 }
 
-static inline kernel_long_ad lela_to_cpu(long_ad in)
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
 {
-	kernel_long_ad out;
+	struct kernel_long_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
 	return out;
 }
 
-static inline long_ad cpu_to_lela(kernel_long_ad in)
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
 {
-	long_ad out;
+	struct long_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
 	return out;
 }
 
-static inline kernel_extent_ad leea_to_cpu(extent_ad in)
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
 {
-	kernel_extent_ad out;
+	struct kernel_extent_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = le32_to_cpu(in.extLocation);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
-struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
 {
 	int yday;
 	u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
 	return dest;
 }
 
-timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
 {
 	long int days, rem, y;
 	const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 {
 	const uint8_t *ocu;
 	uint8_t cmp_id, ocu_len;
-	int i;
+	int i, len;
 
 
 	ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 		if (cmp_id == 16)
 			c = (c << 8) | ocu[i++];
 
-		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
-					      UDF_NAME_LEN - utf_o->u_len);
+		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+				    UDF_NAME_LEN - utf_o->u_len);
+		/* Valid character? */
+		if (len >= 0)
+			utf_o->u_len += len;
+		else
+			utf_o->u_name[utf_o->u_len++] = '?';
 	}
 	utf_o->u_cmpID = 8;
 
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
 			int length)
 {
-	unsigned len, i, max_val;
+	int len;
+	unsigned i, max_val;
 	uint16_t uni_char;
 	int u_len;
 
@@ -302,8 +308,13 @@ try_again:
 	u_len = 0U;
 	for (i = 0U; i < uni->u_len; i++) {
 		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
-		if (len <= 0)
+		if (!len)
 			continue;
+		/* Invalid character, deal with it */
+		if (len < 0) {
+			len = 1;
+			uni_char = '?';
+		}
 
 		if (uni_char > max_val) {
 			max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 		     int flen)
 {
-	struct ustr filename, unifilename;
-	int len;
+	struct ustr *filename, *unifilename;
+	int len = 0;
 
-	if (udf_build_ustr_exact(&unifilename, sname, flen))
+	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!filename)
 		return 0;
 
+	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!unifilename)
+		goto out1;
+
+	if (udf_build_ustr_exact(unifilename, sname, flen))
+		goto out2;
+
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		if (!udf_CS0toUTF8(&filename, &unifilename)) {
+		if (!udf_CS0toUTF8(filename, unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
-				  &unifilename)) {
+		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+				  unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else
-		return 0;
+		goto out2;
 
-	len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
-				     unifilename.u_name, unifilename.u_len);
-	if (len)
-		return len;
-
-	return 0;
+	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+				     unifilename->u_name, unifilename->u_len);
+out2:
+	kfree(unifilename);
+out1:
+	kfree(filename);
+	return len;
 }
 
 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
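udf_get_filename() previously kept two struct ustr buffers on the kernel stack; they are now kmalloc'd (with GFP_NOFS, since this runs on filesystem paths) and released through the usual goto ladder, where each label frees only what was allocated before the jump. The idiom reduced to its shape, with illustrative names (BUF_SZ, fill, consume are not from the patch):

	int use_two_buffers(void)
	{
		char *a, *b;
		int len = 0;	/* 0 doubles as the failure result here */

		a = kmalloc(BUF_SZ, GFP_NOFS);
		if (!a)
			return 0;
		b = kmalloc(BUF_SZ, GFP_NOFS);
		if (!b)
			goto out1;	/* only 'a' exists so far */

		if (fill(a, b) < 0)
			goto out2;	/* both buffers need freeing */
		len = consume(a, b);
	out2:
		kfree(b);	/* success falls through the same exits */
	out1:
		kfree(a);
		return len;
	}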
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index dbbbc4668769..6321b797061b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= ufs_sync_file,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 625ef17c6f83..2bd3a1615714 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -30,7 +30,7 @@
 #include "ufs.h"
 
 
-static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	int err;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee239..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ufs_super_block_first *usb1;
 	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_kernel();
 
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
 	buf->f_files = uspi->s_ncg * uspi->s_ipg;
 	buf->f_namelen = UFS_MAXNAMLEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	unlock_kernel();
 
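Both the UDF and UFS statfs hunks fill the previously-zero f_fsid the same way: huge_encode_dev() packs the backing device's dev_t into 64 bits, which are then split across the two 32-bit words of __kernel_fsid_t. The low/high split is trivially reversible, as this recap sketch shows:

	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

	buf->f_fsid.val[0] = (u32)id;		/* low 32 bits */
	buf->f_fsid.val[1] = (u32)(id >> 32);	/* high 32 bits */

	/* reassembling, e.g. in a test harness: */
	u64 again = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];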
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 69b3427d7885..d0c4acd4f1f3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -98,8 +98,8 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 /* file.c */
 extern const struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
-
 extern const struct address_space_operations ufs_aops;
+extern int ufs_sync_file(struct file *, struct dentry *, int);
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fcac032..d51b8f9db921 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			return -E2BIG;
-		kvalue = kmalloc(size, GFP_KERNEL);
-		if (!kvalue)
-			return -ENOMEM;
-		if (copy_from_user(kvalue, value, size)) {
-			kfree(kvalue);
-			return -EFAULT;
-		}
+		kvalue = memdup_user(value, size);
+		if (IS_ERR(kvalue))
+			return PTR_ERR(kvalue);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
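memdup_user() folds the kmalloc + copy_from_user + unwind triple into a single call; on failure the caller gets an ERR_PTR instead of NULL, hence the IS_ERR()/PTR_ERR() checks here and in the three XFS hunks below. Roughly what the helper does internally (a sketch; see mm/util.c for the real implementation):

	void *my_memdup_user(const void __user *src, size_t len)
	{
		void *p = kmalloc(len, GFP_KERNEL);

		if (!p)
			return ERR_PTR(-ENOMEM);
		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}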
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0c..f4e255441574 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
 STATIC int
 xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
-	struct page		*page)
+	struct vm_fault		*vmf)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..34eaab608e6e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
 	if (len > XATTR_SIZE_MAX)
 		return EINVAL;
 
-	kbuf = kmalloc(len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	if (copy_from_user(kbuf, ubuf, len))
-		goto out_kfree;
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 
- out_kfree:
-	kfree(kbuf);
 	return error;
 }
 
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(am_hreq.ops, size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, am_hreq.ops, size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3db790..0882d166239a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(compat_ptr(am_hreq.ops), size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 631d0137551e..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -230,7 +230,7 @@ xfs_vn_mknod(
 	}
 
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 
 	xfs_dentry_to_name(&name, dentry);
 	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -404,7 +404,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
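The two umask hunks swap the open-coded current->fs->umask for the current_umask() accessor introduced in this release; behavior is unchanged, the indirection just keeps umask representation details out of individual filesystems. Roughly what the helper amounts to at this point in the tree (a sketch; the real helper lives in core VFS code):

	/* approximately equivalent; my_current_umask is an illustrative name */
	static inline int my_current_umask(void)
	{
		return current->fs->umask;
	}

	/* caller side, as in the hunks above:
	 *	mode &= ~my_current_umask();
	 */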