author:    Ingo Molnar <mingo@elte.hu>  2009-04-08 11:02:50 -0400
committer: Ingo Molnar <mingo@elte.hu>  2009-04-08 11:02:57 -0400
commit:    ff96e612cba32510e263e17b213235fe5746397e
tree:      a8df57d76b10e0901a4fb76cd2987eb9826a560a /fs
parent:    cd84a42f315e50edd454c27a3da3951ccd3d735a
parent:    577c9c456f0e1371cbade38eaf91ae8e8a308555
Merge commit 'v2.6.30-rc1' into core/urgent
Merge reason: need latest upstream to queue up dependent fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 34
-rw-r--r--  fs/Makefile | 6
-rw-r--r--  fs/adfs/super.c | 16
-rw-r--r--  fs/affs/super.c | 4
-rw-r--r--  fs/afs/Kconfig | 8
-rw-r--r--  fs/afs/Makefile | 3
-rw-r--r--  fs/afs/cache.c | 503
-rw-r--r--  fs/afs/cache.h | 15
-rw-r--r--  fs/afs/cell.c | 16
-rw-r--r--  fs/afs/file.c | 220
-rw-r--r--  fs/afs/inode.c | 31
-rw-r--r--  fs/afs/internal.h | 53
-rw-r--r--  fs/afs/main.c | 27
-rw-r--r--  fs/afs/mntpt.c | 4
-rw-r--r--  fs/afs/vlocation.c | 25
-rw-r--r--  fs/afs/volume.c | 14
-rw-r--r--  fs/afs/write.c | 21
-rw-r--r--  fs/befs/debug.c | 1
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/binfmt_elf.c | 22
-rw-r--r--  fs/binfmt_elf_fdpic.c | 25
-rw-r--r--  fs/binfmt_som.c | 7
-rw-r--r--  fs/bio.c | 3
-rw-r--r--  fs/block_dev.c | 1
-rw-r--r--  fs/btrfs/acl.c | 2
-rw-r--r--  fs/btrfs/async-thread.c | 7
-rw-r--r--  fs/btrfs/ctree.c | 312
-rw-r--r--  fs/btrfs/ctree.h | 84
-rw-r--r--  fs/btrfs/delayed-ref.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 8
-rw-r--r--  fs/btrfs/extent-tree.c | 398
-rw-r--r--  fs/btrfs/extent_io.c | 16
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/free-space-cache.c | 530
-rw-r--r--  fs/btrfs/free-space-cache.h | 44
-rw-r--r--  fs/btrfs/inode.c | 5
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/btrfs/locking.c | 4
-rw-r--r--  fs/btrfs/super.c | 54
-rw-r--r--  fs/btrfs/transaction.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 12
-rw-r--r--  fs/btrfs/volumes.c | 41
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/buffer.c | 34
-rw-r--r--  fs/cachefiles/Kconfig | 39
-rw-r--r--  fs/cachefiles/Makefile | 18
-rw-r--r--  fs/cachefiles/bind.c | 286
-rw-r--r--  fs/cachefiles/daemon.c | 755
-rw-r--r--  fs/cachefiles/interface.c | 449
-rw-r--r--  fs/cachefiles/internal.h | 360
-rw-r--r--  fs/cachefiles/key.c | 159
-rw-r--r--  fs/cachefiles/main.c | 106
-rw-r--r--  fs/cachefiles/namei.c | 771
-rw-r--r--  fs/cachefiles/proc.c | 134
-rw-r--r--  fs/cachefiles/rdwr.c | 879
-rw-r--r--  fs/cachefiles/security.c | 116
-rw-r--r--  fs/cachefiles/xattr.c | 291
-rw-r--r--  fs/cifs/dir.c | 4
-rw-r--r--  fs/cifs/inode.c | 4
-rw-r--r--  fs/compat.c | 105
-rw-r--r--  fs/compat_ioctl.c | 2
-rw-r--r--  fs/cramfs/inode.c | 39
-rw-r--r--  fs/cramfs/uncompress.c | 2
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/debugfs/inode.c | 16
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/efs/super.c | 20
-rw-r--r--  fs/exec.c | 35
-rw-r--r--  fs/exofs/BUGS | 3
-rw-r--r--  fs/exofs/Kbuild | 16
-rw-r--r--  fs/exofs/Kconfig | 13
-rw-r--r--  fs/exofs/common.h | 184
-rw-r--r--  fs/exofs/dir.c | 672
-rw-r--r--  fs/exofs/exofs.h | 180
-rw-r--r--  fs/exofs/file.c | 87
-rw-r--r--  fs/exofs/inode.c | 1303
-rw-r--r--  fs/exofs/namei.c | 342
-rw-r--r--  fs/exofs/osd.c | 153
-rw-r--r--  fs/exofs/super.c | 584
-rw-r--r--  fs/exofs/symlink.c | 57
-rw-r--r--  fs/ext2/acl.c | 2
-rw-r--r--  fs/ext3/Kconfig | 19
-rw-r--r--  fs/ext3/acl.c | 2
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/file.c | 6
-rw-r--r--  fs/ext3/inode.c | 142
-rw-r--r--  fs/ext3/ioctl.c | 59
-rw-r--r--  fs/ext3/namei.c | 35
-rw-r--r--  fs/ext3/super.c | 8
-rw-r--r--  fs/ext4/Kconfig | 2
-rw-r--r--  fs/ext4/acl.c | 2
-rw-r--r--  fs/fat/inode.c | 8
-rw-r--r--  fs/file_table.c | 1
-rw-r--r--  fs/fs-writeback.c | 31
-rw-r--r--  fs/fs_struct.c | 177
-rw-r--r--  fs/fscache/Kconfig | 56
-rw-r--r--  fs/fscache/Makefile | 19
-rw-r--r--  fs/fscache/cache.c | 415
-rw-r--r--  fs/fscache/cookie.c | 500
-rw-r--r--  fs/fscache/fsdef.c | 144
-rw-r--r--  fs/fscache/histogram.c | 109
-rw-r--r--  fs/fscache/internal.h | 380
-rw-r--r--  fs/fscache/main.c | 124
-rw-r--r--  fs/fscache/netfs.c | 103
-rw-r--r--  fs/fscache/object.c | 810
-rw-r--r--  fs/fscache/operation.c | 459
-rw-r--r--  fs/fscache/page.c | 816
-rw-r--r--  fs/fscache/proc.c | 68
-rw-r--r--  fs/fscache/stats.c | 212
-rw-r--r--  fs/fuse/dir.c | 1
-rw-r--r--  fs/fuse/file.c | 54
-rw-r--r--  fs/generic_acl.c | 2
-rw-r--r--  fs/gfs2/acl.c | 2
-rw-r--r--  fs/hfs/super.c | 3
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hpfs/super.c | 5
-rw-r--r--  fs/hppfs/hppfs.c | 7
-rw-r--r--  fs/internal.h | 8
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/jbd/commit.c | 28
-rw-r--r--  fs/jbd/journal.c | 34
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/commit.c | 13
-rw-r--r--  fs/jffs2/acl.c | 6
-rw-r--r--  fs/jffs2/malloc.c | 6
-rw-r--r--  fs/jfs/acl.c | 2
-rw-r--r--  fs/libfs.c | 16
-rw-r--r--  fs/lockd/svclock.c | 13
-rw-r--r--  fs/minix/inode.c | 11
-rw-r--r--  fs/mpage.c | 13
-rw-r--r--  fs/namei.c | 14
-rw-r--r--  fs/namespace.c | 61
-rw-r--r--  fs/nfs/Kconfig | 8
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/client.c | 14
-rw-r--r--  fs/nfs/file.c | 40
-rw-r--r--  fs/nfs/fscache-index.c | 337
-rw-r--r--  fs/nfs/fscache.c | 523
-rw-r--r--  fs/nfs/fscache.h | 220
-rw-r--r--  fs/nfs/inode.c | 14
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 18
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/nfs/nfs4proc.c | 2
-rw-r--r--  fs/nfs/read.c | 27
-rw-r--r--  fs/nfs/super.c | 47
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 10
-rw-r--r--  fs/nfsd/nfs4callback.c | 47
-rw-r--r--  fs/nfsd/nfs4proc.c | 246
-rw-r--r--  fs/nfsd/nfs4recover.c | 74
-rw-r--r--  fs/nfsd/nfs4state.c | 1196
-rw-r--r--  fs/nfsd/nfs4xdr.c | 633
-rw-r--r--  fs/nfsd/nfsctl.c | 38
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 95
-rw-r--r--  fs/nfsd/vfs.c | 37
-rw-r--r--  fs/nilfs2/Makefile | 5
-rw-r--r--  fs/nilfs2/alloc.c | 504
-rw-r--r--  fs/nilfs2/alloc.h | 72
-rw-r--r--  fs/nilfs2/bmap.c | 783
-rw-r--r--  fs/nilfs2/bmap.h | 244
-rw-r--r--  fs/nilfs2/bmap_union.h | 42
-rw-r--r--  fs/nilfs2/btnode.c | 316
-rw-r--r--  fs/nilfs2/btnode.h | 58
-rw-r--r--  fs/nilfs2/btree.c | 2269
-rw-r--r--  fs/nilfs2/btree.h | 117
-rw-r--r--  fs/nilfs2/cpfile.c | 925
-rw-r--r--  fs/nilfs2/cpfile.h | 45
-rw-r--r--  fs/nilfs2/dat.c | 430
-rw-r--r--  fs/nilfs2/dat.h | 52
-rw-r--r--  fs/nilfs2/dir.c | 711
-rw-r--r--  fs/nilfs2/direct.c | 436
-rw-r--r--  fs/nilfs2/direct.h | 78
-rw-r--r--  fs/nilfs2/file.c | 160
-rw-r--r--  fs/nilfs2/gcdat.c | 84
-rw-r--r--  fs/nilfs2/gcinode.c | 288
-rw-r--r--  fs/nilfs2/ifile.c | 150
-rw-r--r--  fs/nilfs2/ifile.h | 53
-rw-r--r--  fs/nilfs2/inode.c | 785
-rw-r--r--  fs/nilfs2/ioctl.c | 654
-rw-r--r--  fs/nilfs2/mdt.c | 563
-rw-r--r--  fs/nilfs2/mdt.h | 125
-rw-r--r--  fs/nilfs2/namei.c | 474
-rw-r--r--  fs/nilfs2/nilfs.h | 318
-rw-r--r--  fs/nilfs2/page.c | 540
-rw-r--r--  fs/nilfs2/page.h | 76
-rw-r--r--  fs/nilfs2/recovery.c | 929
-rw-r--r--  fs/nilfs2/sb.h | 102
-rw-r--r--  fs/nilfs2/segbuf.c | 439
-rw-r--r--  fs/nilfs2/segbuf.h | 201
-rw-r--r--  fs/nilfs2/seglist.h | 85
-rw-r--r--  fs/nilfs2/segment.c | 2977
-rw-r--r--  fs/nilfs2/segment.h | 243
-rw-r--r--  fs/nilfs2/sufile.c | 640
-rw-r--r--  fs/nilfs2/sufile.h | 54
-rw-r--r--  fs/nilfs2/super.c | 1323
-rw-r--r--  fs/nilfs2/the_nilfs.c | 637
-rw-r--r--  fs/nilfs2/the_nilfs.h | 298
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 57
-rw-r--r--  fs/ocfs2/alloc.h | 3
-rw-r--r--  fs/ocfs2/aops.c | 23
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 3
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 9
-rw-r--r--  fs/ocfs2/dir.c | 2806
-rw-r--r--  fs/ocfs2/dir.h | 57
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 58
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 87
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 29
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 387
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 20
-rw-r--r--  fs/ocfs2/dlmglue.c | 46
-rw-r--r--  fs/ocfs2/dlmglue.h | 2
-rw-r--r--  fs/ocfs2/export.c | 84
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/ocfs2/inode.c | 48
-rw-r--r--  fs/ocfs2/inode.h | 5
-rw-r--r--  fs/ocfs2/journal.c | 173
-rw-r--r--  fs/ocfs2/journal.h | 77
-rw-r--r--  fs/ocfs2/localalloc.c | 86
-rw-r--r--  fs/ocfs2/namei.c | 250
-rw-r--r--  fs/ocfs2/ocfs2.h | 76
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 136
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 254
-rw-r--r--  fs/ocfs2/suballoc.h | 4
-rw-r--r--  fs/ocfs2/super.c | 188
-rw-r--r--  fs/ocfs2/xattr.c | 8
-rw-r--r--  fs/ocfs2/xattr.h | 2
-rw-r--r--  fs/omfs/inode.c | 7
-rw-r--r--  fs/open.c | 1
-rw-r--r--  fs/partitions/check.c | 4
-rw-r--r--  fs/proc/base.c | 1
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/proc/task_nommu.c | 9
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/ramfs/inode.c | 19
-rw-r--r--  fs/read_write.c | 56
-rw-r--r--  fs/reiserfs/Kconfig | 1
-rw-r--r--  fs/reiserfs/super.c | 5
-rw-r--r--  fs/reiserfs/xattr_acl.c | 2
-rw-r--r--  fs/romfs/Kconfig | 48
-rw-r--r--  fs/romfs/Makefile | 9
-rw-r--r--  fs/romfs/inode.c | 665
-rw-r--r--  fs/romfs/internal.h | 47
-rw-r--r--  fs/romfs/mmap-nommu.c | 75
-rw-r--r--  fs/romfs/storage.c | 261
-rw-r--r--  fs/romfs/super.c | 653
-rw-r--r--  fs/splice.c | 28
-rw-r--r--  fs/squashfs/export.c | 1
-rw-r--r--  fs/squashfs/super.c | 3
-rw-r--r--  fs/super.c | 41
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/ubifs/Kconfig | 4
-rw-r--r--  fs/ubifs/budget.c | 37
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/file.c | 16
-rw-r--r--  fs/ubifs/find.c | 12
-rw-r--r--  fs/ubifs/gc.c | 428
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/key.h | 6
-rw-r--r--  fs/ubifs/log.c | 5
-rw-r--r--  fs/ubifs/lpt_commit.c | 34
-rw-r--r--  fs/ubifs/recovery.c | 70
-rw-r--r--  fs/ubifs/replay.c | 2
-rw-r--r--  fs/ubifs/sb.c | 36
-rw-r--r--  fs/ubifs/shrinker.c | 6
-rw-r--r--  fs/ubifs/super.c | 37
-rw-r--r--  fs/ubifs/tnc.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 30
-rw-r--r--  fs/ubifs/ubifs.h | 13
-rw-r--r--  fs/udf/balloc.c | 150
-rw-r--r--  fs/udf/dir.c | 14
-rw-r--r--  fs/udf/directory.c | 38
-rw-r--r--  fs/udf/ecma_167.h | 416
-rw-r--r--  fs/udf/ialloc.c | 9
-rw-r--r--  fs/udf/inode.c | 213
-rw-r--r--  fs/udf/misc.c | 29
-rw-r--r--  fs/udf/namei.c | 86
-rw-r--r--  fs/udf/osta_udf.h | 22
-rw-r--r--  fs/udf/partition.c | 2
-rw-r--r--  fs/udf/super.c | 605
-rw-r--r--  fs/udf/truncate.c | 44
-rw-r--r--  fs/udf/udf_i.h | 6
-rw-r--r--  fs/udf/udf_sb.h | 9
-rw-r--r--  fs/udf/udfdecl.h | 57
-rw-r--r--  fs/udf/udfend.h | 28
-rw-r--r--  fs/udf/udftime.c | 6
-rw-r--r--  fs/udf/unicode.c | 62
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/mutex.h | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 107
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 37
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 13
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 157
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 137
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 32
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 28
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 18
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 212
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 26
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 1
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 190
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 40
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 16
-rw-r--r--  fs/xfs/support/debug.c | 1
-rw-r--r--  fs/xfs/support/uuid.c | 71
-rw-r--r--  fs/xfs/support/uuid.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 4
-rw-r--r--  fs/xfs/xfs_alloc.c | 26
-rw-r--r--  fs/xfs/xfs_alloc.h | 6
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 58
-rw-r--r--  fs/xfs/xfs_bmap.c | 76
-rw-r--r--  fs/xfs/xfs_bmap.h | 6
-rw-r--r--  fs/xfs/xfs_btree.c | 4
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_da_btree.h | 9
-rw-r--r--  fs/xfs/xfs_dfrag.c | 68
-rw-r--r--  fs/xfs/xfs_dinode.h | 4
-rw-r--r--  fs/xfs/xfs_dir2.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 7
-rw-r--r--  fs/xfs/xfs_dir2_data.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 13
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 6
-rw-r--r--  fs/xfs/xfs_filestream.c | 9
-rw-r--r--  fs/xfs/xfs_fsops.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 12
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h | 22
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 9
-rw-r--r--  fs/xfs/xfs_log.c | 67
-rw-r--r--  fs/xfs/xfs_log.h | 3
-rw-r--r--  fs/xfs/xfs_log_priv.h | 3
-rw-r--r--  fs/xfs/xfs_log_recover.c | 308
-rw-r--r--  fs/xfs/xfs_mount.c | 253
-rw-r--r--  fs/xfs/xfs_mount.h | 19
-rw-r--r--  fs/xfs/xfs_qmops.c | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 10
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 8
-rw-r--r--  fs/xfs/xfs_trans.h | 24
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 4
-rw-r--r--  fs/xfs/xfs_trans_item.c | 2
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_types.h | 8
-rw-r--r--  fs/xfs/xfs_utils.c | 2
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 408
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
365 files changed, 45195 insertions, 5780 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -168,6 +175,33 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)		+= ext2/
@@ -113,10 +114,13 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
+obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
+obj-$(CONFIG_EXOFS_FS)		+= exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = dentry->d_sb->s_blocksize;
-	buf->f_blocks  = asb->s_size;
-	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(dentry->d_sb);
+	buf->f_bfree   = adfs_map_free(sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
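
Both statfs() hunks in this merge (here and in fs/affs/super.c just below) derive f_fsid the same way: huge_encode_dev() folds the backing block device's dev_t into a u64, whose halves land in f_fsid.val[0] and val[1]. A minimal user-space sketch of just that split, with a made-up device number standing in for the real one:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* hypothetical stand-in for huge_encode_dev(sb->s_bdev->bd_dev) */
		uint64_t id = 0x0000001200345678ULL;
		uint32_t val[2];

		val[0] = (uint32_t)id;		/* low 32 bits */
		val[1] = (uint32_t)(id >> 32);	/* high 32 bits */

		printf("f_fsid = { %#x, %#x }\n", val[0], val[1]);
		return 0;
	}
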
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..5ce695e707fe 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int		 free;
+	u64		 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree   = free;
 	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name		= "cell_ix",
-	.data_size	= sizeof(struct afs_cache_cell),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_cell_cache_match,
-	.update		= afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name		= "afs",
+	.version	= 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name		= "AFS.cell",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_cell_cache_get_key,
+	.get_aux	= afs_cell_cache_get_aux,
+	.check_aux	= afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name		= "AFS.vldb",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_vlocation_cache_get_key,
+	.get_aux	= afs_vlocation_cache_get_aux,
+	.check_aux	= afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name		= "AFS.volume",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name		= "AFS.vnode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= afs_vnode_cache_get_key,
+	.get_attr	= afs_vnode_cache_get_attr,
+	.get_aux	= afs_vnode_cache_get_aux,
+	.check_aux	= afs_vnode_cache_check_aux,
+	.now_uncached	= afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name		= "vldb",
-	.data_size	= sizeof(struct afs_cache_vlocation),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_vlocation_cache_match,
-	.update		= afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name		= "volume",
-	.data_size	= sizeof(struct afs_cache_vhash),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match		= afs_volume_cache_match,
-	.update		= afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name		= "vnode",
-	.data_size	= sizeof(struct afs_cache_vnode),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match		= afs_vnode_cache_match,
-	.update		= afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
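
The rewritten cache.c is built around FS-Cache's coherency callbacks: get_key serialises an index key, get_aux captures a small auxiliary blob (for a vnode, fid.unique plus status.data_version), and check_aux later decides whether a cached object is still valid by comparing a stored blob against the live object. A user-space sketch of that round trip, using hypothetical field names that merely mirror the AFS ones:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct vnode {				/* stand-in for struct afs_vnode */
		uint32_t unique;
		uint64_t data_version;
	};

	/* serialise the aux blob, in the spirit of afs_vnode_cache_get_aux() */
	static size_t get_aux(const struct vnode *v, void *buf, size_t bufmax)
	{
		size_t dlen = sizeof(v->unique) + sizeof(v->data_version);

		if (dlen > bufmax)
			return 0;
		memcpy(buf, &v->unique, sizeof(v->unique));
		memcpy((char *)buf + sizeof(v->unique), &v->data_version,
		       sizeof(v->data_version));
		return dlen;
	}

	/* validate a stored blob against the live object, as check_aux does */
	static int still_valid(const struct vnode *v, const void *buf, size_t buflen)
	{
		unsigned char live[sizeof(uint32_t) + sizeof(uint64_t)];

		if (buflen != get_aux(v, live, sizeof(live)))
			return 0;		/* wrong size: obsolete */
		return memcmp(live, buf, buflen) == 0;
	}

	int main(void)
	{
		struct vnode v = { .unique = 7, .data_version = 42 };
		unsigned char blob[32];
		size_t n = get_aux(&v, blob, sizeof(blob));

		printf("valid=%d\n", still_valid(&v, blob, n));	/* 1 */
		v.data_version++;	/* the file changed on the server */
		printf("valid=%d\n", still_valid(&v, blob, n));	/* 0 */
		return 0;
	}
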
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
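
Two things change here beyond the cachefs-to-fscache rename: the cookie is now returned by value rather than filled in through an out-parameter, and the cell is registered against afs_cell_cache_index_def where the old code passed the vlocation index definition (apparently a copy-and-paste slip). The second argument of fscache_relinquish_cookie() (0 here) means the on-disk data is kept rather than retired. A toy sketch of the acquire/release pairing using a hypothetical stub API, not the real fscache one:

	#include <stdio.h>

	struct cookie { const char *index; };

	/* hypothetical stand-ins for the fscache cookie calls */
	static struct cookie *acquire(const char *index, void *netfs_data)
	{
		static struct cookie c;
		c.index = index;	/* never fails, per the comment above */
		(void)netfs_data;
		return &c;
	}

	static void relinquish(struct cookie *c, int retire)
	{
		printf("released %s (retire=%d)\n", c->index, retire);
	}

	int main(void)
	{
		int cell;		/* stand-in for struct afs_cell */
		struct cookie *cache = acquire("AFS.cell", &cell);

		/* ... the cell lives and the cache is usable ... */
		relinquish(cache, 0);	/* 0: keep cached data on disk */
		return 0;
	}
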
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..7a1d942ef68d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
 	.set_page_dirty	= afs_set_page_dirty,
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-		/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 		/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-		/* no page available in cache */
-	case -ENOBUFS:
+		/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			ret = -ESTALE;
 		}
-#ifdef AFS_CACHING_SUPPORT
-		cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+		fscache_uncache_page(vnode->cache, page);
 #endif
+		BUG_ON(PageFsCache(page));
 		goto error;
 	}
 
 	SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-	if (cachefs_write_page(vnode->cache,
-			       page,
-			       afs_file_readpage_write_complete,
-			       NULL,
-			       GFP_KERNEL) != 0
-	    ) {
-		cachefs_uncache_page(vnode->cache, page);
-		unlock_page(page);
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
 	}
-#else
-	unlock_page(page);
 #endif
+	unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
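
afs_readpage() and the new afs_readpages() share one control flow: ask the cache first, treating 0 as "the cache is filling the page", ENODATA as "not cached yet, fetch and then store", and ENOBUFS as "will not be cached, just fetch". A compressed sketch of that decision ladder against a hypothetical cache API (not the real fscache calls):

	#include <errno.h>
	#include <stdio.h>

	/* hypothetical cache front-end: 0 = cache completes the page itself */
	static int cache_read(char *page)
	{
		(void)page;
		return -ENODATA;	/* pretend the page is not cached yet */
	}

	static int fetch_from_server(char *page)
	{
		page[0] = 'x';		/* pretend we filled the page */
		return 0;
	}

	static int readpage(char *page)
	{
		int ret = cache_read(page);
		int cacheable = 1;

		switch (ret) {
		case 0:			/* read submitted; cache completes it */
			return 0;
		case -ENODATA:		/* not yet cached: fetch, then store */
			break;
		case -ENOBUFS:		/* page will not be cached */
		default:
			cacheable = 0;
			break;
		}

		ret = fetch_from_server(page);
		if (ret == 0 && cacheable)
			puts("writing page back to the cache (best effort)");
		return ret;
	}

	int main(void)
	{
		char page[4096];
		return readpage(page);
	}
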
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink		= vnode->status.nlink;
 	inode->i_uid		= vnode->status.owner;
 	inode->i_gid		= 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head	grave;		/* link in master graveyard list */
 	struct list_head	update;		/* link in master update list */
 	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb;	/* volume information DB record */
 	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
446 */ 463 */
447extern struct rw_semaphore afs_proc_cells_sem; 464extern struct rw_semaphore afs_proc_cells_sem;
448extern struct list_head afs_proc_cells; 465extern struct list_head afs_proc_cells;
449#ifdef AFS_CACHING_SUPPORT
450extern struct cachefs_index_def afs_cache_cell_index_def;
451#endif
452 466
453#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) 467#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
454extern int afs_cell_init(char *); 468extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
554 * main.c 568 * main.c
555 */ 569 */
556extern struct afs_uuid afs_uuid; 570extern struct afs_uuid afs_uuid;
557#ifdef AFS_CACHING_SUPPORT
558extern struct cachefs_netfs afs_cache_netfs;
559#endif
560 571
561/* 572/*
562 * misc.c 573 * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
637/* 648/*
638 * vlclient.c 649 * vlclient.c
639 */ 650 */
640#ifdef AFS_CACHING_SUPPORT
641extern struct cachefs_index_def afs_vlocation_cache_index_def;
642#endif
643
644extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, 651extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
645 const char *, struct afs_cache_vlocation *, 652 const char *, struct afs_cache_vlocation *,
646 const struct afs_wait_mode *); 653 const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
664/* 671/*
665 * vnode.c 672 * vnode.c
666 */ 673 */
667#ifdef AFS_CACHING_SUPPORT
668extern struct cachefs_index_def afs_vnode_cache_index_def;
669#endif
670
671extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
672
673static inline struct afs_vnode *AFS_FS_I(struct inode *inode) 674static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
674{ 675{
675 return container_of(inode, struct afs_vnode, vfs_inode); 676 return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
711/* 712/*
712 * volume.c 713 * volume.c
713 */ 714 */
714#ifdef AFS_CACHING_SUPPORT
715extern struct cachefs_index_def afs_volume_cache_index_def;
716#endif
717
718#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) 715#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
719 716
720extern void afs_put_volume(struct afs_volume *); 717extern void afs_put_volume(struct afs_volume *);
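Note: the #else stubs above expand each afs_*_cache_index_def to a typed NULL so that references still type-check when FS-Cache is compiled out (the stubbed fscache calls never evaluate the argument). A minimal sketch of the call-site pattern this enables, modeled on the cell code in this series rather than quoted from it:

	#ifdef CONFIG_AFS_FSCACHE
		cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
						     &afs_cell_cache_index_def,
						     cell);
	#endif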
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
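Note: with FS-Cache the netfs description that used to live in main.c moves to the new fs/afs/cache.c (see the diffstat), and FS-Cache creates the primary index itself, which is why registration now takes only the netfs descriptor. A sketch of the new-style definition and its register/unregister pairing, assuming the fscache_netfs layout this series introduces:

	struct fscache_netfs afs_cache_netfs = {
		.name		= "afs",
		.version	= 0,
	};

	/* in module init / exit: */
	ret = fscache_register_netfs(&afs_cache_netfs);
	...
	fscache_unregister_netfs(&afs_cache_netfs);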
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (PageError(page))
 		goto error;
 
-	buf = kmap(page);
+	buf = kmap_atomic(page, KM_USER0);
 	memcpy(devname, buf, size);
-	kunmap(page);
+	kunmap_atomic(buf, KM_USER0);
 	page_cache_release(page);
 	page = NULL;
 
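Note: switching kmap() to kmap_atomic() here is safe because only a memcpy() sits between map and unmap. Atomic kmaps are much cheaper, but they disable preemption for the duration, so nothing in that window may sleep. The 2.6.30-era pattern, with its explicit KM_* slot:

	buf = kmap_atomic(page, KM_USER0);	/* preemption disabled from here */
	memcpy(devname, buf, size);		/* pure CPU work; must not sleep */
	kunmap_atomic(buf, KM_USER0);		/* slot released; sleeping OK again */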
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
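Note: taken together, the vlocation hunks above show the complete fscache cookie lifecycle for one object class. Condensed (names taken from the diff, error handling elided):

	/* acquire: index this object under its parent cell's cookie */
	vl->cache = fscache_acquire_cookie(vl->cell->cache,
					   &afs_vlocation_cache_index_def, vl);

	/* update: push changed index/auxiliary data back into the cache */
	fscache_update_cookie(vl->cache);

	/* relinquish: second argument 0 keeps the cached data around;
	 * non-zero would retire (discard) it */
	fscache_relinquish_cookie(vl->cache, 0);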
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
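Note: afs_page_mkwrite() only takes effect once hooked into the file's vm_operations, which happens elsewhere in this series. A sketch of that wiring for shape only — the vm_ops table and mmap helper names here are hypothetical:

	static struct vm_operations_struct afs_file_vm_ops = {	/* hypothetical */
		.fault		= filemap_fault,
		.page_mkwrite	= afs_page_mkwrite,
	};

	static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
	{
		file_accessed(file);
		vma->vm_ops = &afs_file_vm_ops;
		return 0;
	}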
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
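Note: the two new lines pack the 64-bit encoded device number into the two 32-bit f_fsid words — the usual idiom for giving statfs() a stable filesystem id. The split is lossless; reassembly is just the reverse shift:

	u64 id   = huge_encode_dev(sb->s_bdev->bd_dev);	/* major/minor -> u64 */
	u64 back = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];
	/* back == id */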
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX ||
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 	fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..70cfc4b84ae0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 		params->elfhdr_addr = seg->addr;
 
 		/* clear any space allocated but not loaded */
-		if (phdr->p_filesz < phdr->p_memsz)
-			clear_user((void *) (seg->addr + phdr->p_filesz),
+		if (phdr->p_filesz < phdr->p_memsz) {
+			ret = clear_user((void *) (seg->addr + phdr->p_filesz),
 				   phdr->p_memsz - phdr->p_filesz);
+			if (ret)
+				return ret;
+		}
 
 		if (mm) {
 			if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void __user *) maddr, disp);
+			ret = clear_user((void __user *) maddr, disp);
+			if (ret)
+				return ret;
 			maddr += disp;
 		}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
 				   excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
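Note: clear_user() returns the number of bytes it failed to zero, so any non-zero result means the user mapping was bad; these hunks stop ignoring that and propagate the count as the error value. The more common idiom maps the shortfall to -EFAULT, e.g. (illustrative helper, not from this patch):

	static int zero_user_range(void __user *p, unsigned long n)
	{
		if (clear_user(p, n))	/* non-zero: some bytes left un-zeroed */
			return -EFAULT;
		return 0;
	}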
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio.c b/fs/bio.c
index a040cde7f6fd..e0c9e545bbfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1420,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 }
 
 /*
- * split a bio - only worry about a bio with a single page
- * in it's iovec
+ * split a bio - only worry about a bio with a single page in its iovec
  */
 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf3..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 	}
 	return sync_blockdev(bdev);
 }
+EXPORT_SYMBOL(fsync_bdev);
 
 /**
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 		}
 
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
 			if (!list_empty(&worker->pending))
 				continue;
 
+			if (kthread_should_stop())
+				break;
+
 			/* still no more work?, sleep for real */
 			spin_lock_irq(&worker->lock);
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
 
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 		}
 		__set_current_state(TASK_RUNNING);
 	}
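Note: the two added checks close a shutdown race. kthread_stop() wakes the thread exactly once, so a worker must re-test the stop flag after setting TASK_INTERRUPTIBLE and before calling schedule(), or it can sleep through its own stop request. The generic shape of the pattern (illustrative, not the btrfs code):

	static int worker_fn(void *arg)
	{
		while (!kthread_should_stop()) {
			/* ... drain pending work ... */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop())	/* re-check in new state */
				schedule();
			__set_current_state(TASK_RUNNING);
		}
		return 0;
	}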
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dbb724124633..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves, finding things that are close
  * to the block in 'slot', and triggering ra on them.
  */
-static noinline void reada_for_search(struct btrfs_root *root,
+static void reada_for_search(struct btrfs_root *root,
 			     struct btrfs_path *path,
 			     int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 }
 
 /*
+ * helper function for btrfs_search_slot.  The goal is to find a block
+ * in cache without setting the path to blocking.  If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada.  -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct btrfs_path *p,
+		      struct extent_buffer **eb_ret, int level, int slot,
+		      struct btrfs_key *key)
+{
+	u64 blocknr;
+	u64 gen;
+	u32 blocksize;
+	struct extent_buffer *b = *eb_ret;
+	struct extent_buffer *tmp;
+
+	blocknr = btrfs_node_blockptr(b, slot);
+	gen = btrfs_node_ptr_generation(b, slot);
+	blocksize = btrfs_level_size(root, level - 1);
+
+	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+		*eb_ret = tmp;
+		return 0;
+	}
+
+	/*
+	 * reduce lock contention at high levels
+	 * of the btree by dropping locks before
+	 * we read.
+	 */
+	btrfs_release_path(NULL, p);
+	if (tmp)
+		free_extent_buffer(tmp);
+	if (p->reada)
+		reada_for_search(root, p, level, slot, key->objectid);
+
+	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	if (tmp)
+		free_extent_buffer(tmp);
+	return -EAGAIN;
+}
+
+/*
+ * helper function for btrfs_search_slot.  This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned.  If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer *b, int level, int ins_len)
+{
+	int ret;
+	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = split_node(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		BUG_ON(sret > 0);
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+	} else if (ins_len < 0 && btrfs_header_nritems(b) <
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = balance_level(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+		if (!b) {
+			btrfs_release_path(NULL, p);
+			goto again;
+		}
+		BUG_ON(btrfs_header_nritems(b) == 1);
+	}
+	return 0;
+
+again:
+	ret = -EAGAIN;
+done:
+	return ret;
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
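Note: both new helpers use the same contract — -EAGAIN means "I had to release your path; restart the descent". The caller's side of the contract reduces to this schematic shape (the descend step is abbreviated):

	again:
		b = btrfs_lock_root_node(root);		/* fresh descent */
		...
		ret = setup_nodes_for_search(trans, root, p, b, level, ins_len);
		if (ret == -EAGAIN)
			goto again;	/* path was released; no locks held */
		else if (ret)
			goto done;	/* hard error */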
@@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	struct extent_buffer *tmp;
 	int slot;
 	int ret;
 	int level;
-	int should_reada = p->reada;
 	int lowest_unlock = 1;
-	int blocksize;
 	u8 lowest_level = 0;
-	u64 blocknr;
-	u64 gen;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1502,7 +1608,11 @@ again:
 		if (cow) {
 			int wret;
 
-			/* is a cow on this block not required */
+			/*
+			 * if we don't really need to cow this block
+			 * then we don't want to set the path blocking,
+			 * so we test it here
+			 */
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -1557,51 +1667,15 @@ cow_done:
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if ((p->search_for_split || ins_len > 0) &&
-			    btrfs_header_nritems(b) >=
-			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = split_node(trans, root, p, level);
-				btrfs_clear_path_blocking(p, NULL);
-
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				slot = p->slots[level];
-			} else if (ins_len < 0 &&
-				   btrfs_header_nritems(b) <
-				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = balance_level(trans, root, p, level);
-				btrfs_clear_path_blocking(p, NULL);
+			ret = setup_nodes_for_search(trans, root, p, b, level,
+						     ins_len);
+			if (ret == -EAGAIN)
+				goto again;
+			else if (ret)
+				goto done;
+			b = p->nodes[level];
+			slot = p->slots[level];
 
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				if (!b) {
-					btrfs_release_path(NULL, p);
-					goto again;
-				}
-				slot = p->slots[level];
-				BUG_ON(btrfs_header_nritems(b) == 1);
-			}
 			unlock_up(p, level, lowest_unlock);
 
 			/* this is only true while dropping a snapshot */
@@ -1610,44 +1684,11 @@ cow_done:
 				goto done;
 			}
 
-			blocknr = btrfs_node_blockptr(b, slot);
-			gen = btrfs_node_ptr_generation(b, slot);
-			blocksize = btrfs_level_size(root, level - 1);
+			ret = read_block_for_search(trans, root, p,
+						    &b, level, slot, key);
+			if (ret == -EAGAIN)
+				goto again;
 
-			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-				b = tmp;
-			} else {
-				/*
-				 * reduce lock contention at high levels
-				 * of the btree by dropping locks before
-				 * we read.
-				 */
-				if (level > 0) {
-					btrfs_release_path(NULL, p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-
-					tmp = read_tree_block(root, blocknr,
-							      blocksize, gen);
-					if (tmp)
-						free_extent_buffer(tmp);
-					goto again;
-				} else {
-					btrfs_set_path_blocking(p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-					b = read_node_slot(root, b, slot);
-				}
-			}
 			if (!p->skip_locking) {
 				int lret;
 
@@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
-	if (slot > nritems)
-		BUG();
+	BUG_ON(slot > nritems);
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
@@ -4086,28 +4126,44 @@ next:
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
-	int level = 1;
+	int level;
 	struct extent_buffer *c;
-	struct extent_buffer *next = NULL;
+	struct extent_buffer *next;
 	struct btrfs_key key;
 	u32 nritems;
 	int ret;
+	int old_spinning = path->leave_spinning;
+	int force_blocking = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
 		return 1;
 
-	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+	/*
+	 * we take the blocks in an order that upsets lockdep.  Using
+	 * blocking mode is the only way around it.
+	 */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	force_blocking = 1;
+#endif
 
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+	level = 1;
+	next = NULL;
 	btrfs_release_path(root, path);
+
 	path->keep_locks = 1;
+
+	if (!force_blocking)
+		path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
 		return ret;
 
-	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
 		path->slots[0]++;
+		ret = 0;
 		goto done;
 	}
 
 	while (level < BTRFS_MAX_LEVEL) {
-		if (!path->nodes[level])
-			return 1;
+		if (!path->nodes[level]) {
+			ret = 1;
+			goto done;
+		}
 
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL)
-				return 1;
+			if (level == BTRFS_MAX_LEVEL) {
+				ret = 1;
+				goto done;
+			}
 			continue;
 		}
 
@@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
-		/* the path was set to blocking above */
-		if (level == 1 && (path->locks[1] || path->skip_locking) &&
-		    path->reada)
-			reada_for_search(root, path, level, slot, 0);
+		next = c;
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    slot, &key);
+		if (ret == -EAGAIN)
+			goto again;
 
-		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(c);
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (path->locks[level])
 			btrfs_tree_unlock(c);
+
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
 			path->locks[level] = 1;
+
 		if (!level)
 			break;
 
-		btrfs_set_path_blocking(path);
-		if (level == 1 && path->locks[1] && path->reada)
-			reada_for_search(root, path, level, slot, 0);
-		next = read_node_slot(root, next, 0);
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    0, &key);
+		if (ret == -EAGAIN)
+			goto again;
+
 		if (!path->skip_locking) {
 			btrfs_assert_tree_locked(path->nodes[level]);
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 	}
+	ret = 0;
done:
 	unlock_up(path, 0, 1);
-	return 0;
+	path->leave_spinning = old_spinning;
+	if (!old_spinning)
+		btrfs_set_path_blocking(path);
+
+	return ret;
 }
 
 /*
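Note: both loops in btrfs_next_leaf() now take the child lock opportunistically. Distilled from the hunks above, the shape of the pattern is: try the cheap spinning lock first, and only on contention flip the whole path to blocking, take the lock the slow way, then restore spinning unless lockdep (force_blocking) forbids it:

	if (!btrfs_try_spin_lock(next)) {
		btrfs_set_path_blocking(path);	/* we are about to sleep */
		btrfs_tree_lock(next);
		if (!force_blocking)
			btrfs_clear_path_blocking(path, next);
	}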
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9417713542a2..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX		9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout. objectid corresonds to the inode number. The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
  *
@@ -200,7 +203,7 @@ struct btrfs_dev_item {
 
 	/*
 	 * starting byte of this partition on the device,
-	 * to allowr for stripe alignment in the future
+	 * to allow for stripe alignment in the future
 	 */
 	__le64 start_offset;
 
@@ -633,18 +636,35 @@ struct btrfs_space_info {
 	struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-	struct rb_node bytes_index;
-	struct rb_node offset_index;
-	u64 offset;
-	u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+	spinlock_t lock;
+	spinlock_t refill_lock;
+	struct rb_root root;
+
+	/* largest extent in this cluster */
+	u64 max_size;
+
+	/* first extent starting offset */
+	u64 window_start;
+
+	struct btrfs_block_group_cache *block_group;
+	/*
+	 * when a cluster is allocated from a block group, we put the
+	 * cluster onto a list in the block group so that it can
+	 * be freed before the block group is freed.
+	 */
+	struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
-	struct mutex alloc_mutex;
 	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
@@ -656,6 +676,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
+	spinlock_t tree_lock;
 	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
 
@@ -667,6 +688,11 @@ struct btrfs_block_group_cache {
 
 	/* usage count */
 	atomic_t count;
+
+	/* List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 */
+	struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -728,7 +754,6 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -839,8 +864,12 @@ struct btrfs_fs_info {
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
-	u64 last_alloc;
-	u64 last_data_alloc;
+
+	/* data_alloc_cluster is only used in ssd mode */
+	struct btrfs_free_cluster data_alloc_cluster;
+
+	/* all metadata allocations go through this cluster */
+	struct btrfs_free_cluster meta_alloc_cluster;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -932,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -963,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY	132
@@ -1010,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG		(1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
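Note: the mount options are plain bit flags, and the ##-pasting macros keep call sites terse. A usage sketch — the Opt_notreelog parser token is shown from memory as an illustration, not quoted from fs/btrfs/super.c:

	case Opt_notreelog:
		btrfs_set_opt(info->mount_opt, NOTREELOG);	/* sets (1 << 6) */
		break;
	...
	if (btrfs_test_opt(root, NOTREELOG))
		/* tree logging disabled: fall back to a full commit */;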
@@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-			      u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-				 u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-				   *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-					       *block_group, u64 offset,
-					       u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-			   u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cbf7dc8ae3ec..d6c01c096a40 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -18,7 +18,6 @@
 
 #include <linux/sched.h>
 #include <linux/sort.h>
-#include <linux/ftrace.h>
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92d73929d381..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
 	ret = extent_range_uptodate(io_tree, start + length,
 				    start + buf_len - 1);
-	if (ret == 1)
-		return ret;
 	return ret;
 }
 
@@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
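Note: btrfs_init_free_cluster() itself lands in the new free-space-cache.c (see the diffstat). Going only by the struct btrfs_free_cluster fields shown earlier, a plausible sketch of it — not the verbatim implementation — is:

	void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
	{
		spin_lock_init(&cluster->lock);
		spin_lock_init(&cluster->refill_lock);
		cluster->root.rb_node = NULL;	/* empty rbtree of free extents */
		cluster->max_size = 0;
		cluster->window_start = 0;
		cluster->block_group = NULL;
		INIT_LIST_HEAD(&cluster->block_group_list);
	}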
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5e7cae63d80..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
-	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
-	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -291,8 +290,8 @@ next:
 			  block_group->key.objectid +
 			  block_group->key.offset);
 
-	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
+	remove_sb_from_cache(root, block_group);
 	ret = 0;
 err:
 	btrfs_free_path(path);
@@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	if (atomic_dec_and_test(&cache->count))
 		kfree(cache);
@@ -399,12 +398,12 @@ again:
 		    div_factor(cache->key.offset, factor)) {
 			group_start = cache->key.objectid;
 			spin_unlock(&cache->lock);
-			put_block_group(cache);
+			btrfs_put_block_group(cache);
 			goto found;
 		}
 	}
 	spin_unlock(&cache->lock);
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 	cond_resched();
 }
 if (!wrapped) {
@@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	if (!block_group || block_group->ro)
 		readonly = 1;
 	if (block_group)
-		put_block_group(block_group);
+		btrfs_put_block_group(block_group);
 	return readonly;
 }
 
@@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				WARN_ON(ret);
 			}
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 		return 0;
 
 	bytenr = cache->key.objectid;
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 
 	return bytenr;
 }
@@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		clear_extent_dirty(&fs_info->pinned_extents,
 				   bytenr, bytenr + num - 1, GFP_NOFS);
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while (1) {
-		mutex_lock(&root->fs_info->pinned_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		cond_resched();
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return ret;
 }
 
@@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	free_extent_buffer(buf);
 pinit:
 	btrfs_set_path_blocking(path);
-	mutex_lock(&root->fs_info->pinned_mutex);
 	/* unlocks the pinned mutex */
 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
@@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	 */
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		mutex_lock(&root->fs_info->pinned_mutex);
-
 		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
@@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
-	u64 total_needed = num_bytes;
-	u64 *last_ptr = NULL;
-	u64 last_wanted = 0;
+	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
-	struct list_head *head = NULL, *cur = NULL;
-	int loop = 0;
-	int extra_loop = 0;
 	struct btrfs_space_info *space_info;
+	int last_ptr_loop = 0;
+	int loop = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 	ins->objectid = 0;
 	ins->offset = 0;
 
+	space_info = __find_space_info(root->fs_info, data);
+
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
-		last_ptr = &root->fs_info->last_alloc;
+		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-		last_ptr = &root->fs_info->last_data_alloc;
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->data_alloc_cluster;
+	}
 
 	if (last_ptr) {
-		if (*last_ptr) {
-			hint_byte = *last_ptr;
-			last_wanted = *last_ptr;
-		} else
-			empty_size += empty_cluster;
-	} else {
-		empty_cluster = 0;
+		spin_lock(&last_ptr->lock);
+		if (last_ptr->block_group)
+			hint_byte = last_ptr->window_start;
+		spin_unlock(&last_ptr->lock);
 	}
+
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 
-	if (last_wanted && search_start != last_wanted) {
-		last_wanted = 0;
-		empty_size += empty_cluster;
+	if (!last_ptr) {
+		empty_cluster = 0;
+		loop = 1;
 	}
 
-	total_needed += empty_size;
-	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
-	if (!block_group)
-		block_group = btrfs_lookup_first_block_group(root->fs_info,
-							     search_start);
-	space_info = __find_space_info(root->fs_info, data);
+	if (search_start == hint_byte) {
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						       search_start);
+		if (block_group && block_group_bits(block_group, data)) {
+			down_read(&space_info->groups_sem);
+			goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
2609 2600
2601search:
2610 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
2611 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
2612 struct btrfs_free_space *free_space; 2604 u64 offset;
2613 /*
2614 * the only way this happens is if our hint points to a block
2615 * group that's not of the proper type, while looping this
2616 * should never happen
2617 */
2618 if (empty_size)
2619 extra_loop = 1;
2620 2605
2621 if (!block_group) 2606 atomic_inc(&block_group->count);
2622 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
2623 2608
2609have_block_group:
2624 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
2625 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
2626 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2627 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2628 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
2629 break; 2616 break;
2617 }
2630 } 2618 }
2631 2619
2632 mutex_lock(&block_group->alloc_mutex);
2633 if (unlikely(!block_group_bits(block_group, data)))
2634 goto new_group;
2635
2636 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2637 goto new_group; 2621 goto loop;
2638 2622
2639 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
2640 total_needed); 2624 /*
2641 if (free_space) { 2625 * the refill lock keeps out other
2642 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
2643 u64 end = block_group->key.objectid + 2627 */
2644 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2645 2636
2646 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group it does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2647 2657
2648 /* move on to the next group */ 2658 /*
2649 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
2650 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2651 2663
2652 /* move on to the next group */ 2664 last_ptr_loop = 0;
2653 if (search_start + num_bytes > end)
2654 goto new_group;
2655 2665
2656 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
2657 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
2658 empty_size += empty_cluster; 2668 block_group, last_ptr,
2659 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2660 /* 2672 /*
2661 * if search_start is still in this block group 2673 * now pull our allocation out of this
2662 * then we just re-search this block group 2674 * cluster
2663 */ 2675 */
2664 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
2665 search_start < end) { 2677 last_ptr, num_bytes,
2666 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
2667 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2668 } 2683 }
2669
2670 /* else we go to the next block group */
2671 goto new_group;
2672 } 2684 }
2673 2685 /*
2674 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
2675 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
2676 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
2677 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
2678 /* 2690 */
2679 * if search_start is still in this block group 2691 if (loop < 2) {
2680 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
2681 */ 2693 last_ptr);
2682 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
2683 search_start < end) { 2695 goto loop;
2684 mutex_unlock(&block_group->alloc_mutex);
2685 last_wanted = 0;
2686 continue;
2687 }
2688
2689 /* else we go to the next block group */
2690 goto new_group;
2691 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2692 2699
2693 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2694 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
2706
2707 /* move on to the next group */
2708 if (search_start + num_bytes >= search_end) {
2709 btrfs_add_free_space(block_group, offset, num_bytes);
2710 goto loop;
2711 }
2695 2712
2696 btrfs_remove_free_space_lock(block_group, search_start, 2713 /* move on to the next group */
2697 num_bytes); 2714 if (search_start + num_bytes >
2698 /* we are all good, lets return */ 2715 block_group->key.objectid + block_group->key.offset) {
2699 mutex_unlock(&block_group->alloc_mutex); 2716 btrfs_add_free_space(block_group, offset, num_bytes);
2700 break; 2717 goto loop;
2701 } 2718 }
2702new_group:
2703 mutex_unlock(&block_group->alloc_mutex);
2704 put_block_group(block_group);
2705 block_group = NULL;
2706new_group_no_lock:
2707 /* don't try to compare new allocations against the
2708 * last allocation any more
2709 */
2710 last_wanted = 0;
2711 2719
2712 /* 2720 if (exclude_nr > 0 &&
2713 * Here's how this works. 2721 (search_start + num_bytes > exclude_start &&
2714 * loop == 0: we were searching a block group via a hint 2722 search_start < exclude_start + exclude_nr)) {
2715 * and didn't find anything, so we start at 2723 search_start = exclude_start + exclude_nr;
2716 * the head of the block groups and keep searching 2724
2717 * loop == 1: we're searching through all of the block groups 2725 btrfs_add_free_space(block_group, offset, num_bytes);
2718 * if we hit the head again we have searched 2726 /*
2719 * all of the block groups for this space and we 2727 * if search_start is still in this block group
2720 * need to try and allocate, if we cant error out. 2728 * then we just re-search this block group
2721 * loop == 2: we allocated more space and are looping through
2722 * all of the block groups again.
2723 */
2724 if (loop == 0) {
2725 head = &space_info->block_groups;
2726 cur = head->next;
2727 loop++;
2728 } else if (loop == 1 && cur == head) {
2729 int keep_going;
2730
2731 /* at this point we give up on the empty_size
2732 * allocations and just try to allocate the min
2733 * space.
2734 *
2735 * The extra_loop field was set if an empty_size
2736 * allocation was attempted above, and if this
2737 * is try we need to try the loop again without
2738 * the additional empty_size.
2739 */ 2729 */
2740 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
2741 empty_size = 0; 2731 search_start < (block_group->key.objectid +
2742 keep_going = extra_loop; 2732 block_group->key.offset))
2743 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
2744 2736
2745 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
2746 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
2747 ret = do_chunk_alloc(trans, root, num_bytes + 2739
2748 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
2749 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
2750 if (ret < 0) 2742 search_start - offset);
2751 goto loop_check; 2743 BUG_ON(offset > search_start);
2752 head = &space_info->block_groups; 2744
2753 /* 2745 /* we are all good, lets return */
2754 * we've allocated a new chunk, keep 2746 break;
2755 * trying 2747loop:
2756 */ 2748 btrfs_put_block_group(block_group);
2757 keep_going = 1; 2749 }
2758 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
2759 } else if (!allowed_chunk_alloc) { 2751
2760 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
2761 } 2753 * loop == 1, try again after forcing a chunk allocation
2762loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
2763 if (keep_going) { 2755 */
2764 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
2765 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
2766 } else { 2758 if (loop >= 2) {
2767 break; 2759 empty_size = 0;
2768 } 2760 empty_cluster = 0;
2769 } else if (cur == head) {
2770 break;
2771 } 2761 }
2772 2762
2773 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
2774 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
2775 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
2776 2770
2777 search_start = block_group->key.objectid; 2771 if (loop < 3) {
2778 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
2779 } 2778 }
2780 2779
2781 /* we found what we needed */ 2780 /* we found what we needed */
@@ -2783,21 +2782,10 @@ loop_check:
2783 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2784 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
2785 2784
2786 if (last_ptr) 2785 btrfs_put_block_group(block_group);
2787 *last_ptr = ins->objectid + ins->offset;
2788 ret = 0; 2786 ret = 0;
2789 } else if (!ret) {
2790 printk(KERN_ERR "btrfs searching for %llu bytes, "
2791 "num_bytes %llu, loop %d, allowed_alloc %d\n",
2792 (unsigned long long)total_needed,
2793 (unsigned long long)num_bytes,
2794 loop, allowed_chunk_alloc);
2795 ret = -ENOSPC;
2796 } 2787 }
2797 if (block_group)
2798 put_block_group(block_group);
2799 2788
2800 up_read(&space_info->groups_sem);
2801 return ret; 2789 return ret;
2802} 2790}
2803 2791
@@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2902 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
2903 2891
2904 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
2905 put_block_group(cache); 2893 btrfs_put_block_group(cache);
2906 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
2907 2895
2908 return ret; 2896 return ret;
@@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3040 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3041 ins->offset); 3029 ins->offset);
3042 BUG_ON(ret); 3030 BUG_ON(ret);
3043 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3045 ref_generation, owner, ins, 1); 3033 ref_generation, owner, ins, 1);
3046 return ret; 3034 return ret;
@@ -5729,7 +5717,7 @@ next:
5729 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
5730 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5731 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
5732 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
5733 ret = 0; 5721 ret = 0;
5734out: 5722out:
5735 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5856 5844
5857 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
5858 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
5859 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
5860 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
5861 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
5862 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
5863 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
5864 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5912 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5913 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
5914 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
5915 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
5916 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
5917 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
5918 5908
5919 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
5920 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5974 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
5975 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
5976 5966
5977 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
5978 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
5979 5969
5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5981 if (ret > 0) 5971 if (ret > 0)
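
The rewritten find_free_extent() above reduces to a small retry ladder: pass 0 walks every block group trying the cluster plus the full empty_size/empty_cluster padding, pass 1 retries after forcing a chunk allocation, and the last pass drops the padding before settling on -ENOSPC. A minimal userspace sketch of that control flow, with hypothetical helpers standing in for the real btrfs allocation paths:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-ins for the real allocation paths */
static bool try_cluster_alloc(void) { return false; }
static bool try_unclustered_alloc(int padding) { return padding == 0; }
static void force_chunk_alloc(void) { }

static int find_free_extent_model(int empty_size, int empty_cluster,
                                  bool allowed_chunk_alloc)
{
        int loop = 0;

search:
        if (try_cluster_alloc() ||
            try_unclustered_alloc(empty_size + empty_cluster))
                return 0;       /* found space */

        if (loop < 3 && (empty_size || empty_cluster || allowed_chunk_alloc)) {
                if (loop >= 2) {
                        /* last resort: retry without any padding */
                        empty_size = 0;
                        empty_cluster = 0;
                }
                if (allowed_chunk_alloc) {
                        force_chunk_alloc();
                        allowed_chunk_alloc = false;
                }
                loop++;
                goto search;
        }
        return -1;      /* -ENOSPC */
}

int main(void)
{
        printf("%d\n", find_free_extent_model(4096, 4096, true));
        return 0;
}

With the padding-bearing passes failing, the model only succeeds on the final pass, mirroring how the kernel loop degrades to a plain minimum-size allocation before giving up.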
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 08085af089e2..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2884 disko = 0;
2885 flags = 0; 2885 flags = 0;
2886 2886
2887 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1; 2888 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2891 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2902 break; 2898 } else {
2903 default:
2904 disko = em->block_start; 2899 disko = em->block_start;
2905 break;
2906 } 2900 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
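
The switch-to-if/else conversion in extent_fiemap() above is more than style: em->block_start is a u64 and the EXTENT_MAP_* markers are 64-bit sentinels, and a switch over a 64-bit value can make gcc on 32-bit targets emit a call to the libgcc helper __ucmpdi2, which the kernel does not link against. Chained comparisons compile to plain tests everywhere. A standalone sketch of the same dispatch (sentinel values modeled after the btrfs headers):

#include <stdint.h>
#include <stdio.h>

#define EXTENT_MAP_LAST_BYTE ((uint64_t)-4)
#define EXTENT_MAP_HOLE      ((uint64_t)-3)
#define EXTENT_MAP_INLINE    ((uint64_t)-2)
#define EXTENT_MAP_DELALLOC  ((uint64_t)-1)

/* if/else avoids a 64-bit switch and the __ucmpdi2 helper it can pull in */
static const char *classify(uint64_t block_start)
{
        if (block_start == EXTENT_MAP_LAST_BYTE)
                return "last byte";
        else if (block_start == EXTENT_MAP_HOLE)
                return "hole";
        else if (block_start == EXTENT_MAP_INLINE)
                return "inline";
        else if (block_start == EXTENT_MAP_DELALLOC)
                return "delalloc";
        return "mapped";
}

int main(void)
{
        printf("%s\n", classify(EXTENT_MAP_HOLE));
        return 0;
}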
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space, we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes in size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes in size, and if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
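
The fuzzy flag described above is easiest to check against a small model. A userspace sketch over a sorted array, standing in for the offset-indexed rbtree (the struct and loop are illustrative, not the kernel types):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct free_space { uint64_t offset, bytes; };

/*
 * fuzzy == 0: only an entry starting exactly at 'offset' qualifies.
 * fuzzy == 1: an entry containing 'offset' also qualifies, as does
 * the closest large-enough entry starting after it.
 */
static struct free_space *search(struct free_space *e, size_t n,
                                 uint64_t offset, uint64_t bytes, int fuzzy)
{
        struct free_space *ret = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (e[i].offset == offset && e[i].bytes >= bytes)
                        return &e[i];
                if (!fuzzy)
                        continue;
                if (e[i].offset < offset &&
                    e[i].offset + e[i].bytes - 1 >= offset &&
                    e[i].bytes >= bytes)
                        return &e[i];
                if (e[i].offset > offset && e[i].bytes >= bytes &&
                    (!ret || e[i].offset < ret->offset))
                        ret = &e[i];
        }
        return ret;
}

int main(void)
{
        struct free_space map[] = { { 0, 4096 }, { 8192, 16384 } };
        struct free_space *hit = search(map, 2, 4096, 4096, 1);

        printf("%llu\n", hit ? (unsigned long long)hit->offset : 0ULL);
        return 0;
}

With fuzzy set, the lookup for offset 4096 falls through to the entry at 8192; with fuzzy clear it would return NULL.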
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while (node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
498 /* someone else has already freed it don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while (1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocations with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs, it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while (1) {
628 /* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while (1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
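
The window walk inside btrfs_find_space_cluster() above is the heart of the new clustering: take free extents in offset order, accumulate them into a window until it covers bytes + empty_size, and restart the window whenever the next extent sits too far from the window start. A minimal userspace model of just that walk (array instead of rbtree, and without the min_bytes retry logic):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct free_space { uint64_t offset, bytes; };

static int build_cluster(struct free_space *e, size_t n,
                         uint64_t bytes, uint64_t empty_size,
                         uint64_t *window_start)
{
        uint64_t target = bytes + empty_size;
        uint64_t window_free = 0;
        size_t first = 0, i;

        for (i = 0; i < n; i++) {
                if (window_free == 0 ||
                    e[i].offset - e[first].offset > target * 2) {
                        /* window empty or too sparse: restart here */
                        first = i;
                        window_free = e[i].bytes;
                } else {
                        window_free += e[i].bytes;
                }
                if (window_free >= target) {
                        *window_start = e[first].offset;
                        return 0;
                }
        }
        return -1;      /* -ENOSPC in the kernel version */
}

int main(void)
{
        struct free_space map[] = { { 0, 4096 }, { 4096, 4096 },
                                    { 1048576, 8192 } };
        uint64_t start;

        if (!build_cluster(map, 3, 4096, 4096, &start))
                printf("window at %llu\n", (unsigned long long)start);
        return 0;
}

A far-away extent resets the window rather than inflating it, which is why the kernel version tracks window_start and gives up on overly sparse regions.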
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
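
The header above exposes the cluster API that find_free_extent() drives with two locks: a short-lived lock around the cluster contents, and a separate refill_lock so only one task at a time runs the slow path that rebuilds a cluster. A userspace sketch of that two-lock split, with pthread mutexes standing in for kernel spinlocks and an assumed (not copied) field layout:

#include <pthread.h>
#include <stdio.h>

struct free_cluster {
        pthread_mutex_t lock;           /* guards the fields below */
        pthread_mutex_t refill_lock;    /* serializes rebuilds only */
        unsigned long window_start;
        unsigned long free_bytes;
};

static unsigned long alloc_fast(struct free_cluster *c, unsigned long bytes)
{
        unsigned long ret = 0;

        pthread_mutex_lock(&c->lock);
        if (bytes <= c->free_bytes) {
                ret = c->window_start;
                c->window_start += bytes;
                c->free_bytes -= bytes;
        }
        pthread_mutex_unlock(&c->lock);
        return ret;
}

static void refill_slow(struct free_cluster *c)
{
        pthread_mutex_lock(&c->refill_lock);
        /* the long search would happen here without holding c->lock */
        pthread_mutex_lock(&c->lock);
        c->window_start = 1048576;      /* pretend we found a window */
        c->free_bytes = 1048576;
        pthread_mutex_unlock(&c->lock);
        pthread_mutex_unlock(&c->refill_lock);
}

int main(void)
{
        struct free_cluster c = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0, 0
        };

        refill_slow(&c);
        printf("%lu\n", alloc_fast(&c, 4096));
        return 0;
}

Allocations never wait behind a refill, and refills never race each other, which matches the comment in find_free_extent() that the refill lock "keeps out other people trying to start a new cluster".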
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06d8db5afb08..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3481 3481
3482 if (dir) { 3482 if (dir) {
3483 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3484 if (ret) 3484 if (ret) {
3485 iput(inode);
3485 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3486 } 3488 }
3487 /* 3489 /*
3488 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3565,6 +3567,7 @@ fail:
3565 if (dir) 3567 if (dir)
3566 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3567 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3568 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3569} 3572}
3570 3573
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a5310c0f41e2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
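
btrfs_show_options() above replaces generic_show_options(), so /proc/mounts now reflects the parsed mount state instead of whatever string was last passed in. The convention visible in the hunk is to print only values that differ from their defaults. A compact userspace model of that convention (fields and defaults illustrative):

#include <stdio.h>

struct opts {
        int compress;                   /* default off */
        int ssd;                        /* default off */
        unsigned long long max_inline;  /* default 8192 * 1024 */
};

static void show_options(const struct opts *o)
{
        /* emit only non-default options, so the line stays short */
        if (o->compress)
                printf(",compress");
        if (o->ssd)
                printf(",ssd");
        if (o->max_inline != 8192 * 1024ULL)
                printf(",max_inline=%llu", o->max_inline);
        printf("\n");
}

int main(void)
{
        struct opts o = { 1, 0, 4096 };

        show_options(&o);
        return 0;
}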
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 664782c6a2df..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
974 int ret; 972 int ret;
975 int should_grow = 0; 973 int should_grow = 0;
976 unsigned long now = get_seconds(); 974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
977 976
978 btrfs_run_ordered_operations(root, 0); 977 btrfs_run_ordered_operations(root, 0);
979 978
@@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1053 1052
1054 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
1055 1054
1056 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
1057 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
1058 BUG_ON(ret); 1059 BUG_ON(ret);
1059 } 1060 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc9b87a7975b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
262 struct extent_buffer *eb, 262 struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) { 265 if (wc->pin)
266 mutex_lock(&log->fs_info->pinned_mutex);
267 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
268 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
269 }
270 268
271 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
272 if (wc->write) 270 if (wc->write)
@@ -1224,8 +1222,7 @@ insert:
1224 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1225 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1226 1224
1227 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1228 BUG();
1229 goto out; 1226 goto out;
1230} 1227}
1231 1228
@@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2900 2897
2901 sb = inode->i_sb; 2898 sb = inode->i_sb;
2902 2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2903 if (root->fs_info->last_trans_log_full_commit > 2905 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) { 2906 root->fs_info->last_trans_committed) {
2905 ret = 1; 2907 ret = 1;
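In the tree-log.c hunk, returning 1 from btrfs_log_inode_parent() tells the caller the tree-log could not be used, so the fsync falls back to a full transaction commit; the new NOTREELOG mount option simply forces that path unconditionally. A reduced sketch of the early-exit contract (names simplified from the kernel's):

    #include <stdbool.h>

    /* Sketch: 0 = fsync satisfied via the tree-log, 1 = caller must do a
     * full transaction commit instead. */
    static int try_tree_log(bool notreelog_opt, bool log_needs_full_commit)
    {
            if (notreelog_opt)
                    return 1;        /* logging disabled by mount option */
            if (log_needs_full_commit)
                    return 1;        /* log already invalidated this transaction */
            return 0;                /* safe to use the tree-log */
    }

    int main(void)
    {
            /* with -o notreelog every fsync becomes a full commit */
            return try_tree_log(true, false);   /* -> 1 */
    }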
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
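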
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things, it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
 268 * This will help anyone who is waiting on the IO; they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
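The scheduler-poking heuristic added to run_scheduled_bios() keeps the submission loop spinning while the current thread still holds a fresh elevator batch, so it can queue more bios without blocking; only once the batch goes stale does it fall through and requeue the work. A user-space approximation of that test (HZ and the io_context fields are stand-ins for the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define HZ 250UL                        /* hypothetical tick rate */

    /* jiffies-style wraparound-safe "a is before b" comparison */
    static bool time_before_j(unsigned long a, unsigned long b)
    {
            return (long)(a - b) < 0;
    }

    /* Sketch of the batching test: keep looping rather than blocking while
     * we are still inside the elevator batch we last waited on and that
     * batch is younger than HZ/50 ticks. */
    static bool still_in_batch(unsigned long now, int nr_batch_requests,
                               unsigned long ioc_last_waited,
                               unsigned long last_waited)
    {
            return nr_batch_requests > 0 &&
                   time_before_j(now, ioc_last_waited + HZ / 50UL) &&
                   (last_waited == 0 || ioc_last_waited == last_waited);
    }

    int main(void)
    {
            /* batch granted 2 ticks ago, 4 requests left: keep going */
            printf("%d\n", still_in_batch(1002, 4, 1000, 0));
            /* batch older than HZ/50 ticks (5 at HZ=250): stop and requeue */
            printf("%d\n", still_in_batch(1006, 4, 1000, 1000));
            return 0;
    }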
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index f5f8b15a6e40..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
199 head = page_buffers(page); 199 head = page_buffers(page);
200 bh = head; 200 bh = head;
201 do { 201 do {
202 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
203 ret = bh; 205 ret = bh;
204 get_bh(bh); 206 get_bh(bh);
205 goto out_unlock; 207 goto out_unlock;
206 } 208 }
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 bh = bh->b_this_page; 209 bh = bh->b_this_page;
210 } while (bh != head); 210 } while (bh != head);
211 211
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
737{ 737{
738 struct buffer_head *bh; 738 struct buffer_head *bh;
739 struct list_head tmp; 739 struct list_head tmp;
740 struct address_space *mapping; 740 struct address_space *mapping, *prev_mapping = NULL;
741 int err = 0, err2; 741 int err = 0, err2;
742 742
743 INIT_LIST_HEAD(&tmp); 743 INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
762 * contents - it is a noop if I/O is still in 762 * contents - it is a noop if I/O is still in
763 * flight on potentially older contents. 763 * flight on potentially older contents.
764 */ 764 */
765 ll_rw_block(SWRITE_SYNC, 1, &bh); 765 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
766
767 /*
768 * Kick off IO for the previous mapping. Note
 769 * that we will not run the very last mapping;
 770 * wait_on_buffer() will do that for us
771 * through sync_buffer().
772 */
773 if (prev_mapping && prev_mapping != mapping)
774 blk_run_address_space(prev_mapping);
775 prev_mapping = mapping;
776
766 brelse(bh); 777 brelse(bh);
767 spin_lock(lock); 778 spin_lock(lock);
768 } 779 }
@@ -1595,6 +1606,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1595 struct buffer_head *bh, *head; 1606 struct buffer_head *bh, *head;
1596 const unsigned blocksize = 1 << inode->i_blkbits; 1607 const unsigned blocksize = 1 << inode->i_blkbits;
1597 int nr_underway = 0; 1608 int nr_underway = 0;
1609 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1598 1610
1599 BUG_ON(!PageLocked(page)); 1611 BUG_ON(!PageLocked(page));
1600 1612
@@ -1686,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1686 do { 1698 do {
1687 struct buffer_head *next = bh->b_this_page; 1699 struct buffer_head *next = bh->b_this_page;
1688 if (buffer_async_write(bh)) { 1700 if (buffer_async_write(bh)) {
1689 submit_bh(WRITE, bh); 1701 submit_bh(write_op, bh);
1690 nr_underway++; 1702 nr_underway++;
1691 } 1703 }
1692 bh = next; 1704 bh = next;
@@ -1740,7 +1752,7 @@ recover:
1740 struct buffer_head *next = bh->b_this_page; 1752 struct buffer_head *next = bh->b_this_page;
1741 if (buffer_async_write(bh)) { 1753 if (buffer_async_write(bh)) {
1742 clear_buffer_dirty(bh); 1754 clear_buffer_dirty(bh);
1743 submit_bh(WRITE, bh); 1755 submit_bh(write_op, bh);
1744 nr_underway++; 1756 nr_underway++;
1745 } 1757 }
1746 bh = next; 1758 bh = next;
@@ -2956,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2956 for (i = 0; i < nr; i++) { 2968 for (i = 0; i < nr; i++) {
2957 struct buffer_head *bh = bhs[i]; 2969 struct buffer_head *bh = bhs[i];
2958 2970
2959 if (rw == SWRITE || rw == SWRITE_SYNC) 2971 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2960 lock_buffer(bh); 2972 lock_buffer(bh);
2961 else if (!trylock_buffer(bh)) 2973 else if (!trylock_buffer(bh))
2962 continue; 2974 continue;
2963 2975
2964 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2976 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
2977 rw == SWRITE_SYNC_PLUG) {
2965 if (test_clear_buffer_dirty(bh)) { 2978 if (test_clear_buffer_dirty(bh)) {
2966 bh->b_end_io = end_buffer_write_sync; 2979 bh->b_end_io = end_buffer_write_sync;
2967 get_bh(bh); 2980 get_bh(bh);
@@ -2997,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2997 if (test_clear_buffer_dirty(bh)) { 3010 if (test_clear_buffer_dirty(bh)) {
2998 get_bh(bh); 3011 get_bh(bh);
2999 bh->b_end_io = end_buffer_write_sync; 3012 bh->b_end_io = end_buffer_write_sync;
3000 ret = submit_bh(WRITE, bh); 3013 ret = submit_bh(WRITE_SYNC, bh);
3001 wait_on_buffer(bh); 3014 wait_on_buffer(bh);
3002 if (buffer_eopnotsupp(bh)) { 3015 if (buffer_eopnotsupp(bh)) {
3003 clear_buffer_eopnotsupp(bh); 3016 clear_buffer_eopnotsupp(bh);
@@ -3315,7 +3328,6 @@ EXPORT_SYMBOL(cont_write_begin);
3315EXPORT_SYMBOL(end_buffer_read_sync); 3328EXPORT_SYMBOL(end_buffer_read_sync);
3316EXPORT_SYMBOL(end_buffer_write_sync); 3329EXPORT_SYMBOL(end_buffer_write_sync);
3317EXPORT_SYMBOL(file_fsync); 3330EXPORT_SYMBOL(file_fsync);
3318EXPORT_SYMBOL(fsync_bdev);
3319EXPORT_SYMBOL(generic_block_bmap); 3331EXPORT_SYMBOL(generic_block_bmap);
3320EXPORT_SYMBOL(generic_cont_expand_simple); 3332EXPORT_SYMBOL(generic_cont_expand_simple);
3321EXPORT_SYMBOL(init_buffer); 3333EXPORT_SYMBOL(init_buffer);
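Two of the buffer.c hunks deserve a closer look. In __find_get_block_slow() the buffer-mapped test now runs before the block-number comparison, so a buffer whose b_blocknr is stale garbage (because it was never mapped) can no longer be returned as a match; fsync_buffers_list(), meanwhile, submits under a plug and kicks each mapping's queue as soon as the loop moves on to the next one. A distilled sketch of the corrected walk, with the struct reduced to the fields that matter:

    #include <stddef.h>

    struct bh {
            unsigned long b_blocknr;
            int           mapped;
            struct bh    *b_this_page;     /* circular list of page buffers */
    };

    /* Sketch of the fixed loop in __find_get_block_slow(): test the mapped
     * state first, so only a mapped buffer can match on block number. */
    static struct bh *find_block(struct bh *head, unsigned long block,
                                 int *all_mapped)
    {
            struct bh *bh = head;

            *all_mapped = 1;
            do {
                    if (!bh->mapped)
                            *all_mapped = 0;
                    else if (bh->b_blocknr == block)
                            return bh;
                    bh = bh->b_this_page;
            } while (bh != head);
            return NULL;
    }

    int main(void)
    {
            struct bh a, b;
            int all_mapped;

            a = (struct bh){ .b_blocknr = 7, .mapped = 0, .b_this_page = &b };
            b = (struct bh){ .b_blocknr = 7, .mapped = 1, .b_this_page = &a };

            /* unmapped 'a' is skipped even though its stale b_blocknr
             * matches; only the mapped buffer 'b' is returned */
            return find_block(&a, 7, &all_mapped) == &b ? 0 : 1;
    }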
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7 filesystems - primarily networking filesystems - thus allowing fast
8 local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
 19 enabled by setting bits in /sys/module/cachefiles/parameters/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
 27 This option causes latency information to be gathered on CacheFiles
 28 operations and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
 25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146 * security ID of files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
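cachefiles_daemon_add_cache() converts the configured percentages into absolute file and block counts by dividing the statfs totals by 100 and scaling by each limit; a freestanding rendition of the same arithmetic using the module's default limits (the filesystem figures below are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t f_files = 1310720;                /* hypothetical inode count */
            unsigned fstop = 1, fcull = 5, frun = 7;   /* default percentages */

            f_files /= 100;            /* mirrors do_div(stats.f_files, 100) */
            printf("fstop=%llu fcull=%llu frun=%llu\n",
                   (unsigned long long)(f_files * fstop),
                   (unsigned long long)(f_files * fcull),
                   (unsigned long long)(f_files * frun));
            /* culling starts when free files drop below fcull and ceases once
             * they climb back above frun; allocation stops below fstop */
            return 0;
    }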
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223 if (datalen < 0 || datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388 if (fstop < 0 || fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460 if (bstop < 0 || bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
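The command table and handlers above define the whole daemon protocol: one command per write(), configuration first, "bind" last, and the cache unbinds when the fd is released. A sketch of how a management daemon such as cachefilesd might drive it (the cache directory is illustrative, and the device node is assumed to be the module's misc device, /dev/cachefiles):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            static const char *cmds[] = {
                    "dir /var/cache/fscache",
                    "tag mycache",
                    "brun 7%", "bcull 5%", "bstop 1%",
                    "frun 7%", "fcull 5%", "fstop 1%",
                    "bind",
            };
            int fd = open("/dev/cachefiles", O_RDWR);
            unsigned i;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            for (i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++)
                    if (write(fd, cmds[i], strlen(cmds[i])) < 0)
                            perror(cmds[i]);
            /* keep the fd open: cachefiles_daemon_release() unbinds the cache */
            pause();
            return 0;
    }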
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
 78 /* turn the raw key into something we can work with as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
 196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
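cachefiles_alloc_object() above packs the netfs cookie key with a 16-bit length on the front and three NUL bytes of slack on the back before handing it to cachefiles_cook_key(); a freestanding illustration of that buffer layout (the key contents here are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned char buffer[2 + 512 + 3];      /* same size as the kernel's */
            const char raw_key[] = "example-key";   /* hypothetical cookie key */
            uint16_t keylen = sizeof(raw_key) - 1;

            memcpy(buffer, &keylen, sizeof(keylen)); /* length on the front */
            memcpy(buffer + 2, raw_key, keylen);     /* raw key body */
            buffer[keylen + 2] = 0;                  /* slack for the encoder */
            buffer[keylen + 3] = 0;
            buffer[keylen + 4] = 0;

            printf("len=%u, body starts with '%c'\n", keylen, buffer[2]);
            return 0;
    }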
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..19218e1463d6
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
 67 unsigned bshift; /* max(ilog2(PAGE_SIZE / bsize), 0) */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * cf-bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * cf-daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * cf-interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * cf-key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * cf-namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * cf-proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
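cachefiles_hist() clamps a jiffies delta into one of HZ one-jiffy buckets, so every latency of a full second or more piles into the last bucket. The same clamp-and-count logic, transplanted into a standalone userspace sketch with a stand-in HZ value and a plain counter in place of atomic_t:

    /* Userspace sketch of the latency bucketing done by cachefiles_hist().
     * HZ is a stand-in for the kernel tick rate; deltas of HZ jiffies or
     * more all land in the final bucket. */
    #include <stdio.h>

    #define HZ 250

    static unsigned long histogram[HZ];

    static void hist(unsigned long start_jif, unsigned long now_jif)
    {
    	unsigned long jif = now_jif - start_jif;

    	if (jif >= HZ)
    		jif = HZ - 1;	/* clamp long delays into the last bucket */
    	histogram[jif]++;
    }

    int main(void)
    {
    	hist(100, 103);		/* a 3-jiffy lookup */
    	hist(100, 1000);	/* a pathological delay, clamped */
    	printf("bucket 3 = %lu, bucket %d = %lu\n",
    	       histogram[3], HZ - 1, histogram[HZ - 1]);
    	return 0;
    }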
192/*
193 * cf-rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * cf-security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * cf-xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102 for (loop = keylen; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
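The non-printable branch above packs each 3 raw bytes into an accumulator and emits four characters from the 64-entry charmap, least-significant 6 bits first. That inner step, isolated into a runnable sketch (the charmap is copied from this file; the driver around it is illustrative only):

    /* Userspace sketch of the 3-raw-bytes -> 4-cooked-chars step used by
     * cachefiles_cook_key() for keys that are not usable ASCII. */
    #include <stdio.h>

    static const char charmap[64] =
    	"0123456789"
    	"abcdefghijklmnopqrstuvwxyz"
    	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    	"_-";

    static void cook3(const unsigned char raw[3], char out[4])
    {
    	unsigned int acc = raw[0] | raw[1] << 8 | raw[2] << 16;
    	int i;

    	for (i = 0; i < 4; i++) {	/* low 6 bits first, as in key.c */
    		out[i] = charmap[acc & 63];
    		acc >>= 6;
    	}
    }

    int main(void)
    {
    	const unsigned char raw[3] = { 0x01, 0x02, 0x03 };
    	char out[5] = { 0 };

    	cook3(raw, out);
    	printf("%02x%02x%02x -> \"%s\"\n", raw[0], raw[1], raw[2], out);
    	return 0;
    }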
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
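cachefiles_init() follows the kernel's usual goto-unwind idiom: each acquired resource has a matching error label, and a failure jumps to the label that releases everything obtained so far, in reverse order. The shape, reduced to a neutral userspace sketch (the acquire/release functions are placeholders, not CacheFiles APIs):

    /* Neutral sketch of the goto-based unwind used in cachefiles_init():
     * resources are released in reverse order of acquisition, and each
     * failure jumps past the release of anything not yet acquired. */
    #include <stdio.h>

    static int acquire_a(void) { return 0; }
    static int acquire_b(void) { return 0; }
    static int acquire_c(void) { return -1; }	/* pretend this fails */
    static void release_a(void) { puts("release a"); }
    static void release_b(void) { puts("release b"); }

    static int init(void)
    {
    	int ret;

    	ret = acquire_a();
    	if (ret < 0)
    		goto err;
    	ret = acquire_b();
    	if (ret < 0)
    		goto err_a;
    	ret = acquire_c();
    	if (ret < 0)
    		goto err_b;
    	return 0;

    err_b:
    	release_b();
    err_a:
    	release_a();
    err:
    	fprintf(stderr, "init failed: %d\n", ret);
    	return ret;
    }

    int main(void)
    {
    	return init() ? 1 : 0;
    }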
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file-backed objects are unlinked
122 * - directory-backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}
239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290 _leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
601
602/*
603 * find out if an object is in use or not
604 * - if it finds the object and it's not in use:
605 * - returns a pointer to its dentry and a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
753 * - returns -EBUSY or 0 to indicate whether an object is in use or not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
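cachefiles_walk_to_object() consumes its key as a list of NUL-separated path elements terminated by a double NUL: each pass takes one element, steps over its terminator with key + nlen + 1, and stops when the next byte is itself NUL. The walk in isolation, using a made-up three-element key rather than a real cooked key:

    /* Userspace sketch of the double-NUL-terminated segment walk used by
     * cachefiles_walk_to_object().  The bytes are "@c1\0I3a\0qux\0\0":
     * three elements, then an empty string that ends the walk. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	const char *key = "@c1\0I3a\0qux\0";	/* literal adds the 2nd NUL */

    	while (key) {
    		const char *name = key;
    		size_t nlen = strlen(key);

    		key += nlen + 1;	/* step over this element and its NUL */
    		if (!*key)
    			key = NULL;	/* double NUL: last element reached */

    		printf("lookup '%s'%s\n", name, key ? "" : " (terminal)");
    	}
    	return 0;
    }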
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56 if ((unsigned long long)*_pos >= HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
88 * open "/proc/fs/cachefiles/XXX" which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
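The seq_file iterator above maps positions onto output rows: positions 1 and 2 are the two header lines, and position p >= 3 displays histogram bucket p - 3, with t = (index * 1000) / HZ converting the bucket's jiffy index into milliseconds. A standalone sketch of that mapping with a stand-in HZ value (with HZ = 250, bucket 5 prints as 0.020 seconds, since (5 * 1000) / 250 = 20):

    /* Sketch of the pos -> row mapping used by the histogram seq_file ops:
     * positions 1 and 2 are headers, position p >= 3 is bucket p - 3. */
    #include <stdio.h>

    #define HZ 250

    int main(void)
    {
    	unsigned long pos;

    	for (pos = 1; pos < HZ + 3; pos++) {
    		if (pos == 1)
    			puts("JIFS   SECS");
    		else if (pos == 2)
    			puts("=====  =====");
    		else {
    			unsigned long index = pos - 3;
    			unsigned t = (index * 1000) / HZ;	/* jiffies -> ms */

    			if (index == 5)	/* print one sample row */
    				printf("%4lu   0.%03u\n", index, t);
    		}
    	}
    	return 0;
    }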
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
134 * read the corresponding page to the given set from the backing file
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - so we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if we ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343 * - TODO: don't use bmap() for this as it is _not_ actually good
344 * enough for this as it doesn't indicate errors, but it's all we've
345 * got for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
372
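The shift computed in cachefiles_read_or_alloc_page() turns a page index into the index of the first backing-fs block under that page, which is what gets probed with bmap(). A worked example with illustrative sizes (4 KiB pages over a 1 KiB-block filesystem):

    /* Worked example of the page-index -> block-index conversion done
     * before the bmap() probe.  The sizes here are examples only. */
    #include <stdio.h>

    int main(void)
    {
    	unsigned page_shift = 12;		/* 4096-byte pages */
    	unsigned blocksize_bits = 10;		/* 1024-byte blocks */
    	unsigned shift = page_shift - blocksize_bits;	/* = 2 */
    	unsigned long long index = 5;		/* sixth page of the file */
    	unsigned long long block0 = index << shift;

    	/* page 5 spans blocks 20..23; block 20 is the one probed */
    	printf("page %llu -> first block %llu\n", index, block0);
    	return 0;
    }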
373/*
374 * read the corresponding pages to the given set from the backing file
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - so we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654 * - TODO: don't use bmap() for this as it is _not_ actually
655 * good enough for this as it doesn't indicate errors, but
656 * it's all we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
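The shift computed in cachefiles_read_or_alloc_pages() above turns a page index into a backing-filesystem block number for bmap(): shift = PAGE_SHIFT - s_blocksize_bits, so with 4 KiB pages over a 1 KiB-block filesystem, page N starts at block N << 2. A minimal user-space sketch of that arithmetic (the 4 KiB page size and 1 KiB block size are assumptions for illustration):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;		/* assumed: 4 KiB pages */
	unsigned blocksize_bits = 10;		/* assumed: 1 KiB backing blocks */
	unsigned shift = page_shift - blocksize_bits;
	unsigned long long index = 3;		/* page 3 of the file */
	unsigned long long block0 = index << shift;

	/* page 3 covers backing blocks 12..15 */
	printf("page %llu -> first backing block %llu\n", index, block0);
	return 0;
}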
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
706	 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
746	 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
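Both allocation functions above use the same pagevec idiom: pagevec_add() reports the remaining capacity, so a zero return means the vector just became full and must be handed off in one call, with a final hand-off after the loop for any partial tail. A stand-alone model of that batch-and-flush pattern (BATCH and flush() are stand-ins for PAGEVEC_SIZE and fscache_mark_pages_cached(), not kernel API):

#define BATCH 14			/* stands in for PAGEVEC_SIZE */

static void flush(void **vec, unsigned *count)
{
	/* hand the whole batch off in one call, then reset */
	(void)vec;
	*count = 0;
}

static void mark_all(void **items, unsigned n)
{
	void *vec[BATCH];
	unsigned count = 0;
	unsigned i;

	for (i = 0; i < n; i++) {
		vec[count++] = items[i];
		if (count == BATCH)	/* vector full: flush now */
			flush(vec, &count);
	}
	if (count > 0)			/* don't forget the partial tail */
		flush(vec, &count);
}

int main(void)
{
	void *items[30] = { 0 };

	mark_all(items, 30);		/* flushes at 14, 28, then a tail of 2 */
	return 0;
}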
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
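One subtlety in cachefiles_write_page() above is the cast in pos = (loff_t)page->index << PAGE_SHIFT: the index must be widened to 64 bits before shifting, or the shift happens at the index's own width and offsets of 4 GiB and up are silently truncated where that width is 32 bits. Demonstrated in isolation (assumes the usual 32-bit unsigned int and 4 KiB pages):

#include <stdio.h>

int main(void)
{
	unsigned int index = 0x100000;			/* page at the 4 GiB mark */

	long long bad  = index << 12;			/* shifted at 32 bits: wraps to 0 */
	long long good = (long long)index << 12;	/* widened first: 4294967296 */

	printf("bad=%lld good=%lld\n", bad, good);
	return 0;
}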
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
51
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63	       " Security denies permission to make dirs: error %d\n",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71	       " Security denies permission to create files: error %d\n",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
99	 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
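cachefiles_begin_secure() and cachefiles_end_secure() are defined elsewhere (internal.h, not in this hunk); from their use here they presumably wrap the kernel's override_creds()/revert_creds() pair around cache->cache_cred. A user-space model of that save-override-restore shape (every name below is an illustrative stand-in, not the kernel API):

#include <stdio.h>

struct cred { const char *label; };

static struct cred task_creds = { "task creds" };
static const struct cred *current_cred = &task_creds;

static const struct cred *override_creds_sim(const struct cred *new_cred)
{
	const struct cred *old = current_cred;
	current_cred = new_cred;	/* act with the cache's identity */
	return old;			/* caller must keep this to restore */
}

static void revert_creds_sim(const struct cred *old)
{
	current_cred = old;		/* drop the override */
}

int main(void)
{
	struct cred cache_cred = { "cache creds" };
	const struct cred *saved;

	saved = override_creds_sim(&cache_cred);	/* begin_secure */
	printf("acting as: %s\n", current_cred->label);
	revert_creds_sim(saved);			/* end_secure */
	printf("back to:   %s\n", current_cred->label);
	return 0;
}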
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
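The type label checked above is exactly two bytes: the literal "C3" for a cookie-less object, otherwise the cookie's type rendered as two hex digits (the xattr stores the 2 characters without a NUL). The encoding step in isolation (the type value 0 is just an example input):

#include <stdio.h>

int main(void)
{
	char type[3];
	unsigned char cookie_type = 0;	/* example: an index cookie */

	/* mirrors: snprintf(type, 3, "%02x", cookie->def->type) */
	snprintf(type, sizeof(type), "%02x", cookie_type);
	printf("xattr payload: \"%s\" (2 bytes stored, no NUL)\n", type);
	return 0;
}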
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
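The auxdata handling in this file (reading into &auxbuf->type, taking byte 0 of the payload as the type, dlen = len - 1 for the netfs data) implies a header-plus-flexible-array layout for struct cachefiles_xattr, which is declared in internal.h rather than here. A sketch of the presumed shape and the offsets the code relies on:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* presumed shape of struct cachefiles_xattr (from internal.h usage) */
struct cachefiles_xattr_sketch {
	uint16_t len;		/* bytes of type+data actually present */
	uint8_t  type;		/* first byte of the on-disk xattr payload */
	uint8_t  data[];	/* netfs auxiliary data follows contiguously */
};

int main(void)
{
	/* vfs_getxattr() above reads into &auxbuf->type, so type and
	 * data must be adjacent: data starts 1 byte after type */
	printf("type at offset %zu, data at offset %zu\n",
	       offsetof(struct cachefiles_xattr_sketch, type),
	       offsetof(struct cachefiles_xattr_sketch, data));
	return 0;
}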
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8d..54dce78fbb73 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
254 return -ENOMEM; 254 return -ENOMEM;
255 } 255 }
256 256
257 mode &= ~current->fs->umask; 257 mode &= ~current_umask();
258 if (oplockEnabled) 258 if (oplockEnabled)
259 oplock = REQ_OPLOCK; 259 oplock = REQ_OPLOCK;
260 260
@@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
479 rc = -ENOMEM; 479 rc = -ENOMEM;
480 else if (pTcon->unix_ext) { 480 else if (pTcon->unix_ext) {
481 struct cifs_unix_set_info_args args = { 481 struct cifs_unix_set_info_args args = {
482 .mode = mode & ~current->fs->umask, 482 .mode = mode & ~current_umask(),
483 .ctime = NO_CHANGE_64, 483 .ctime = NO_CHANGE_64,
484 .atime = NO_CHANGE_64, 484 .atime = NO_CHANGE_64,
485 .mtime = NO_CHANGE_64, 485 .mtime = NO_CHANGE_64,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc60805..f121a80fdd6f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1125 goto mkdir_out; 1125 goto mkdir_out;
1126 } 1126 }
1127 1127
1128 mode &= ~current->fs->umask; 1128 mode &= ~current_umask();
1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, 1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1130 mode, NULL /* netfid */, pInfo, &oplock, 1130 mode, NULL /* netfid */, pInfo, &oplock,
1131 full_path, cifs_sb->local_nls, 1131 full_path, cifs_sb->local_nls,
@@ -1204,7 +1204,7 @@ mkdir_get_info:
1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1205 direntry->d_inode->i_nlink = 2; 1205 direntry->d_inode->i_nlink = 2;
1206 1206
1207 mode &= ~current->fs->umask; 1207 mode &= ~current_umask();
1208 /* must turn on setgid bit if parent dir has it */ 1208 /* must turn on setgid bit if parent dir has it */
1209 if (inode->i_mode & S_ISGID) 1209 if (inode->i_mode & S_ISGID)
1210 mode |= S_ISGID; 1210 mode |= S_ISGID;
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..3f84d5f15889 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
51#include <linux/poll.h> 51#include <linux/poll.h>
52#include <linux/mm.h> 52#include <linux/mm.h>
53#include <linux/eventpoll.h> 53#include <linux/eventpoll.h>
54#include <linux/fs_struct.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -1195,16 +1196,12 @@ out:
1195 return ret; 1196 return ret;
1196} 1197}
1197 1198
1198asmlinkage ssize_t 1199static ssize_t compat_readv(struct file *file,
1199compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1200 const struct compat_iovec __user *vec,
1201 unsigned long vlen, loff_t *pos)
1200{ 1202{
1201 struct file *file;
1202 ssize_t ret = -EBADF; 1203 ssize_t ret = -EBADF;
1203 1204
1204 file = fget(fd);
1205 if (!file)
1206 return -EBADF;
1207
1208 if (!(file->f_mode & FMODE_READ)) 1205 if (!(file->f_mode & FMODE_READ))
1209 goto out; 1206 goto out;
1210 1207
@@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1212 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1209 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1213 goto out; 1210 goto out;
1214 1211
1215 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1212 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1216 1213
1217out: 1214out:
1218 if (ret > 0) 1215 if (ret > 0)
1219 add_rchar(current, ret); 1216 add_rchar(current, ret);
1220 inc_syscr(current); 1217 inc_syscr(current);
1221 fput(file);
1222 return ret; 1218 return ret;
1223} 1219}
1224 1220
1225asmlinkage ssize_t 1221asmlinkage ssize_t
1226compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1222compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1223 unsigned long vlen)
1227{ 1224{
1228 struct file *file; 1225 struct file *file;
1229 ssize_t ret = -EBADF; 1226 int fput_needed;
1227 ssize_t ret;
1230 1228
1231 file = fget(fd); 1229 file = fget_light(fd, &fput_needed);
1232 if (!file) 1230 if (!file)
1233 return -EBADF; 1231 return -EBADF;
1232 ret = compat_readv(file, vec, vlen, &file->f_pos);
1233 fput_light(file, fput_needed);
1234 return ret;
1235}
1236
1237asmlinkage ssize_t
1238compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1239 unsigned long vlen, u32 pos_low, u32 pos_high)
1240{
1241 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1242 struct file *file;
1243 int fput_needed;
1244 ssize_t ret;
1245
1246 if (pos < 0)
1247 return -EINVAL;
1248 file = fget_light(fd, &fput_needed);
1249 if (!file)
1250 return -EBADF;
1251 ret = compat_readv(file, vec, vlen, &pos);
1252 fput_light(file, fput_needed);
1253 return ret;
1254}
1255
1256static ssize_t compat_writev(struct file *file,
1257 const struct compat_iovec __user *vec,
1258 unsigned long vlen, loff_t *pos)
1259{
1260 ssize_t ret = -EBADF;
1261
1234 if (!(file->f_mode & FMODE_WRITE)) 1262 if (!(file->f_mode & FMODE_WRITE))
1235 goto out; 1263 goto out;
1236 1264
@@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1238 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1266 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1239 goto out; 1267 goto out;
1240 1268
1241 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1269 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1242 1270
1243out: 1271out:
1244 if (ret > 0) 1272 if (ret > 0)
1245 add_wchar(current, ret); 1273 add_wchar(current, ret);
1246 inc_syscw(current); 1274 inc_syscw(current);
1247 fput(file); 1275 return ret;
1276}
1277
1278asmlinkage ssize_t
1279compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1280 unsigned long vlen)
1281{
1282 struct file *file;
1283 int fput_needed;
1284 ssize_t ret;
1285
1286 file = fget_light(fd, &fput_needed);
1287 if (!file)
1288 return -EBADF;
1289 ret = compat_writev(file, vec, vlen, &file->f_pos);
1290 fput_light(file, fput_needed);
1291 return ret;
1292}
1293
1294asmlinkage ssize_t
1295compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1296 unsigned long vlen, u32 pos_low, u32 pos_high)
1297{
1298 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1299 struct file *file;
1300 int fput_needed;
1301 ssize_t ret;
1302
1303 if (pos < 0)
1304 return -EINVAL;
1305 file = fget_light(fd, &fput_needed);
1306 if (!file)
1307 return -EBADF;
1308 ret = compat_writev(file, vec, vlen, &pos);
1309 fput_light(file, fput_needed);
1248 return ret; 1310 return ret;
1249} 1311}
1250 1312
@@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename,
1441 bprm->cred = prepare_exec_creds(); 1503 bprm->cred = prepare_exec_creds();
1442 if (!bprm->cred) 1504 if (!bprm->cred)
1443 goto out_unlock; 1505 goto out_unlock;
1444 check_unsafe_exec(bprm); 1506
1507 retval = check_unsafe_exec(bprm);
1508 if (retval)
1509 goto out_unlock;
1445 1510
1446 file = open_exec(filename); 1511 file = open_exec(filename);
1447 retval = PTR_ERR(file); 1512 retval = PTR_ERR(file);
1448 if (IS_ERR(file)) 1513 if (IS_ERR(file))
1449 goto out_unlock; 1514 goto out_unmark;
1450 1515
1451 sched_exec(); 1516 sched_exec();
1452 1517
@@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename,
1488 goto out; 1553 goto out;
1489 1554
1490 /* execve succeeded */ 1555 /* execve succeeded */
1556 write_lock(&current->fs->lock);
1557 current->fs->in_exec = 0;
1558 write_unlock(&current->fs->lock);
1491 current->in_execve = 0; 1559 current->in_execve = 0;
1492 mutex_unlock(&current->cred_exec_mutex); 1560 mutex_unlock(&current->cred_exec_mutex);
1493 acct_update_integrals(current); 1561 acct_update_integrals(current);
@@ -1506,6 +1574,11 @@ out_file:
1506 fput(bprm->file); 1574 fput(bprm->file);
1507 } 1575 }
1508 1576
1577out_unmark:
1578 write_lock(&current->fs->lock);
1579 current->fs->in_exec = 0;
1580 write_unlock(&current->fs->lock);
1581
1509out_unlock: 1582out_unlock:
1510 current->in_execve = 0; 1583 current->in_execve = 0;
1511 mutex_unlock(&current->cred_exec_mutex); 1584 mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93b..3e87ce443ea2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
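The maxblock computation in cramfs_readpage() is the usual round-up division: add PAGE_CACHE_SIZE - 1 before shifting so a partial trailing page still counts as a page. Worked through for a small file (4 KiB pages assumed here):

#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 10000;	/* example file size */
	unsigned long maxblock = (unsigned long)((i_size + 4096 - 1) >> 12);

	printf("%llu bytes -> %lu pages\n", i_size, maxblock);	/* 3 */
	return 0;
}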
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..761d30be2683 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
30 30
31static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 32static int debugfs_mount_count;
33static bool debugfs_registered;
33 34
34static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
35{ 36{
@@ -496,6 +497,16 @@ exit:
496} 497}
497EXPORT_SYMBOL_GPL(debugfs_rename); 498EXPORT_SYMBOL_GPL(debugfs_rename);
498 499
500/**
501 * debugfs_initialized - Tells whether debugfs has been registered
502 */
503bool debugfs_initialized(void)
504{
505 return debugfs_registered;
506}
507EXPORT_SYMBOL_GPL(debugfs_initialized);
508
509
499static struct kobject *debug_kobj; 510static struct kobject *debug_kobj;
500 511
501static int __init debugfs_init(void) 512static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
509 retval = register_filesystem(&debug_fs_type); 520 retval = register_filesystem(&debug_fs_type);
510 if (retval) 521 if (retval)
511 kobject_put(debug_kobj); 522 kobject_put(debug_kobj);
523 else
524 debugfs_registered = true;
525
512 return retval; 526 return retval;
513} 527}
514 528
515static void __exit debugfs_exit(void) 529static void __exit debugfs_exit(void)
516{ 530{
531 debugfs_registered = false;
532
517 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 533 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
518 unregister_filesystem(&debug_fs_type); 534 unregister_filesystem(&debug_fs_type);
519 kobject_put(debug_kobj); 535 kobject_put(debug_kobj);
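debugfs_initialized(), added above, lets other kernel code ask whether debugfs ever registered before trying to create entries in it. A hedged sketch of a consumer module (the "mydrv" names are illustrative, not from this patch):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/err.h>

static struct dentry *mydrv_dir;	/* illustrative */

static int __init mydrv_init(void)
{
	if (!debugfs_initialized())
		return 0;		/* run without debug instrumentation */

	mydrv_dir = debugfs_create_dir("mydrv", NULL);
	if (IS_ERR(mydrv_dir))		/* e.g. -ENODEV */
		mydrv_dir = NULL;
	return 0;
}

static void __exit mydrv_exit(void)
{
	debugfs_remove(mydrv_dir);	/* NULL-safe */
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");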
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1126 int acquire_i_mutex = 0;
1127 1127
1128 if (rw & WRITE) 1128 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1129 rw = WRITE_ODIRECT;
1130 1130
1131 if (bdev) 1131 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612cf..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
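This efs hunk and the cramfs hunk earlier fill f_fsid identically: huge_encode_dev() packs the backing device number into a u64, which is then split across the two 32-bit val slots. The split-and-recombine arithmetic in isolation (the id value is an arbitrary example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000002100000040ULL;	/* example encoded dev_t */
	uint32_t val[2];

	val[0] = (uint32_t)id;			/* low 32 bits */
	val[1] = (uint32_t)(id >> 32);		/* high 32 bits */

	/* recombining proves nothing was lost */
	uint64_t back = ((uint64_t)val[1] << 32) | val[0];
	printf("round trip ok: %s\n", back == id ? "yes" : "no");
	return 0;
}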
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..052a961e41aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h> 55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
56 57
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
@@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds);
1056 * - the caller must hold current->cred_exec_mutex to protect against 1057 * - the caller must hold current->cred_exec_mutex to protect against
1057 * PTRACE_ATTACH 1058 * PTRACE_ATTACH
1058 */ 1059 */
1059void check_unsafe_exec(struct linux_binprm *bprm) 1060int check_unsafe_exec(struct linux_binprm *bprm)
1060{ 1061{
1061 struct task_struct *p = current, *t; 1062 struct task_struct *p = current, *t;
1062 unsigned long flags; 1063 unsigned long flags;
1063 unsigned n_fs, n_sighand; 1064 unsigned n_fs;
1065 int res = 0;
1064 1066
1065 bprm->unsafe = tracehook_unsafe_exec(p); 1067 bprm->unsafe = tracehook_unsafe_exec(p);
1066 1068
1067 n_fs = 1; 1069 n_fs = 1;
1068 n_sighand = 1; 1070 write_lock(&p->fs->lock);
1069 lock_task_sighand(p, &flags); 1071 lock_task_sighand(p, &flags);
1070 for (t = next_thread(p); t != p; t = next_thread(t)) { 1072 for (t = next_thread(p); t != p; t = next_thread(t)) {
1071 if (t->fs == p->fs) 1073 if (t->fs == p->fs)
1072 n_fs++; 1074 n_fs++;
1073 n_sighand++;
1074 } 1075 }
1075 1076
1076 if (atomic_read(&p->fs->count) > n_fs || 1077 if (p->fs->users > n_fs) {
1077 atomic_read(&p->sighand->count) > n_sighand)
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1078 bprm->unsafe |= LSM_UNSAFE_SHARE;
1079 } else {
1080 if (p->fs->in_exec)
1081 res = -EAGAIN;
1082 p->fs->in_exec = 1;
1083 }
1079 1084
1080 unlock_task_sighand(p, &flags); 1085 unlock_task_sighand(p, &flags);
1086 write_unlock(&p->fs->lock);
1087
1088 return res;
1081} 1089}
1082 1090
1083/* 1091/*
@@ -1296,12 +1304,15 @@ int do_execve(char * filename,
1296 bprm->cred = prepare_exec_creds(); 1304 bprm->cred = prepare_exec_creds();
1297 if (!bprm->cred) 1305 if (!bprm->cred)
1298 goto out_unlock; 1306 goto out_unlock;
1299 check_unsafe_exec(bprm); 1307
1308 retval = check_unsafe_exec(bprm);
1309 if (retval)
1310 goto out_unlock;
1300 1311
1301 file = open_exec(filename); 1312 file = open_exec(filename);
1302 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
1303 if (IS_ERR(file)) 1314 if (IS_ERR(file))
1304 goto out_unlock; 1315 goto out_unmark;
1305 1316
1306 sched_exec(); 1317 sched_exec();
1307 1318
@@ -1344,6 +1355,9 @@ int do_execve(char * filename,
1344 goto out; 1355 goto out;
1345 1356
1346 /* execve succeeded */ 1357 /* execve succeeded */
1358 write_lock(&current->fs->lock);
1359 current->fs->in_exec = 0;
1360 write_unlock(&current->fs->lock);
1347 current->in_execve = 0; 1361 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1362 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1363 acct_update_integrals(current);
@@ -1362,6 +1376,11 @@ out_file:
1362 fput(bprm->file); 1376 fput(bprm->file);
1363 } 1377 }
1364 1378
1379out_unmark:
1380 write_lock(&current->fs->lock);
1381 current->fs->in_exec = 0;
1382 write_unlock(&current->fs->lock);
1383
1365out_unlock: 1384out_unlock:
1366 current->in_execve = 0; 1385 current->in_execve = 0;
1367 mutex_unlock(&current->cred_exec_mutex); 1386 mutex_unlock(&current->cred_exec_mutex);
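The rewritten check_unsafe_exec() counts the caller's own threads that share its fs_struct (n_fs) and compares that with the structure's total user count: fs->users > n_fs means some task outside the thread group also holds the fs, so LSM_UNSAFE_SHARE is set; otherwise the fs is private to the group and can be marked in_exec to fence off a concurrent CLONE_FS. The comparison reduced to a stand-alone model (simplified stand-ins, not kernel code):

#include <stdio.h>

/* threads_sharing_fs stands for the next_thread() walk above;
 * users stands for fs_struct.users */
static int fs_shared_outside_group(int threads_sharing_fs, int users)
{
	return users > threads_sharing_fs;
}

int main(void)
{
	/* 3 group threads use this fs but 4 references exist: something
	 * outside the group (e.g. a CLONE_FS child) shares it -> unsafe */
	printf("unsafe: %d\n", fs_shared_outside_group(3, 4));	/* 1 */

	/* all references accounted for by the group: safe to mark in_exec */
	printf("unsafe: %d\n", fs_shared_outside_group(3, 3));	/* 0 */
	return 0;
}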
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2  were written, but writing the inode attributes failed. Then if the filesystem is
3  unmounted and remounted, the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - Gets included from the Kernels Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5	  EXOFS is a file system that uses an OSD storage device
6	  as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* umm...file type */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
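EXOFS_DIR_REC_LEN rounds the fixed entry header plus the name up to the next 4-byte boundary; the header is offsetof(struct exofs_dir_entry, name) = 8 + 2 + 1 + 1 = 12 bytes (the hard-coded 12 below assumes no padding, which holds for this layout). A few values worked through:

#include <stdio.h>

/* (name_len + 12 + 3) & ~3, with 12 = offsetof(exofs_dir_entry, name) */
#define REC_LEN(name_len) (((name_len) + 12 + 3) & ~3)

int main(void)
{
	printf("name_len 1   -> rec_len %d\n", REC_LEN(1));	/* 16 */
	printf("name_len 4   -> rec_len %d\n", REC_LEN(4));	/* 16 */
	printf("name_len 5   -> rec_len %d\n", REC_LEN(5));	/* 20 */
	printf("name_len 255 -> rec_len %d\n", REC_LEN(255));	/* 268 */
	return 0;
}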
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156		  "entry in directory #%lu spans the page boundary, "
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
425		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477					  "zero-length directory entry\n");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p\n", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
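	/* A live entry with trailing slack is split: the existing entry is
	 * trimmed to the record length its own name needs (name_len here),
	 * and the new entry is carved out of the remainder, e.g. a 64-byte
	 * record whose name needs 16 bytes leaves 48 for the new entry.
	 */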
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
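	/* Entries only chain forward, so scan the chunk from its start to
	 * find the entry (if any) preceding the victim; deletion merges
	 * the victim into that predecessor by extending its rec_len.
	 */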
544 while (de < dir) {
545 if (de->rec_len == 0) {
546			EXOFS_ERR("ERROR: exofs_delete_entry: "
547				  "zero-length directory entry\n");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
561		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
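	/* Lay out "." and ".." so that together they span the whole chunk:
	 * "." gets a minimal record and ".." gets all remaining bytes.
	 */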
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639				EXOFS_ERR("ERROR: exofs_empty_dir: "
640					  "zero-length directory entry, "
641					  "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
53/* u64 is problematic in printk; this casts it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76	uint32_t  i_data[EXOFS_IDATA];/* short symlink names and device #s */
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED	0	/* object will be created soon */
88#define OBJ_CREATED	1	/* object has been created on the osd */
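/* Object creation is asynchronous: OBJ_2BCREATED is set when the create is
 * queued and OBJ_CREATED is set by the completion callback, which also wakes
 * sleepers on i_wq (see wait_obj_created() below).
 */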
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
55	/* Note: file_fsync below also calls sync_blockdev, which is a no-op
56	 * for exofs, but other than that it does sync_inode and
57	 * sync_superblock, which is what we need here.
58	 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
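/* A page_collect gathers contiguous pages into one bio so a single OSD
 * request can cover the whole run; on a discontinuity, or when the bio
 * fills up, the collection is submitted and restarted.
 */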
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
55	loff_t pg_first; /* keep it 64-bit even on 32-bit arches */
56};
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
89	/* This is usually the end of the loop, but for writes it might not
90	 * end here; don't be left with nothing to collect.
91	 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
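	/* bio_alloc() can fail for large page counts under memory
	 * pressure, so retry with progressively smaller bios.
	 */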
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
106	EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
110
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
141 * locked */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
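	/* resid is how many bytes the OSD did not transfer; everything in
	 * the bio before good_bytes holds valid data.
	 */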
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* pages ownership was passed to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
307 * collect as many contiguous pages as possible. If a discontinuity is
308 * encountered, or it runs out of resources, it will submit the previous segment
309 * and will start a new collection. Eventually caller must submit the last
310 * segment if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* this will be out of bounds, or doesn't exist yet.
336 * Current page is cleared and the request is split
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
422	/* readpage_strip might call read_exec(, async) internally, but for
423	 * a single page those calls see an empty collection and do nothing;
424	 * the only real submission is the read_exec(&pcol, is_sync) below.
425	 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511		EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528		EXOFS_ERR("write_exec: exofs_async_op() Failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* pages ownership was passed to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually caller must submit the last segment if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623				EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
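	/* expected_pages only sizes the first bio; 32 is an arbitrary cap
	 * for ranged writeback, and _pcol_reset() falls back to 128 once
	 * the estimate runs out.
	 */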
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702			EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709	/* read-modify-write: a partial write needs the page read in first */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713			/* SetPageError was done by _readpage. Is it OK? */
714			unlock_page(page);
715			EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case of create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if we got compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035			  _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
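	/* Inodes map 1:1 to OSD objects: the object id is the inode number
	 * offset by EXOFS_OBJ_OFF, keeping clear of reserved object ids.
	 */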
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292		EXOFS_ERR(
1293			"ERROR: exofs_delete_inode: exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
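
The directory operations above are a close transliteration of ext2's: each create-style entry point allocates an inode, wires up the operation vectors, and hands off to exofs_add_nondir(), which drops the link count and the inode reference if the directory entry cannot be written. One user-visible consequence of the EXOFS_LINK_MAX checks in exofs_link() and exofs_mkdir() is that the limit surfaces as EMLINK; a hypothetical userspace sketch (paths invented for illustration):

	/* Hypothetical: hitting EXOFS_LINK_MAX is reported as EMLINK. */
	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		if (link("/mnt/exofs/a", "/mnt/exofs/b") != 0 && errno == EMLINK)
			fprintf(stderr, "hard-link limit reached\n");
		return 0;
	}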
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41			else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51	/* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
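
Taken together these helpers define exofs' synchronous I/O idiom: start an OSD request against the device, encode the operation, then finalize with the object credential and execute. A minimal sketch of how a caller composes them for a one-shot object read (the helper name is invented; exofs_fill_super() in super.c below follows exactly this shape):

	/* Sketch only: synchronous read of one object into a kernel buffer. */
	static int exofs_read_obj_sync(struct exofs_sb_info *sbi,
				       const struct osd_obj_id *obj,
				       void *buf, u64 len)
	{
		struct osd_request *or;
		int ret;

		or = osd_start_request(sbi->s_dev, GFP_KERNEL);
		if (unlikely(!or))
			return -ENOMEM;

		/* queue the read, then finalize + execute with the credential */
		ret = osd_req_read_kern(or, obj, 0, buf, len);
		if (!ret)
			ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);

		osd_end_request(or);
		return ret;
	}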
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
64 * kernel's parsing functions do not currently support that. 32-bit should be
65 * sufficient for most applications now.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103 EXOFS_ERR("Partition ID must be >= %u",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107			s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113				EXOFS_ERR("Timeout must be > 0");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283	struct exofs_sb_info *sbi;	/* extended info */
284	struct exofs_fscb fscb;		/* on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}
416
417/*
418 * Return information about the file system state in the buffer. This is used
419 * by the 'df' command, for example.
420 */
421static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
422{
423 struct super_block *sb = dentry->d_sb;
424 struct exofs_sb_info *sbi = sb->s_fs_info;
425 struct osd_obj_id obj = {sbi->s_pid, 0};
426 struct osd_attr attrs[] = {
427 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
428 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
429 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
430 OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
431 };
432 uint64_t capacity = ULLONG_MAX;
433 uint64_t used = ULLONG_MAX;
434 struct osd_request *or;
435 uint8_t cred_a[OSD_CAP_LEN];
436 int ret;
437
438 /* get used/capacity attributes */
439 exofs_make_credential(cred_a, &obj);
440
441 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
442 if (unlikely(!or)) {
443 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
444 return -ENOMEM;
445 }
446
447 osd_req_get_attributes(or, &obj);
448 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
449 ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
450 if (unlikely(ret))
451 goto out;
452
453 ret = extract_attr_from_req(or, &attrs[0]);
454 if (likely(!ret))
455 capacity = get_unaligned_be64(attrs[0].val_ptr);
456 else
457 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
458
459 ret = extract_attr_from_req(or, &attrs[1]);
460 if (likely(!ret))
461 used = get_unaligned_be64(attrs[1].val_ptr);
462 else
463 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
464
465 /* fill in the stats buffer */
466 buf->f_type = EXOFS_SUPER_MAGIC;
467 buf->f_bsize = EXOFS_BLKSIZE;
468 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
469 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
470 buf->f_bavail = buf->f_bfree;
471 buf->f_files = sbi->s_numfiles;
472 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
473 buf->f_namelen = EXOFS_NAME_LEN;
474
475out:
476 osd_end_request(or);
477 return ret;
478}
479
480static const struct super_operations exofs_sops = {
481 .alloc_inode = exofs_alloc_inode,
482 .destroy_inode = exofs_destroy_inode,
483 .write_inode = exofs_write_inode,
484 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super,
486 .write_super = exofs_write_super,
487 .statfs = exofs_statfs,
488};
489
490/******************************************************************************
491 * EXPORT OPERATIONS
492 *****************************************************************************/
493
494struct dentry *exofs_get_parent(struct dentry *child)
495{
496 unsigned long ino = exofs_parent_ino(child);
497
498 if (!ino)
499 return NULL;
500
501 return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
502}
503
504static struct inode *exofs_nfs_get_inode(struct super_block *sb,
505 u64 ino, u32 generation)
506{
507 struct inode *inode;
508
509 inode = exofs_iget(sb, ino);
510 if (IS_ERR(inode))
511 return ERR_CAST(inode);
512 if (generation && inode->i_generation != generation) {
513 /* we didn't find the right inode.. */
514 iput(inode);
515 return ERR_PTR(-ESTALE);
516 }
517 return inode;
518}
519
520static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
521 struct fid *fid, int fh_len, int fh_type)
522{
523 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
524 exofs_nfs_get_inode);
525}
526
527static struct dentry *exofs_fh_to_parent(struct super_block *sb,
528 struct fid *fid, int fh_len, int fh_type)
529{
530 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
531 exofs_nfs_get_inode);
532}
533
534static const struct export_operations exofs_export_ops = {
535 .fh_to_dentry = exofs_fh_to_dentry,
536 .fh_to_parent = exofs_fh_to_parent,
537 .get_parent = exofs_get_parent,
538};
539
540/******************************************************************************
541 * INSMOD/RMMOD
542 *****************************************************************************/
543
544/*
545 * struct that describes this file system
546 */
547static struct file_system_type exofs_type = {
548 .owner = THIS_MODULE,
549 .name = "exofs",
550 .get_sb = exofs_get_sb,
551 .kill_sb = generic_shutdown_super,
552};
553
554static int __init init_exofs(void)
555{
556 int err;
557
558 err = init_inodecache();
559 if (err)
560 goto out;
561
562 err = register_filesystem(&exofs_type);
563 if (err)
564 goto out_d;
565
566 return 0;
567out_d:
568 destroy_inodecache();
569out:
570 return err;
571}
572
573static void __exit exit_exofs(void)
574{
575 unregister_filesystem(&exofs_type);
576 destroy_inodecache();
577}
578
579MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
580MODULE_DESCRIPTION("exofs");
581MODULE_LICENSE("GPL");
582
583module_init(init_exofs)
584module_exit(exit_exofs)
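
Per parse_options() above, pid= is the only mandatory option (it must be at least EXOFS_MIN_PID) and to= overrides the default request timeout in seconds. The dev_name handed to exofs_get_sb() is resolved through osduld_path_lookup(), i.e. it names an OSD ULD character device. A hypothetical mount from userspace (device path and partition id are examples only):

	/* Hypothetical userspace sketch: mount exofs on OSD partition 65536. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		if (mount("/dev/osd0", "/mnt/exofs", "exofs", 0,
			  "pid=65536,to=30") != 0)
			perror("mount exofs");
		return 0;
	}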
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/namei.h>
37
38#include "exofs.h"
39
40static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
41{
42 struct exofs_i_info *oi = exofs_i(dentry->d_inode);
43
44 nd_set_link(nd, (char *)oi->i_data);
45 return NULL;
46}
47
48const struct inode_operations exofs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52};
53
54const struct inode_operations exofs_fast_symlink_inode_operations = {
55 .readlink = generic_readlink,
56 .follow_link = exofs_follow_link,
57};
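
As in ext2, the target of a "fast" symlink lives in the inode's own data area and is handed straight to nd_set_link() by exofs_follow_link(), while longer targets are stored like file data and resolved through page_follow_link_light(). The distinction is invisible to userspace; a hypothetical check:

	/* Hypothetical: fast and slow exofs symlinks resolve identically. */
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];
		ssize_t n = readlink("/mnt/exofs/some_link", buf, sizeof(buf) - 1);

		if (n >= 0) {
			buf[n] = '\0';
			printf("-> %s\n", buf);
		}
		return 0;
	}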
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
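
This is one instance of a substitution repeated in ext3, ext4 and fat below: open-coded reads of current->fs->umask become the new current_umask() helper, keeping umask access behind a single accessor. To a first approximation the helper is just the old expression wrapped in a function (paraphrased, not part of this diff):

	/* Rough shape of the new accessor (paraphrased): */
	static inline int current_umask(void)
	{
		return current->fs->umask;
	}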
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem did not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 historically
+	  defaulted to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff169870..466a332e0bd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size. Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes. So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
 	return !buffer_mapped(bh);
 }
+
 /*
  * Note that we always start a transaction even if we're not journalling
  * data. This is to preserve ordering: any hole instantiation within
@@ -2354,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
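
The three write_end variants now share one recovery protocol for short copies: account only the bytes actually copied via update_file_sizes(), put the inode on the orphan list while allocated blocks may extend past i_size, and trim the excess only after the handle is closed (vmtruncate() may itself need a transaction, so it must not run under the live handle). Schematically, using only functions from the hunks above:

	/* Shared epilogue of the ordered/writeback write_end paths above. */
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	update_file_sizes(inode, pos, copied);
	if (pos + len > inode->i_size)
		ext3_orphan_add(handle, inode);	/* crash-safe until truncated */
	ret = ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);
	if (pos + len > inode->i_size)
		vmtruncate(inode, inode->i_size);	/* trim past-EOF blocks */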
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
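
This is the standard BKL-removal shape: the handler moves from the old .ioctl slot (entered with the big kernel lock held and the inode passed in) to .unlocked_ioctl, deriving the inode from the file itself and serializing flag updates on i_mutex alone; the error paths are reworked so the mutex is dropped exactly once at flags_out. The general conversion pattern, as an illustration (foo_ioctl is a made-up name):

	/* Old form: int foo_ioctl(struct inode *inode, struct file *filp,
	 *                        unsigned int cmd, unsigned long arg);  BKL held.
	 * New form: */
	long foo_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
	{
		struct inode *inode = filp->f_dentry->d_inode;

		/* ...per-command work, taking only the locks it needs... */
		return 0;
	}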
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8b..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 			       struct dx_frame *frame,
 			       int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+					   "deleted inode referenced: %lu",
+					   ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2401,6 +2410,8 @@
 			ext3_mark_inode_dirty(handle, new_inode);
 			if (!new_inode->i_nlink)
 				ext3_orphan_add(handle, new_inode);
+			if (ext3_should_writeback_data(new_inode))
+				flush_file = 1;
 		}
 		retval = 0;
 
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
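
The single call site can pick up either mode because ext3's set_opt() macro token-pastes its argument onto the EXT3_MOUNT_ prefix; roughly (paraphrased from ext3's headers, not part of this diff):

	#define set_opt(o, opt)		o |= EXT3_MOUNT_##opt

	/* So set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE) expands to
	 * sbi->s_mount_opt |= EXT3_MOUNT_DEFAULT_DATA_MODE, which the
	 * #ifdef above resolves to either EXT3_MOUNT_ORDERED_DATA or
	 * EXT3_MOUNT_WRITEBACK_DATA. */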
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e00..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
 933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = opts->fs_dmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
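The two f_fsid lines added above pack the 64-bit value from huge_encode_dev() into statfs's pair of 32-bit words. The same packing, runnable in userspace (the id value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000001200000034ULL;	/* hypothetical encoded dev_t */
	uint32_t val0 = (uint32_t)id;		/* low 32 bits  -> f_fsid.val[0] */
	uint32_t val1 = (uint32_t)(id >> 32);	/* high 32 bits -> f_fsid.val[1] */

	printf("val[0]=%#x val[1]=%#x\n", val0, val1);
	return 0;
}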
diff --git a/fs/file_table.c b/fs/file_table.c
index b74a8e1da913..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
169 fmode_t mode, const struct file_operations *fop) 169 fmode_t mode, const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 struct path;
173 172
174 file = get_empty_filp(); 173 file = get_empty_filp();
175 if (!file) 174 if (!file)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faaf..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
196 struct inode *tail_inode; 196 struct inode *tail_inode;
197 197
198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
199 if (!time_after_eq(inode->dirtied_when, 199 if (time_before(inode->dirtied_when,
200 tail_inode->dirtied_when)) 200 tail_inode->dirtied_when))
201 inode->dirtied_when = jiffies; 201 inode->dirtied_when = jiffies;
202 } 202 }
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
220 wake_up_bit(&inode->i_state, __I_SYNC); 220 wake_up_bit(&inode->i_state, __I_SYNC);
221} 221}
222 222
223static bool inode_dirtied_after(struct inode *inode, unsigned long t)
224{
225 bool ret = time_after(inode->dirtied_when, t);
226#ifndef CONFIG_64BIT
227 /*
228 * For inodes being constantly redirtied, dirtied_when can get stuck.
229 * It _appears_ to be in the future, but is actually in distant past.
230 * This test is necessary to prevent such wrapped-around relative times
231 * from permanently stopping the whole pdflush writeback.
232 */
233 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
234#endif
235 return ret;
236}
237
223/* 238/*
224 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 239 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
225 */ 240 */
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
231 struct inode *inode = list_entry(delaying_queue->prev, 246 struct inode *inode = list_entry(delaying_queue->prev,
232 struct inode, i_list); 247 struct inode, i_list);
233 if (older_than_this && 248 if (older_than_this &&
234 time_after(inode->dirtied_when, *older_than_this)) 249 inode_dirtied_after(inode, *older_than_this))
235 break; 250 break;
236 list_move(&inode->i_list, dispatch_queue); 251 list_move(&inode->i_list, dispatch_queue);
237 } 252 }
@@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
420 * If older_than_this is non-NULL, then only write out inodes which 435 * If older_than_this is non-NULL, then only write out inodes which
421 * had their first dirtying at a time earlier than *older_than_this. 436 * had their first dirtying at a time earlier than *older_than_this.
422 * 437 *
423 * If we're a pdlfush thread, then implement pdflush collision avoidance 438 * If we're a pdflush thread, then implement pdflush collision avoidance
424 * against the entire list. 439 * against the entire list.
425 * 440 *
426 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 441 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
492 continue; /* blockdev has wrong queue */ 507 continue; /* blockdev has wrong queue */
493 } 508 }
494 509
495 /* Was this inode dirtied after sync_sb_inodes was called? */ 510 /*
496 if (time_after(inode->dirtied_when, start)) 511 * Was this inode dirtied after sync_sb_inodes was called?
512 * This keeps sync from extra jobs and livelock.
513 */
514 if (inode_dirtied_after(inode, start))
497 break; 515 break;
498 516
499 /* Is another pdflush already flushing this queue? */ 517 /* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 556 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
539 struct address_space *mapping; 557 struct address_space *mapping;
540 558
541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 559 if (inode->i_state &
560 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
542 continue; 561 continue;
543 mapping = inode->i_mapping; 562 mapping = inode->i_mapping;
544 if (mapping->nrpages == 0) 563 if (mapping->nrpages == 0)
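inode_dirtied_after() above exists because 32-bit jiffies wrap (roughly every 49.7 days at HZ=1000), so a long-stale dirtied_when can satisfy time_after() purely through wraparound and appear to be in the future. A runnable userspace restatement of the two comparisons, with made-up timestamp values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 32-bit equivalents of the kernel's time_after()/time_before_eq() */
static bool time_after32(uint32_t a, uint32_t b)     { return (int32_t)(b - a) < 0; }
static bool time_before_eq32(uint32_t a, uint32_t b) { return (int32_t)(b - a) >= 0; }

static bool inode_dirtied_after32(uint32_t dirtied_when, uint32_t t, uint32_t now)
{
	/* reject stamps that only look future because the counter wrapped */
	return time_after32(dirtied_when, t) && time_before_eq32(dirtied_when, now);
}

int main(void)
{
	uint32_t now = 1000;
	uint32_t stale = now + 0x40000000;	/* wrapped: far "ahead", actually far behind */

	printf("naive  : %d\n", time_after32(stale, 900));		 /* 1 - looks future */
	printf("guarded: %d\n", inode_dirtied_after32(stale, 900, now)); /* 0 */
	return 0;
}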
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103 /* We don't need to lock fs - think why ;-) */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
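current_umask() above replaces open-coded current->fs->umask dereferences (the ext4 and fat hunks earlier in this diff are converted the same way). What callers do with the value is plain mode arithmetic; a runnable sketch, using the 0022 default that init_fs sets above:

#include <stdio.h>

int main(void)
{
	unsigned int requested = 0666;
	unsigned int umask_val = 0022;	/* stand-in for current_umask() */
	unsigned int effective = requested & ~umask_val;

	printf("%04o & ~%04o -> %04o\n", requested, umask_val, effective);
	return 0;	/* prints 0666 & ~0022 -> 0644 */
}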
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
 24 execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
 54 enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
47 /* return a dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs's to use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290 _leave(" = -EEXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
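For a backend, the two exported setup calls above combine into a registration sequence along these lines. This is only a sketch against the API as added here: my_cache, my_cache_ops and the fsdef object are hypothetical placeholders that a real backend (such as CacheFiles, added elsewhere in this merge) supplies itself:

#include <linux/fscache-cache.h>

static struct fscache_cache my_cache;

static int my_backend_register(struct fscache_object *my_fsdef,
			       const struct fscache_cache_ops *my_cache_ops)
{
	int ret;

	/* install the ops and build the "mycache-disk0" identifier */
	fscache_init_cache(&my_cache, my_cache_ops, "mycache-%s", "disk0");

	/* passing a NULL tagname would fall back to that identifier */
	ret = fscache_add_cache(&my_cache, my_fsdef, "mytag");
	if (ret < 0)
		return ret;	/* -EEXIST (tag taken), -EINVAL or -ENOMEM */
	return 0;
}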
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
 31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154 /* now we need to see whether the backing objects for this cookie yet
155 * exist, if not there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
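On the netfs side, __fscache_acquire_cookie() is normally reached through the fscache_acquire_cookie() wrapper in linux/fscache.h. A hypothetical caller, honouring the rule stated above that errors are never reported to the netfs, so a NULL cookie is always a legal result:

#include <linux/fscache.h>

/* my_file_def and my_inode are placeholders for a real netfs's
 * cookie definition and its per-inode data */
static struct fscache_cookie *my_cache_inode(struct fscache_cookie *parent_index,
					     const struct fscache_cookie_def *my_file_def,
					     void *my_inode)
{
	/* may return NULL (no caches, no memory): caching just stays off */
	return fscache_acquire_cookie(parent_index, my_file_def, my_inode);
}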
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
99 * structure version number of the netfs for which this version is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64 if ((unsigned long long)*_pos >= HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
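Each row of the histogram above is one jiffy-wide bucket; fscache_histogram_show() converts the bucket index to a fraction of a second with (index * 1000) / HZ and prints it via the 0.%03u format. The same conversion in plain C (the HZ value is just for illustration):

#include <stdio.h>

#define HZ 250	/* illustrative tick rate; the kernel uses its configured HZ */

int main(void)
{
	for (unsigned long index = 0; index < 5; index++) {
		unsigned int t = (index * 1000) / HZ;	/* milliseconds */
		printf("%4lu 0.%03u\n", index, t);
	}
	return 0;	/* 0.000, 0.004, 0.008, ... at HZ=250 */
}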
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..e0cbd16f6dc9
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
 31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
 40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
 48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
 54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
76
77/*
 78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
 89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
 96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
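The three-stage __do_kdebug() indirection above exists because FSCACHE_DEBUG_LEVEL is defined as a bare shorthand (CACHE, COOKIE, ...) and cpp must expand that shorthand before pasting it onto the FSCACHE_DEBUG_ prefix. A minimal stand-alone sketch of the same trick, compilable in user space; the bit values are copied from the header above, everything else is illustrative:

#include <stdio.h>

#define FSCACHE_DEBUG_CACHE	0
#define FSCACHE_DEBUG_COOKIE	1
#define FSCACHE_POINT_ENTER	1
#define FSCACHE_POINT_LEAVE	2

static unsigned fscache_debug;

/* without the middle macro, the ## paste would see the literal token
 * "FSCACHE_DEBUG_LEVEL" rather than its expansion "COOKIE" */
#define ____do_kdebug(LEVEL, POINT) \
	(fscache_debug & \
	 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))
#define ___do_kdebug(LEVEL, POINT) ____do_kdebug(LEVEL, POINT)
#define __do_kdebug(POINT) ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)

#define FSCACHE_DEBUG_LEVEL COOKIE

int main(void)
{
	/* enable only the ENTER point for the COOKIE level */
	fscache_debug = FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3);
	printf("ENTER logged: %d\n", !!__do_kdebug(ENTER));	/* 1 */
	printf("LEAVE logged: %d\n", !!__do_kdebug(LEAVE));	/* 0 */
	return 0;
}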
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
27MODULE_PARM_DESC(fscache_defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
33MODULE_PARM_DESC(fscache_defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
39MODULE_PARM_DESC(fscache_debug,
40 "FS-Cache debugging mask");
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70
71 ret = -ENOMEM;	/* otherwise a kobject failure would return 0 */
72 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
73 if (!fscache_root)
74 goto error_kobj;
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
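fscache_wait_bit() and fscache_wait_bit_interruptible() are the action callbacks handed to wait_on_bit() elsewhere in this patch set (see fscache_wait_for_deferred_lookup() in page.c below). A sketch of the waiter/waker pairing they take part in; this is a kernel-side fragment, not a complete compilation unit, and MY_BIT and word are illustrative names:

/* waiter side: sleeps in schedule() via the interruptible sleeper
 * until MY_BIT clears, backing out if a signal arrives */
if (wait_on_bit(&word, MY_BIT,
		fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0)
	return -ERESTARTSYS;

/* waker side: clear the bit with full ordering, then wake the bit
 * waitqueue that wait_on_bit() is sleeping on */
smp_mb__before_clear_bit();
clear_bit(MY_BIT, &word);
smp_mb__after_clear_bit();
wake_up_bit(&word, MY_BIT);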
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
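For context, the netfs side of the registration above looks roughly like this in this era's API; the struct fscache_netfs fields are those used by the code above, while "examplefs" and the example function names are illustrative:

/* Sketch of a netfs registering itself, via the fscache_register_netfs()
 * wrapper around __fscache_register_netfs() ("examplefs" is illustrative).
 */
static struct fscache_netfs examplefs_cache_netfs = {
	.name		= "examplefs",
	.version	= 0,	/* bumping this invalidates old cache data */
};

static int __init examplefs_init(void)
{
	/* on success, ->primary_index is the cookie under which all of
	 * the netfs's other index and data cookies are acquired */
	return fscache_register_netfs(&examplefs_cache_netfs);
}

static void __exit examplefs_exit(void)
{
	/* all other cookies must have been relinquished first */
	fscache_unregister_netfs(&examplefs_cache_netfs);
}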
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
55 * we need to notify the parent when an op that we had outstanding upon
56 * it completes
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
110 * operations and queue dependent operations for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377 * add ourselves to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
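The two exported helpers above are intended to be called from a cache backend's lookup path. A hedged sketch of that contract as backend-side pseudocode; the example_* names are illustrative, and the shape loosely follows what the CacheFiles backend elsewhere in this merge does:

/* Sketch of a cache backend's ->lookup_object() driving the helpers
 * above (example_* names are illustrative).
 */
static void example_lookup_object(struct fscache_object *object)
{
	if (!example_find_on_disk(object)) {
		/* nothing on disk: lets waiting readers fail with
		 * -ENODATA and lets writes start stacking up while the
		 * object transits to FSCACHE_OBJECT_CREATING */
		fscache_object_lookup_negative(object);
		example_create_on_disk(object);
	}

	/* lookup and/or creation complete: wakes waiters and moves the
	 * object towards FSCACHE_OBJECT_AVAILABLE */
	fscache_obtained_object(object);
}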
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
658 * - need to break the links to a cached object cookie
659 * - called under two situations:
660 * (1) recycler decides to reclaim an in-use object
661 * (2) a cache is unmounted
662 * - have to take care as the cookie may be relinquished by the netfs at
663 * the same time
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
705 object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the object's lock is taken here, so the caller must not already hold it
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
775 * This function consults the netfs about the coherency state of an object.
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
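A note on the *_transit switches earlier in this file: because the event word is scanned with fls(), the highest-numbered pending, unmasked event wins. A stand-alone demonstration of that selection rule; the bit numbers are illustrative, not the real FSCACHE_OBJECT_EV_* values:

#include <stdio.h>

/* open-coded fls(): index of the highest set bit, 1-based; 0 if none */
static int fls_ul(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

enum { EV_UPDATE = 2, EV_ERROR = 3 };	/* illustrative bit numbers */

int main(void)
{
	unsigned long events = (1UL << EV_UPDATE) | (1UL << EV_ERROR);
	unsigned long mask = ~0UL;

	/* EV_ERROR wins: it occupies the higher bit */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* 3 */

	mask &= ~(1UL << EV_ERROR);	/* mask the error event off */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* 2 */

	events = 0;				/* nothing pending */
	printf("next: %d\n", fls_ul(events & mask) - 1);	/* -1: sleep */
	return 0;
}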
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
54 printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
100 if (object->n_ops > 1) { /* this op is already counted */
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
164 kdebug("%p %p", op->processor, op->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - ops may be submitted only while the object is in one of these states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
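Tying operation.c together: a caller allocates an op, initialises it, submits it against an object (which either runs it at once or pends it), and then drops its own reference. A hedged sketch of that lifecycle, modelled on __fscache_attr_changed() in page.c below; the example_* names are illustrative:

/* Sketch of the operation lifecycle (example_* names are illustrative). */
static void example_processor(struct fscache_operation *op)
{
	/* does the actual work; for FSCACHE_OP_SLOW this runs in the
	 * slow-work thread pool via fscache_op_execute() */
}

static int example_submit(struct fscache_cookie *cookie,
			  struct fscache_object *object)
{
	struct fscache_operation *op;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	fscache_operation_init(op, NULL);	/* usage count starts at 1 */
	fscache_operation_init_slow(op, example_processor);
	op->flags = FSCACHE_OP_SLOW;

	/* the real callers submit under the cookie lock */
	spin_lock(&cookie->lock);
	if (fscache_submit_op(object, op) < 0) {
		spin_unlock(&cookie->lock);
		kfree(op);		/* rejected: no refs were taken */
		return -ENOBUFS;
	}
	spin_unlock(&cookie->lock);

	fscache_put_operation(op);	/* drop the allocation ref */
	return 0;
}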
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
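The netfs-facing trigger for the above is the fscache_attr_changed() wrapper, called after changing attributes that the cache mirrors, i_size in particular. A minimal sketch using the 2.6.30-era inode_setattr() API; the example_* names are illustrative:

/* Sketch of the netfs side of attribute-change notification
 * (example_* names are illustrative).
 */
static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct example_inode *ei = example_inode(dentry->d_inode);
	int ret;

	ret = inode_setattr(dentry->d_inode, attr);
	if (ret == 0 && (attr->ia_valid & ATTR_SIZE))
		/* queues the exclusive FSCACHE_OP_SLOW op built above */
		fscache_attr_changed(ei->cache_cookie);
	return ret;
}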
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
172 /* allocate a retrieval operation and attempt to submit it */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
232int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
233 struct page *page,
234 fscache_rw_complete_t end_io_func,
235 void *context,
236 gfp_t gfp)
237{
238 struct fscache_retrieval *op;
239 struct fscache_object *object;
240 int ret;
241
242 _enter("%p,%p,,,", cookie, page);
243
244 fscache_stat(&fscache_n_retrievals);
245
246 if (hlist_empty(&cookie->backing_objects))
247 goto nobufs;
248
249 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
250 ASSERTCMP(page, !=, NULL);
251
252 if (fscache_wait_for_deferred_lookup(cookie) < 0)
253 return -ERESTARTSYS;
254
255 op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
256 if (!op) {
257 _leave(" = -ENOMEM");
258 return -ENOMEM;
259 }
260
261 spin_lock(&cookie->lock);
262
263 if (hlist_empty(&cookie->backing_objects))
264 goto nobufs_unlock;
265 object = hlist_entry(cookie->backing_objects.first,
266 struct fscache_object, cookie_link);
267
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269
270 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock;
272 spin_unlock(&cookie->lock);
273
274 fscache_stat(&fscache_n_retrieval_ops);
275
276 /* pin the netfs read context in case we need to do the actual netfs
277 * read because we've encountered a cache read failure */
278 fscache_get_context(object->cookie, op->context);
279
280 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
283 _debug(">>> WT");
284 fscache_stat(&fscache_n_retrieval_op_waits);
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
287 _debug("<<< GO");
288 }
289
290 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
292 ret = object->cache->ops->allocate_page(op, page, gfp);
293 if (ret == 0)
294 ret = -ENODATA;
295 } else {
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
297 }
298
299 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS)
302 fscache_stat(&fscache_n_retrievals_intr);
303 else if (ret == -ENODATA)
304 fscache_stat(&fscache_n_retrievals_nodata);
305 else if (ret < 0)
306 fscache_stat(&fscache_n_retrievals_nobufs);
307 else
308 fscache_stat(&fscache_n_retrievals_ok);
309
310 fscache_put_retrieval(op);
311 _leave(" = %d", ret);
312 return ret;
313
314nobufs_unlock:
315 spin_unlock(&cookie->lock);
316 kfree(op);
317nobufs:
318 fscache_stat(&fscache_n_retrievals_nobufs);
319 _leave(" = -ENOBUFS");
320 return -ENOBUFS;
321}
322EXPORT_SYMBOL(__fscache_read_or_alloc_page);
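From the netfs side, the function above is reached through the fscache_read_or_alloc_page() wrapper, typically from ->readpage(). A sketch of the expected handling of each return code documented above; the example_* names are illustrative:

/* Sketch of a netfs ->readpage() using the call above via its public
 * wrapper (example_* names are illustrative).
 */
static int example_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = example_cookie(file->f_mapping->host);
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page,
					 example_read_done, NULL, GFP_KERNEL);
	switch (ret) {
	case 0:
		/* read dispatched: example_read_done() runs on completion */
		return 0;
	case -ENODATA:
		/* a block was allocated but holds no data yet: fetch from
		 * the server, then store it with fscache_write_page() */
	case -ENOBUFS:
		/* no cache available: fall back to the server */
		return example_read_from_server(file, page);
	default:
		return ret;	/* -ENOMEM or -ERESTARTSYS */
	}
}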
323
324/*
324 * read a list of pages from the cache or allocate blocks in which to store
326 * them
327 * - we return:
328 * -ENOMEM - out of memory, some pages may be being read
329 * -ERESTARTSYS - interrupted, some pages may be being read
330 * -ENOBUFS - no backing object or space available in which to cache any
331 * pages not being read
332 * -ENODATA - no data available in the backing object for some or all of
333 * the pages
334 * 0 - dispatched a read on all pages
335 *
336 * end_io_func() will be called for each page read from the cache as it
337 * finishes being read
338 *
339 * any pages for which a read is dispatched will be removed from pages and
340 * nr_pages
341 */
342int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned *nr_pages,
346 fscache_rw_complete_t end_io_func,
347 void *context,
348 gfp_t gfp)
349{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op;
352 struct fscache_object *object;
353 int ret;
354
355 _enter("%p,,%d,,,", cookie, *nr_pages);
356
357 fscache_stat(&fscache_n_retrievals);
358
359 if (hlist_empty(&cookie->backing_objects))
360 goto nobufs;
361
362 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
363 ASSERTCMP(*nr_pages, >, 0);
364 ASSERT(!list_empty(pages));
365
366 if (fscache_wait_for_deferred_lookup(cookie) < 0)
367 return -ERESTARTSYS;
368
369 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op)
371 return -ENOMEM;
372
373 spin_lock(&cookie->lock);
374
375 if (hlist_empty(&cookie->backing_objects))
376 goto nobufs_unlock;
377 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link);
379
380 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock;
382 spin_unlock(&cookie->lock);
383
384 fscache_stat(&fscache_n_retrieval_ops);
385
386 /* pin the netfs read context in case we need to do the actual netfs
387 * read because we've encountered a cache read failure */
388 fscache_get_context(object->cookie, op->context);
389
390 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
393 _debug(">>> WT");
394 fscache_stat(&fscache_n_retrieval_op_waits);
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
397 _debug("<<< GO");
398 }
399
400 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
402 func = object->cache->ops->allocate_pages;
403 else
404 func = object->cache->ops->read_or_alloc_pages;
405 ret = func(op, pages, nr_pages, gfp);
406
407 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS)
410 fscache_stat(&fscache_n_retrievals_intr);
411 else if (ret == -ENODATA)
412 fscache_stat(&fscache_n_retrievals_nodata);
413 else if (ret < 0)
414 fscache_stat(&fscache_n_retrievals_nobufs);
415 else
416 fscache_stat(&fscache_n_retrievals_ok);
417
418 fscache_put_retrieval(op);
419 _leave(" = %d", ret);
420 return ret;
421
422nobufs_unlock:
423 spin_unlock(&cookie->lock);
424 kfree(op);
425nobufs:
426 fscache_stat(&fscache_n_retrievals_nobufs);
427 _leave(" = -ENOBUFS");
428 return -ENOBUFS;
429}
430EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
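
The multi-page variant is used the same way from a netfs's ->readpages(); the key difference, noted in the comment above, is that pages the cache takes are removed from the list and deducted from *nr_pages, so whatever remains must be fetched from the server. A hedged sketch (my_cookie, my_read_done and my_fetch_list_from_server are again hypothetical):

/* Illustrative sketch -- not part of this patch */
static int my_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	int ret;

	ret = fscache_read_or_alloc_pages(my_cookie, mapping, pages,
					  &nr_pages, my_read_done, NULL,
					  GFP_KERNEL);
	switch (ret) {
	case 0:		/* the cache took all of them */
		return 0;
	case -ENODATA:
	case -ENOBUFS:	/* read whatever is left on *pages from the server */
		return my_fetch_list_from_server(mapping, pages, nr_pages);
	default:
		return ret;
	}
}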
431
432/*
433 * allocate a block in the cache on which to store a page
434 * - we return:
435 * -ENOMEM - out of memory, nothing done
436 * -ERESTARTSYS - interrupted
437 * -ENOBUFS - no backing object available in which to cache the block
438 * 0 - block allocated
439 */
440int __fscache_alloc_page(struct fscache_cookie *cookie,
441 struct page *page,
442 gfp_t gfp)
443{
444 struct fscache_retrieval *op;
445 struct fscache_object *object;
446 int ret;
447
448 _enter("%p,%p,,,", cookie, page);
449
450 fscache_stat(&fscache_n_allocs);
451
452 if (hlist_empty(&cookie->backing_objects))
453 goto nobufs;
454
455 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
456 ASSERTCMP(page, !=, NULL);
457
458 if (fscache_wait_for_deferred_lookup(cookie) < 0)
459 return -ERESTARTSYS;
460
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op)
463 return -ENOMEM;
464
465 spin_lock(&cookie->lock);
466
467 if (hlist_empty(&cookie->backing_objects))
468 goto nobufs_unlock;
469 object = hlist_entry(cookie->backing_objects.first,
470 struct fscache_object, cookie_link);
471
472 if (fscache_submit_op(object, &op->op) < 0)
473 goto nobufs_unlock;
474 spin_unlock(&cookie->lock);
475
476 fscache_stat(&fscache_n_alloc_ops);
477
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
479 _debug(">>> WT");
480 fscache_stat(&fscache_n_alloc_op_waits);
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
483 _debug("<<< GO");
484 }
485
486 /* ask the cache to honour the operation */
487 ret = object->cache->ops->allocate_page(op, page, gfp);
488
489 if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs);
491 else
492 fscache_stat(&fscache_n_allocs_ok);
493
494 fscache_put_retrieval(op);
495 _leave(" = %d", ret);
496 return ret;
497
498nobufs_unlock:
499 spin_unlock(&cookie->lock);
500 kfree(op);
501nobufs:
502 fscache_stat(&fscache_n_allocs_nobufs);
503 _leave(" = -ENOBUFS");
504 return -ENOBUFS;
505}
506EXPORT_SYMBOL(__fscache_alloc_page);
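
A netfs that has just generated a page locally (rather than read it) might reserve space with fscache_alloc_page() and then push the data out with fscache_write_page(). This pairing is an assumption about intended use, not something the code above mandates; my_cookie is hypothetical:

/* Illustrative sketch -- not part of this patch */
static void my_cache_fresh_page(struct page *page)
{
	if (fscache_alloc_page(my_cookie, page, GFP_KERNEL) < 0)
		return;		/* no cache space; carry on uncached */
	if (fscache_write_page(my_cookie, page, GFP_KERNEL) < 0)
		fscache_uncache_page(my_cookie, page);
}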
507
508/*
509 * release a write op reference
510 */
511static void fscache_release_write_op(struct fscache_operation *_op)
512{
513 _enter("{OP%x}", _op->debug_id);
514}
515
516/*
517 * perform the background storage of a page into the cache
518 */
519static void fscache_write_op(struct fscache_operation *_op)
520{
521 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie;
525 struct page *page;
526 unsigned n;
527 void *results[1];
528 int ret;
529
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531
532 spin_lock(&cookie->lock);
533 spin_lock(&object->lock);
534
535 if (!fscache_object_is_active(object)) {
536 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave("");
539 return;
540 }
541
542 fscache_stat(&fscache_n_store_calls);
543
544 /* find a page to store */
545 page = NULL;
546 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
547 FSCACHE_COOKIE_PENDING_TAG);
548 if (n != 1)
549 goto superseded;
550 page = results[0];
551 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit)
553 goto superseded;
554
555 radix_tree_tag_clear(&cookie->stores, page->index,
556 FSCACHE_COOKIE_PENDING_TAG);
557
558 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560
561 if (page) {
562 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page);
564 page_cache_release(page);
565 if (ret < 0)
566 fscache_abort_object(object);
567 else
568 fscache_enqueue_operation(&op->op);
569 }
570
571 _leave("");
572 return;
573
574superseded:
575 /* this writer is going away and there aren't any more things to
576 * write */
577 _debug("cease");
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave("");
582}
583
584/*
585 * request a page be stored in the cache
586 * - returns:
587 * -ENOMEM - out of memory, nothing done
588 * -ENOBUFS - no backing object available in which to cache the page
589 * 0 - dispatched a write - it'll call end_io_func() when finished
590 *
591 * if the cookie still has a backing object at this point, that object can be
592 * in one of a few states with respect to storage processing:
593 *
594 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
595 * set)
596 *
597 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
598 * fill op)
599 *
600 * (b) writes deferred till post-creation (mark page for writing and
601 * return immediately)
602 *
603 * (2) negative lookup, object created, initial fill being made from netfs
604 * (FSCACHE_COOKIE_INITIAL_FILL is set)
605 *
606 * (a) fill point not yet reached this page (mark page for writing and
607 * return)
608 *
609 * (b) fill point passed this page (queue op to store this page)
610 *
611 * (3) object extant (queue op to store this page)
612 *
613 * any other state is invalid
614 */
615int __fscache_write_page(struct fscache_cookie *cookie,
616 struct page *page,
617 gfp_t gfp)
618{
619 struct fscache_storage *op;
620 struct fscache_object *object;
621 int ret;
622
623 _enter("%p,%x,", cookie, (u32) page->flags);
624
625 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
626 ASSERT(PageFsCache(page));
627
628 fscache_stat(&fscache_n_stores);
629
630 op = kzalloc(sizeof(*op), GFP_NOIO);
631 if (!op)
632 goto nomem;
633
634 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
637
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0)
640 goto nomem_free;
641
642 ret = -ENOBUFS;
643 spin_lock(&cookie->lock);
644
645 if (hlist_empty(&cookie->backing_objects))
646 goto nobufs;
647 object = hlist_entry(cookie->backing_objects.first,
648 struct fscache_object, cookie_link);
649 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
650 goto nobufs;
651
652 /* add the page to the pending-storage radix tree on the backing
653 * object */
654 spin_lock(&object->lock);
655
656 _debug("store limit %llx", (unsigned long long) object->store_limit);
657
658 ret = radix_tree_insert(&cookie->stores, page->index, page);
659 if (ret < 0) {
660 if (ret == -EEXIST)
661 goto already_queued;
662 _debug("insert failed %d", ret);
663 goto nobufs_unlock_obj;
664 }
665
666 radix_tree_tag_set(&cookie->stores, page->index,
667 FSCACHE_COOKIE_PENDING_TAG);
668 page_cache_get(page);
669
670 /* we only want one writer at a time, but we do need to queue new
671 * writers after exclusive ops */
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending;
674
675 spin_unlock(&object->lock);
676
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
678 op->store_limit = object->store_limit;
679
680 if (fscache_submit_op(object, &op->op) < 0)
681 goto submit_failed;
682
683 spin_unlock(&cookie->lock);
684 radix_tree_preload_end();
685 fscache_stat(&fscache_n_store_ops);
686 fscache_stat(&fscache_n_stores_ok);
687
688 /* the slow work queue now carries its own ref on the object */
689 fscache_put_operation(&op->op);
690 _leave(" = 0");
691 return 0;
692
693already_queued:
694 fscache_stat(&fscache_n_stores_again);
695already_pending:
696 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock);
698 radix_tree_preload_end();
699 kfree(op);
700 fscache_stat(&fscache_n_stores_ok);
701 _leave(" = 0");
702 return 0;
703
704submit_failed:
705 radix_tree_delete(&cookie->stores, page->index);
706 page_cache_release(page);
707 ret = -ENOBUFS;
708 goto nobufs;
709
710nobufs_unlock_obj:
711 spin_unlock(&object->lock);
712nobufs:
713 spin_unlock(&cookie->lock);
714 radix_tree_preload_end();
715 kfree(op);
716 fscache_stat(&fscache_n_stores_nobufs);
717 _leave(" = -ENOBUFS");
718 return -ENOBUFS;
719
720nomem_free:
721 kfree(op);
722nomem:
723 fscache_stat(&fscache_n_stores_oom);
724 _leave(" = -ENOMEM");
725 return -ENOMEM;
726}
727EXPORT_SYMBOL(__fscache_write_page);
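
From the netfs side the contract above reduces to: a page carrying PG_fscache may be handed to fscache_write_page(), and on failure the mark must be surrendered with fscache_uncache_page(). A minimal sketch, modelled on the AFS usage elsewhere in this merge (my_cookie is hypothetical):

/* Illustrative sketch -- not part of this patch */
static void my_store_page(struct page *page)
{
	int ret;

	ret = fscache_write_page(my_cookie, page, GFP_KERNEL);
	if (ret != 0) {
		/* -ENOMEM or -ENOBUFS: nothing was queued */
		fscache_uncache_page(my_cookie, page);
		BUG_ON(PageFsCache(page));
	}
}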
728
729/*
730 * remove a page from the cache
731 */
732void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
733{
734 struct fscache_object *object;
735
736 _enter(",%p", page);
737
738 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
739 ASSERTCMP(page, !=, NULL);
740
741 fscache_stat(&fscache_n_uncaches);
742
743 /* cache withdrawal may beat us to it */
744 if (!PageFsCache(page))
745 goto done;
746
747 /* get the object */
748 spin_lock(&cookie->lock);
749
750 if (hlist_empty(&cookie->backing_objects)) {
751 ClearPageFsCache(page);
752 goto done_unlock;
753 }
754
755 object = hlist_entry(cookie->backing_objects.first,
756 struct fscache_object, cookie_link);
757
758 /* there might now be stuff on disk we could read */
759 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
760
761 /* only invoke the cache backend if we managed to mark the page
762 * uncached here; this deals with synchronisation vs withdrawal */
763 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */
766 object->cache->ops->uncache_page(object, page);
767 goto done;
768 }
769
770done_unlock:
771 spin_unlock(&cookie->lock);
772done:
773 _leave("");
774}
775EXPORT_SYMBOL(__fscache_uncache_page);
776
777/**
778 * fscache_mark_pages_cached - Mark pages as being cached
779 * @op: The retrieval op pages are being marked for
780 * @pagevec: The pages to be marked
781 *
782 * Mark a bunch of netfs pages as being cached. After this is called,
783 * the netfs must call fscache_uncache_page() to remove the mark.
784 */
785void fscache_mark_pages_cached(struct fscache_retrieval *op,
786 struct pagevec *pagevec)
787{
788 struct fscache_cookie *cookie = op->op.object->cookie;
789 unsigned long loop;
790
791#ifdef CONFIG_FSCACHE_STATS
792 atomic_add(pagevec->nr, &fscache_n_marks);
793#endif
794
795 for (loop = 0; loop < pagevec->nr; loop++) {
796 struct page *page = pagevec->pages[loop];
797
798 _debug("- mark %p{%lx}", page, page->index);
799 if (TestSetPageFsCache(page)) {
800 static bool once_only;
801 if (!once_only) {
802 once_only = true;
803 printk(KERN_WARNING "FS-Cache:"
804 " Cookie type %s marked page %lx"
805 " multiple times\n",
806 cookie->def->name, page->index);
807 }
808 }
809 }
810
811 if (cookie->def->mark_pages_cached)
812 cookie->def->mark_pages_cached(cookie->netfs_data,
813 op->mapping, pagevec);
814 pagevec_reinit(pagevec);
815}
816EXPORT_SYMBOL(fscache_mark_pages_cached);
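
The mark_pages_cached() hook invoked at the end of this function is the optional per-cookie callback a netfs may supply in its cookie definition. A hedged sketch of such a definition (the my_* names are hypothetical; only the field names come from the FS-Cache netfs API):

/* Illustrative sketch -- not part of this patch */
static void my_mark_pages_cached(void *cookie_netfs_data,
				 struct address_space *mapping,
				 struct pagevec *cached_pvec)
{
	/* record that these pages now carry PG_fscache and will need
	 * fscache_uncache_page() before they are finally released */
}

static const struct fscache_cookie_def my_data_cookie_def = {
	.name			= "myfs.data",
	.type			= FSCACHE_COOKIE_TYPE_DATAFILE,
	.mark_pages_cached	= my_mark_pages_cached,
};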
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
1/* FS-Cache statistics viewing interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL OPERATION
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * initialise the /proc/fs/fscache/ directory
20 */
21int __init fscache_proc_init(void)
22{
23 _enter("");
24
25 if (!proc_mkdir("fs/fscache", NULL))
26 goto error_dir;
27
28#ifdef CONFIG_FSCACHE_STATS
29 if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
30 &fscache_stats_fops))
31 goto error_stats;
32#endif
33
34#ifdef CONFIG_FSCACHE_HISTOGRAM
35 if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
36 &fscache_histogram_fops))
37 goto error_histogram;
38#endif
39
40 _leave(" = 0");
41 return 0;
42
43#ifdef CONFIG_FSCACHE_HISTOGRAM
44error_histogram:
45#endif
46#ifdef CONFIG_FSCACHE_STATS
47 remove_proc_entry("fs/fscache/stats", NULL);
48error_stats:
49#endif
50 remove_proc_entry("fs/fscache", NULL);
51error_dir:
52 _leave(" = -ENOMEM");
53 return -ENOMEM;
54}
55
56/*
57 * clean up the /proc/fs/fscache/ directory
58 */
59void fscache_proc_cleanup(void)
60{
61#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif
64#ifdef CONFIG_FSCACHE_STATS
65 remove_proc_entry("fs/fscache/stats", NULL);
66#endif
67 remove_proc_entry("fs/fscache", NULL);
68}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
1/* FS-Cache statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * operation counters
20 */
21atomic_t fscache_n_op_pend;
22atomic_t fscache_n_op_run;
23atomic_t fscache_n_op_enqueue;
24atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc;
28
29atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok;
31atomic_t fscache_n_attr_changed_nobufs;
32atomic_t fscache_n_attr_changed_nomem;
33atomic_t fscache_n_attr_changed_calls;
34
35atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs;
39atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits;
41
42atomic_t fscache_n_retrievals;
43atomic_t fscache_n_retrievals_ok;
44atomic_t fscache_n_retrievals_wait;
45atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem;
49atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits;
51
52atomic_t fscache_n_stores;
53atomic_t fscache_n_stores_ok;
54atomic_t fscache_n_stores_again;
55atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls;
59
60atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches;
62
63atomic_t fscache_n_acquires;
64atomic_t fscache_n_acquires_null;
65atomic_t fscache_n_acquires_no_cache;
66atomic_t fscache_n_acquires_ok;
67atomic_t fscache_n_acquires_nobufs;
68atomic_t fscache_n_acquires_oom;
69
70atomic_t fscache_n_updates;
71atomic_t fscache_n_updates_null;
72atomic_t fscache_n_updates_run;
73
74atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt;
77
78atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data;
80atomic_t fscache_n_cookie_special;
81
82atomic_t fscache_n_object_alloc;
83atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive;
87atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead;
90
91atomic_t fscache_n_checkaux_none;
92atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete;
95
96/*
97 * display the general statistics
98 */
99static int fscache_stats_show(struct seq_file *m, void *v)
100{
101 seq_puts(m, "FS-Cache statistics\n");
102
103 seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
104 atomic_read(&fscache_n_cookie_index),
105 atomic_read(&fscache_n_cookie_data),
106 atomic_read(&fscache_n_cookie_special));
107
108 seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
109 atomic_read(&fscache_n_object_alloc),
110 atomic_read(&fscache_n_object_no_alloc),
111 atomic_read(&fscache_n_object_avail),
112 atomic_read(&fscache_n_object_dead));
113 seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
114 atomic_read(&fscache_n_checkaux_none),
115 atomic_read(&fscache_n_checkaux_okay),
116 atomic_read(&fscache_n_checkaux_update),
117 atomic_read(&fscache_n_checkaux_obsolete));
118
119 seq_printf(m, "Pages : mrk=%u unc=%u\n",
120 atomic_read(&fscache_n_marks),
121 atomic_read(&fscache_n_uncaches));
122
123 seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
124 " oom=%u\n",
125 atomic_read(&fscache_n_acquires),
126 atomic_read(&fscache_n_acquires_null),
127 atomic_read(&fscache_n_acquires_no_cache),
128 atomic_read(&fscache_n_acquires_ok),
129 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom));
131
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
133 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive),
136 atomic_read(&fscache_n_object_created));
137
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
139 atomic_read(&fscache_n_updates),
140 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run));
142
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
144 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt));
147
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed),
150 atomic_read(&fscache_n_attr_changed_ok),
151 atomic_read(&fscache_n_attr_changed_nobufs),
152 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls));
154
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
156 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs));
160 seq_printf(m, "Allocs : ops=%u owt=%u\n",
161 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits));
163
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n",
166 atomic_read(&fscache_n_retrievals),
167 atomic_read(&fscache_n_retrievals_ok),
168 atomic_read(&fscache_n_retrievals_wait),
169 atomic_read(&fscache_n_retrievals_nodata),
170 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits));
176
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores),
179 atomic_read(&fscache_n_stores_ok),
180 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n",
184 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls));
186
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
188 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc));
195 return 0;
196}
197
198/*
199 * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
200 */
201static int fscache_stats_open(struct inode *inode, struct file *file)
202{
203 return single_open(file, fscache_stats_show, NULL);
204}
205
206const struct file_operations fscache_stats_fops = {
207 .owner = THIS_MODULE,
208 .open = fscache_stats_open,
209 .read = seq_read,
210 .llseek = seq_lseek,
211 .release = seq_release,
212};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06da05261e04..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4e340fedf768..2b25133524a3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       unsigned *nbytesp, int write)
 {
+	unsigned nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1274,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1908,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1917,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa763..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
 static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = HFS_SB(sb)->fs_ablocks;
 	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFS_NAMELEN;
 
 	return 0;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
 	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
 
 	return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fecf402d7b8a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	lock_kernel();
 
 	/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->sb_n_free;
 	buf->f_files = sbi->sb_dirband_size / 4;
 	buf->f_ffree = sbi->sb_n_free_dnodes;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
 	unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
 			       "errno = %d\n", err);
 			return err;
 		}
-		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
 		if (count > 0)
 			*ppos += count;
 	}
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd0692..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
 static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = ISOFS_SB(sb)->s_ninodes;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
 	return (ret == -EIO);
 }
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+				   int write_op)
 {
 	int i;
 
 	for (i = 0; i < bufs; i++) {
 		wbuf[i]->b_end_io = end_buffer_write_sync;
 		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+		submit_bh(write_op, wbuf[i]);
 	}
 }
 
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
  * Submit all the data buffers to disk
  */
 static int journal_submit_data_buffers(journal_t *journal,
-				       transaction_t *commit_transaction)
+				       transaction_t *commit_transaction,
+				       int write_op)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
 			BUFFER_TRACE(bh, "needs blocking lock");
 			spin_unlock(&journal->j_list_lock);
 			/* Write out all data to prevent deadlocks */
-			journal_do_submit_data(wbuf, bufs);
+			journal_do_submit_data(wbuf, bufs, write_op);
 			bufs = 0;
 			lock_buffer(bh);
 			spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
 			jbd_unlock_bh_state(bh);
 			if (bufs == journal->j_wbufsize) {
 				spin_unlock(&journal->j_list_lock);
-				journal_do_submit_data(wbuf, bufs);
+				journal_do_submit_data(wbuf, bufs, write_op);
 				bufs = 0;
 				goto write_out_data;
 			}
@@ -286,7 +289,7 @@ write_out_data:
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
-	journal_do_submit_data(wbuf, bufs);
+	journal_do_submit_data(wbuf, bufs, write_op);
 
 	return err;
 }
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
+	int write_op = WRITE;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
+	if (commit_transaction->t_synchronous_commit)
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists. Data blocks go first.
	 */
-	err = journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction,
+					  write_op);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -660,7 +672,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(write_op, bh);
 			}
 			cond_resched();
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 		return NULL;
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
 	lock_buffer(bh);
 	memset(bh->b_data, 0, journal->j_blocksize);
 	set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		journal = NULL;
-		goto out;
+		goto out_err;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	journal->j_maxlen = len;
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
 
 /**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
810 824
811/* 825/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
 		}
 	}
 
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 		set_buffer_uptodate(bh);
 		bh->b_end_io = journal_end_buffer_io_sync;
 
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 		if (ret) {
 			unlock_buffer(bh);
 			return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		return PTR_ERR(acl);
 
 	if (!acl) {
-		*i_mode &= ~current->fs->umask;
+		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
 			jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
 }
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e82..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 cleanup:
 		posix_acl_release(acl);
 	} else
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
 			       inode->i_mode;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
  * possibly a read which collects the result - which is stored in a
  * file-local buffer.
  */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			/*
+			 * If this is a blocking request for an
+			 * already pending lock request then we need
+			 * to put it back on lockd's block list
+			 */
+			if (wait)
+				break;
 			ret = nlm_lck_denied;
-			break;
+			goto out;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128b..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
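
minix_statfs() now reports a filesystem id derived from the backing device: huge_encode_dev() yields a 64-bit value that is split into the two 32-bit halves of f_fsid. From userspace the new field is visible via statfs(2); a quick check, with a hypothetical mount point (__val is glibc's spelling of the fsid_t member):

	#include <stdio.h>
	#include <sys/statfs.h>

	int main(void)
	{
		struct statfs sfs;

		/* /mnt/minix is a hypothetical mount point. */
		if (statfs("/mnt/minix", &sfs) == 0)
			printf("fsid = %08x:%08x\n",
			       (unsigned)sfs.f_fsid.__val[0],
			       (unsigned)sfs.f_fsid.__val[1]);
		return 0;
	}
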
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
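
mpage_bio_submit() and __mpage_writepage() lose their EXPORT_SYMBOLs and become static, so mpage_writepages() is now the only way in; the mpage_data bookkeeping struct moves with them. A filesystem consumer is unaffected as long as it goes through the public helper, roughly as follows (example_get_block is hypothetical):

	static int example_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
	{
		/* mpage_writepages() walks the dirty pages and batches
		 * them into BIOs via the now-private __mpage_writepage(). */
		return mpage_writepages(mapping, wbc, example_get_block);
	}

	static const struct address_space_operations example_aops = {
		.writepages	= example_writepages,
		/* ... */
	};
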
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1578 struct dentry *dir = nd->path.dentry; 1579 struct dentry *dir = nd->path.dentry;
1579 1580
1580 if (!IS_POSIXACL(dir->d_inode)) 1581 if (!IS_POSIXACL(dir->d_inode))
1581 mode &= ~current->fs->umask; 1582 mode &= ~current_umask();
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1583 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error) 1584 if (error)
1584 goto out_unlock; 1585 goto out_unlock;
@@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1989 goto out_unlock; 1990 goto out_unlock;
1990 } 1991 }
1991 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1992 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1992 mode &= ~current->fs->umask; 1993 mode &= ~current_umask();
1993 error = may_mknod(mode); 1994 error = may_mknod(mode);
1994 if (error) 1995 if (error)
1995 goto out_dput; 1996 goto out_dput;
@@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2067 goto out_unlock; 2068 goto out_unlock;
2068 2069
2069 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2070 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2070 mode &= ~current->fs->umask; 2071 mode &= ~current_umask();
2071 error = mnt_want_write(nd.path.mnt); 2072 error = mnt_want_write(nd.path.mnt);
2072 if (error) 2073 if (error)
2073 goto out_dput; 2074 goto out_dput;
@@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
2897EXPORT_SYMBOL(vfs_unlink); 2898EXPORT_SYMBOL(vfs_unlink);
2898EXPORT_SYMBOL(dentry_unhash); 2899EXPORT_SYMBOL(dentry_unhash);
2899EXPORT_SYMBOL(generic_readlink); 2900EXPORT_SYMBOL(generic_readlink);
2900
2901/* to be mentioned only in INIT_TASK */
2902struct fs_struct init_fs = {
2903 .count = ATOMIC_INIT(1),
2904 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2905 .umask = 0022,
2906};
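
The init_fs definition removed here is not dropped from the kernel: the diffstat at the top of this merge adds fs/fs_struct.c, where the initial task's fs_struct presumably now lives in the same shape:

	/* Presumed new home, fs/fs_struct.c (not shown in this section);
	 * body identical to the lines removed above. */
	struct fs_struct init_fs = {
		.count	= ATOMIC_INIT(1),
		.lock	= __RW_LOCK_UNLOCKED(init_fs.lock),
		.umask	= 0022,
	};
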
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e96027..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -2093,66 +2094,6 @@ out1:
2093} 2094}
2094 2095
2095/* 2096/*
2096 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2097 * It can block. Requires the big lock held.
2098 */
2099void set_fs_root(struct fs_struct *fs, struct path *path)
2100{
2101 struct path old_root;
2102
2103 write_lock(&fs->lock);
2104 old_root = fs->root;
2105 fs->root = *path;
2106 path_get(path);
2107 write_unlock(&fs->lock);
2108 if (old_root.dentry)
2109 path_put(&old_root);
2110}
2111
2112/*
2113 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2114 * It can block. Requires the big lock held.
2115 */
2116void set_fs_pwd(struct fs_struct *fs, struct path *path)
2117{
2118 struct path old_pwd;
2119
2120 write_lock(&fs->lock);
2121 old_pwd = fs->pwd;
2122 fs->pwd = *path;
2123 path_get(path);
2124 write_unlock(&fs->lock);
2125
2126 if (old_pwd.dentry)
2127 path_put(&old_pwd);
2128}
2129
2130static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2131{
2132 struct task_struct *g, *p;
2133 struct fs_struct *fs;
2134
2135 read_lock(&tasklist_lock);
2136 do_each_thread(g, p) {
2137 task_lock(p);
2138 fs = p->fs;
2139 if (fs) {
2140 atomic_inc(&fs->count);
2141 task_unlock(p);
2142 if (fs->root.dentry == old_root->dentry
2143 && fs->root.mnt == old_root->mnt)
2144 set_fs_root(fs, new_root);
2145 if (fs->pwd.dentry == old_root->dentry
2146 && fs->pwd.mnt == old_root->mnt)
2147 set_fs_pwd(fs, new_root);
2148 put_fs_struct(fs);
2149 } else
2150 task_unlock(p);
2151 } while_each_thread(g, p);
2152 read_unlock(&tasklist_lock);
2153}
2154
2155/*
2156 * pivot_root Semantics: 2097 * pivot_root Semantics:
2157 * Moves the root file system of the current process to the directory put_old, 2098 * Moves the root file system of the current process to the directory put_old,
2158 * makes new_root as the new root file system of the current process, and sets 2099 * makes new_root as the new root file system of the current process, and sets
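
set_fs_root(), set_fs_pwd() and chroot_fs_refs() likewise move out of fs/namespace.c; the <linux/fs_struct.h> include added at the top of this file suggests callers now pick the helpers up from there. Presumed declarations — the bodies match the code removed above:

	/* Presumed contents of <linux/fs_struct.h> after the move; note
	 * that chroot_fs_refs(), static here until now, needs a visible
	 * declaration so pivot_root() in this file can still call it. */
	extern void set_fs_root(struct fs_struct *fs, struct path *path);
	extern void set_fs_pwd(struct fs_struct *fs, struct path *path);
	extern void chroot_fs_refs(struct path *old_root, struct path *new_root);
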
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
 94 the general filesystem cache manager.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index aba38017bdef..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -760,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
760 765
761 /* Initialise the client representation from the mount data */ 766 /* Initialise the client representation from the mount data */
762 server->flags = data->flags; 767 server->flags = data->flags;
768 server->options = data->options;
763 769
764 if (data->rsize) 770 if (data->rsize)
765 server->rsize = nfs_block_size(data->rsize, NULL); 771 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1148,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
1148 /* Initialise the client representation from the mount data */ 1154 /* Initialise the client representation from the mount data */
1149 server->flags = data->flags; 1155 server->flags = data->flags;
1150 server->caps |= NFS_CAP_ATOMIC_OPEN; 1156 server->caps |= NFS_CAP_ATOMIC_OPEN;
1157 server->options = data->options;
1151 1158
1152 /* Get a client record */ 1159 /* Get a client record */
1153 error = nfs4_set_client(server, 1160 error = nfs4_set_client(server,
@@ -1559,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1559 1566
1560 /* display header on line 1 */ 1567 /* display header on line 1 */
1561 if (v == &nfs_volume_list) { 1568 if (v == &nfs_volume_list) {
1562 seq_puts(m, "NV SERVER PORT DEV FSID\n"); 1569 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1563 return 0; 1570 return 0;
1564 } 1571 }
1565 /* display one transport per line on subsequent lines */ 1572 /* display one transport per line on subsequent lines */
@@ -1573,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1573 (unsigned long long) server->fsid.major, 1580 (unsigned long long) server->fsid.major,
1574 (unsigned long long) server->fsid.minor); 1581 (unsigned long long) server->fsid.minor);
1575 1582
1576 seq_printf(m, "v%u %s %s %-7s %-17s\n", 1583 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1577 clp->rpc_ops->version, 1584 clp->rpc_ops->version,
1578 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1585 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1579 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1586 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1580 dev, 1587 dev,
1581 fsid); 1588 fsid,
1589 nfs_server_fscache_state(server));
1582 1590
1583 return 0; 1591 return 0;
1584} 1592}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0abf3f331f56..5a97bcfe03e5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
35#include "delegation.h" 35#include "delegation.h"
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h"
38 39
39#define NFSDBG_FACILITY NFSDBG_FILE 40#define NFSDBG_FACILITY NFSDBG_FILE
40 41
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
409 return copied; 410 return copied;
410} 411}
411 412
413/*
414 * Partially or wholly invalidate a page
415 * - Release the private state associated with a page if undergoing complete
416 * page invalidation
417 * - Called if either PG_private or PG_fscache is set on the page
418 * - Caller holds page lock
419 */
412static void nfs_invalidate_page(struct page *page, unsigned long offset) 420static void nfs_invalidate_page(struct page *page, unsigned long offset)
413{ 421{
414 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 422 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
417 return; 425 return;
418 /* Cancel any unstarted writes on this page */ 426 /* Cancel any unstarted writes on this page */
419 nfs_wb_page_cancel(page->mapping->host, page); 427 nfs_wb_page_cancel(page->mapping->host, page);
428
429 nfs_fscache_invalidate_page(page, page->mapping->host);
420} 430}
421 431
432/*
433 * Attempt to release the private state associated with a page
434 * - Called if either PG_private or PG_fscache is set on the page
435 * - Caller holds page lock
436 * - Return true (may release page) or false (may not)
437 */
422static int nfs_release_page(struct page *page, gfp_t gfp) 438static int nfs_release_page(struct page *page, gfp_t gfp)
423{ 439{
424 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 440 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
425 441
426 /* If PagePrivate() is set, then the page is not freeable */ 442 /* If PagePrivate() is set, then the page is not freeable */
427 return 0; 443 if (PagePrivate(page))
444 return 0;
445 return nfs_fscache_release_page(page, gfp);
428} 446}
429 447
448/*
449 * Attempt to clear the private state associated with a page when an error
450 * occurs that requires the cached contents of an inode to be written back or
451 * destroyed
 452 * - Called if either PG_private or PG_fscache is set on the page
453 * - Caller holds page lock
454 * - Return 0 if successful, -error otherwise
455 */
430static int nfs_launder_page(struct page *page) 456static int nfs_launder_page(struct page *page)
431{ 457{
432 struct inode *inode = page->mapping->host; 458 struct inode *inode = page->mapping->host;
459 struct nfs_inode *nfsi = NFS_I(inode);
433 460
434 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 461 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
435 inode->i_ino, (long long)page_offset(page)); 462 inode->i_ino, (long long)page_offset(page));
436 463
464 nfs_fscache_wait_on_page_write(nfsi, page);
437 return nfs_wb_page(inode, page); 465 return nfs_wb_page(inode, page);
438} 466}
439 467
@@ -451,6 +479,11 @@ const struct address_space_operations nfs_file_aops = {
451 .launder_page = nfs_launder_page, 479 .launder_page = nfs_launder_page,
452}; 480};
453 481
482/*
483 * Notification that a PTE pointing to an NFS page is about to be made
484 * writable, implying that someone is about to modify the page through a
485 * shared-writable mapping
486 */
454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 487static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
455{ 488{
456 struct page *page = vmf->page; 489 struct page *page = vmf->page;
@@ -465,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
465 filp->f_mapping->host->i_ino, 498 filp->f_mapping->host->i_ino,
466 (long long)page_offset(page)); 499 (long long)page_offset(page));
467 500
501 /* make sure the cache has finished storing the page */
502 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
503
468 lock_page(page); 504 lock_page(page);
469 mapping = page->mapping; 505 mapping = page->mapping;
470 if (mapping != dentry->d_inode->i_mapping) 506 if (mapping != dentry->d_inode->i_mapping)
@@ -480,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
480 goto out_unlock; 516 goto out_unlock;
481 517
482 ret = nfs_updatepage(filp, page, 0, pagelen); 518 ret = nfs_updatepage(filp, page, 0, pagelen);
483 if (ret == 0)
484 ret = pagelen;
485out_unlock: 519out_unlock:
486 unlock_page(page); 520 unlock_page(page);
487 if (ret) 521 if (ret)
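
nfs_release_page() keeps refusing to free pages with attached private state but now defers PG_fscache pages to nfs_fscache_release_page(). For context, the VM-side consumer of that return value looks roughly like this (paraphrased from mm/filemap.c; not part of this diff):

	int try_to_release_page(struct page *page, gfp_t gfp_mask)
	{
		struct address_space *mapping = page->mapping;

		if (PageWriteback(page))
			return 0;

		/* a zero return from ->releasepage means "still busy"
		 * and the page stays in the page cache */
		if (mapping && mapping->a_ops->releasepage)
			return mapping->a_ops->releasepage(page, gfp_mask);
		return try_to_free_buffers(page);
	}
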
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
 28 * index can then have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77 memset(key, 0, len);
 78
 79 key->nfsversion = clp->rpc_ops->version;
 80 key->family = clp->cl_addr.ss_family;
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
 199 * - Should return the amount of data stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try and get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off the cache with regard to a per-inode cookie if opened for writing,
233 * invalidating all the pages in the page cache relating to the associated
234 * inode to clear the per-page caching.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by parallel nfs_open() functions.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322 nfss->nfs_client->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397 /* if the read completes with an error, we just unlock the page and let
398 * the VM reissue the readpage */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
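
Together with the fs/nfs/read.c hunks later in this diff, the cached read path reduces to: consult fscache first, fall back to the wire on a miss, and push the page into the cache once it is uptodate. Condensed from the code in this merge:

	/* nfs_readpage(), condensed: */
	if (!IS_SYNC(inode)) {
		/* 0 means a read BIO was submitted from the cache;
		 * 1 or a -ve error means read from the server instead */
		error = nfs_readpage_from_fscache(ctx, inode, page);
		if (error == 0)
			goto out;
	}
	error = nfs_readpage_async(ctx, inode, page);

	/* nfs_readpage_release(), condensed — populate the cache on
	 * success: */
	if (PageUptodate(req->wb_page))
		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
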
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a834d1d850b7..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "delegation.h" 46#include "delegation.h"
47#include "iostat.h" 47#include "iostat.h"
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -121,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
121 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
122 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
123 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
125 nfs_fscache_release_inode_cookie(inode);
124} 126}
125 127
126/** 128/**
@@ -355,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
355 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
356 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
357 359
360 nfs_fscache_init_inode_cookie(inode);
361
358 unlock_new_inode(inode); 362 unlock_new_inode(inode);
359 } else 363 } else
360 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
@@ -686,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
687 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
688 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
689 return 0; 694 return 0;
690} 695}
691 696
@@ -786,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
786 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
787 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
788 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
789 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
790 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
791 return 0; 797 return 0;
@@ -1030,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1030 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
1031 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
1032 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
1033 return status; 1040 return status;
1034} 1041}
1035 1042
@@ -1436,6 +1443,10 @@ static int __init init_nfs_fs(void)
1436{ 1443{
1437 int err; 1444 int err;
1438 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1439 err = nfsiod_start(); 1450 err = nfsiod_start();
1440 if (err) 1451 if (err)
1441 goto out6; 1452 goto out6;
@@ -1488,6 +1499,8 @@ out4:
1488out5: 1499out5:
1489 nfsiod_stop(); 1500 nfsiod_stop();
1490out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1491 return err; 1504 return err;
1492} 1505}
1493 1506
@@ -1498,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1498 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1499 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1500 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1501#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1502 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1503#endif 1517#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2041f68ff1cc..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
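
The fscache counters reuse the existing lockless per-cpu scheme: nfs_add_fscache_stats() bumps the current CPU's slot, and whoever reports the numbers sums across CPUs. A sketch of the read side, mirroring how the byte and event counters are aggregated elsewhere in the client (the function name is invented):

	#ifdef CONFIG_NFS_FSCACHE
	static unsigned long long nfs_sum_fscache_stat(struct nfs_server *server,
					enum nfs_stat_fscachecounters stat)
	{
		unsigned long long total = 0;
		int cpu;

		/* no locking: each slot is only ever written by its own
		 * CPU, so a reader sees a slightly stale but sane sum */
		for_each_possible_cpu(cpu)
			total += per_cpu_ptr(server->io_stats, cpu)->fscache[stat];
		return total;
	}
	#endif
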
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index b82fe6847f14..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		data->arg.create.verifier[1] = current->pid;
 	}
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	for (;;) {
 		status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
 			MAJOR(rdev), MINOR(rdev));
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 97bacccff579..a4d242680299 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1501,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		attr.ia_mode = nd->intent.open.create_mode;
 		attr.ia_valid = ATTR_MODE;
 		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current->fs->umask;
+			attr.ia_mode &= ~current_umask();
 	} else {
 		attr.ia_valid = 0;
 		BUG_ON(nd->intent.open.flags & O_CREAT);
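
The current_umask() conversion above changes only where the umask comes from; the arithmetic stays the same: the requested mode is masked by the complement of the umask. A small standalone C illustration of that mode arithmetic (the values are examples only):

#include <stdio.h>

int main(void)
{
	unsigned int requested = 0666;	/* mode the caller asked for */
	unsigned int umask_val = 0022;	/* process umask */
	unsigned int effective = requested & ~umask_val;

	/* prints: 0666 & ~0022 = 0644 */
	printf("%04o & ~%04o = %04o\n", requested, umask_val, effective);
	return 0;
}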
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	}
 }
 
-static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		struct page *page)
 {
 	LIST_HEAD(one_request);
 	struct nfs_page	*new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 static void nfs_readpage_release(struct nfs_page *req)
 {
+	struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+
+	if (PageUptodate(req->wb_page))
+		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
 	unlock_page(req->wb_page);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
 	} else
 		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
+	if (!IS_SYNC(inode)) {
+		error = nfs_readpage_from_fscache(ctx, inode, page);
+		if (error == 0)
+			goto out;
+	}
+
 	error = nfs_readpage_async(ctx, inode, page);
 
+out:
 	put_nfs_open_context(ctx);
 	return error;
 out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		return -EBADF;
 	} else
 		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+	/* attempt to read as many of the pages as possible from the cache
+	 * - this returns -ENOBUFS immediately if the cookie is negative
+	 */
+	ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+					 pages, &nr_pages);
+	if (ret == 0)
+		goto read_complete; /* all pages were read */
+
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	nfs_pageio_complete(&pgio);
 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
 	put_nfs_open_context(desc.ctx);
 out:
 	return ret;
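
The nfs_readpage() change above gives the read path a "cache first, network on miss" shape. A hedged userspace sketch of that control flow; read_from_cache() and read_from_server() are stand-ins invented for illustration, not real NFS or FS-Cache APIs:

#include <errno.h>
#include <stdio.h>

static int read_from_cache(int key, char *buf, size_t len)
{
	(void)key; (void)buf; (void)len;
	return -ENOBUFS;	/* simulate "no cookie / page not cached" */
}

static int read_from_server(int key, char *buf, size_t len)
{
	snprintf(buf, len, "page %d from server", key);
	return 0;
}

static int read_page(int key, char *buf, size_t len)
{
	int err = read_from_cache(key, buf, len);
	if (err == 0)
		return 0;			/* cache hit: done */
	return read_from_server(key, buf, len);	/* miss: fall back */
}

int main(void)
{
	char buf[64];
	if (read_page(7, buf, sizeof(buf)) == 0)
		printf("%s\n", buf);
	return 0;
}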
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0942fcbbad3c..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+	if (nfss->options & NFS_OPTION_FSCACHE) {
+		seq_printf(m, "\n\tfsc:\t");
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			seq_printf(m, "%Lu ", totals.fscache[i]);
+	}
+#endif
 	seq_printf(m, "\n");
 
 	rpc_print_iostats(m, nfss->client);
@@ -1044,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_noresvport:
 			mnt->flags |= NFS_MOUNT_NORESVPORT;
 			break;
+		case Opt_fscache:
+			mnt->options |= NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_nofscache:
+			mnt->options &= ~NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (!string)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * options that take numeric values
@@ -1191,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1221,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
+			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1870,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
 	nfs_initialise_sb(sb);
 }
 
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
-
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
 {
 	const struct nfs_server *a = s->s_fs_info;
@@ -2036,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs_get_root(s, mntfh);
@@ -2056,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
 	kfree(data->nfs_server.hostname);
 	kfree(data->mount_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2083,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
 
 	bdi_unregister(&server->backing_dev_info);
 	kill_anon_super(s);
+	nfs_fscache_release_super_cookie(s);
 	nfs_free_server(server);
 }
 
@@ -2390,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs4_get_root(s, mntfh);
@@ -2411,6 +2450,7 @@ out:
 	kfree(data->client_address);
 	kfree(data->nfs_server.export_path);
 	kfree(data->nfs_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2437,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	kill_anon_super(sb);
 
 	nfs4_renewd_prepare_shutdown(server);
+	nfs_fscache_release_super_cookie(sb);
 	nfs_free_server(server);
 }
 
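
The "fsc" / "fsc=%s" / "nofsc" handling above is order-sensitive: a later option overrides an earlier one, and the uniquifier string is owned (and freed) by the parsed-options structure. A userspace sketch of that override behaviour, using invented names rather than the kernel's match_table machinery:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct parsed_opts {
	int fscache;
	char *fscache_uniq;
};

static char *xstrdup(const char *s)	/* portable strdup stand-in */
{
	char *d = malloc(strlen(s) + 1);
	if (d)
		strcpy(d, s);
	return d;
}

static void parse_one(struct parsed_opts *o, const char *opt)
{
	if (strcmp(opt, "fsc") == 0) {
		o->fscache = 1;
		free(o->fscache_uniq);		/* plain fsc drops any uniquifier */
		o->fscache_uniq = NULL;
	} else if (strncmp(opt, "fsc=", 4) == 0) {
		o->fscache = 1;
		free(o->fscache_uniq);
		o->fscache_uniq = xstrdup(opt + 4);
	} else if (strcmp(opt, "nofsc") == 0) {
		o->fscache = 0;
		free(o->fscache_uniq);
		o->fscache_uniq = NULL;
	}
}

int main(void)
{
	struct parsed_opts o = { 0, NULL };
	const char *opts[] = { "fsc", "fsc=alpha", "nofsc", "fsc=beta" };

	for (size_t i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		parse_one(&o, opts[i]);	/* last option wins */
	printf("fsc=%d uniq=%s\n", o.fscache,
	       o.fscache_uniq ? o.fscache_uniq : "(none)");
	free(o.fscache_uniq);
	return 0;
}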
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
 config NFSD
 	tristate "NFS server support"
 	depends on INET
+	depends on FILE_LOCKING
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/magic.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					struct nfsd3_writeres *resp)
 {
 	__be32	nfserr;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+				   &cnt,
 				   &resp->committed);
-	resp->count = argp->count;
+	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
 
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 	struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
 
 	/* Note that we don't care for remote fs's here */
-	if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+	if (sb->s_magic == MSDOS_SUPER_MAGIC) {
 		resp->f_properties = NFS3_FSF_BILLYBOY;
 	}
 	resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 		resp->p_link_max = EXT2_LINK_MAX;
 		resp->p_name_max = EXT2_NAME_LEN;
 		break;
-	case 0x4d44:		/* MSDOS_SUPER_MAGIC */
+	case MSDOS_SUPER_MAGIC:
 		resp->p_case_insensitive = 1;
 		resp->p_case_preserving = 0;
 		break;
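
The &cnt conversion above means the WRITE handler now passes the byte count by pointer, so the reply can report how much was actually written (for instance after a short write) instead of echoing the request. A minimal sketch of that by-reference count convention; do_write() is a stand-in invented here, not nfsd_write():

#include <stdio.h>

static int do_write(const char *buf, unsigned long *cnt)
{
	(void)buf;
	if (*cnt > 4096)
		*cnt = 4096;	/* simulate a short write */
	return 0;
}

int main(void)
{
	unsigned long cnt = 8192;	/* bytes requested */
	if (do_write("data", &cnt) == 0)
		printf("wrote %lu bytes\n", cnt);	/* caller sees 4096 */
	return 0;
}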
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fhlen;
+	int len = cb_rec->cbr_fh.fh_size;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
-	WRITEMEM(cb_rec->cbr_fhval, len);
+	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
 	return 0;
 }
 
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cb_set an atomic? */
 
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 {
-	struct nfs4_client *clp = data;
 	struct sockaddr_in	addr;
 	struct nfs4_callback	*cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name	= clp->cl_principal,
 	};
-	struct rpc_message msg = {
-		.rpc_proc	= &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp	= clp,
-	};
 	struct rpc_clnt *client;
-	int status;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
-		status = nfserr_cb_path_down;
-		goto out_err;
-	}
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+		return ERR_PTR(-EINVAL);
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
 
 	/* Create RPC client */
 	client = rpc_create(&args);
+	if (IS_ERR(client))
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+	return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+	struct nfs4_client *clp = data;
+	struct nfs4_callback *cb = &clp->cl_callback;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp	= clp,
+	};
+	struct rpc_clnt *client;
+	int status;
+
+	client = setup_callback_client(clp);
 	if (IS_ERR(client)) {
-		dprintk("NFSD: couldn't create callback client\n");
 		status = PTR_ERR(client);
+		dprintk("NFSD: couldn't create callback client: %d\n",
+			status);
 		goto out_err;
 	}
 
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
 out_release_client:
 	rpc_shutdown_client(client);
 out_err:
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
		(int)clp->cl_name.len, clp->cl_name.data, status);
 	put_nfs4_client(clp);
-	return status;
+	return 0;
 }
 
 /*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 /*
  * called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
 */
 void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
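
setup_callback_client() above reports failure through the returned pointer itself, via the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention. A simplified userspace re-creation of that convention (the kernel reserves only the top 4095 address values for this; the sketch below assumes the same bound and that the pointer/long casts round-trip, which is implementation-defined in portable C):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *err_ptr(long err) { return (void *)err; }
static long ptr_err(const void *p) { return (long)p; }
static int is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *make_client(int ok)
{
	static int dummy_client;
	if (!ok)
		return err_ptr(-EINVAL);	/* failure travels in the pointer */
	return &dummy_client;
}

int main(void)
{
	void *client = make_client(0);
	if (is_err(client))
		printf("client setup failed: %ld\n", ptr_err(client));
	return 0;
}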
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	open->op_truncate = 0;
 
 	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 				(u32 *)open->op_verf.data,
 				&open->op_truncate, &created);
 
-		/* If we ever decide to use different attrs to store the
-		 * verifier in nfsd_create_v3, then we'll need to change this
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
-			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 					FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 			goto out;
 
 		set_change_info(&open->op_cinfo, current_fh);
-
-		/* set reply cache */
 		fh_dup2(current_fh, &resfh);
-		open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
-		memcpy(open->op_stateowner->so_replay.rp_openfh,
-				&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
+		/* set reply cache */
+		fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+				&resfh.fh_handle);
 		if (!created)
 			status = do_open_permission(rqstp, current_fh, open,
 						NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-			&current_fh->fh_handle.fh_base,
-			current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	return status;
 }
 
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)session->se_sessionid.data;
+
+	clid->cl_boot = sid->clientid.cl_boot;
+	clid->cl_id = sid->clientid.cl_id;
+}
 
 static __be32
 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+	struct nfsd4_compoundres *resp;
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
 	nfs4_lock_state();
 
 	/* check seqid for replay. set nfs4_owner */
-	status = nfsd4_process_open1(open);
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(&cstate->current_fh);
-		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
-		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
-				rp->rp_openfh_len);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
 		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-			status = nfserr_inval;
-			if (open->op_create)
-				goto out;
-			/* fall through */
 		case NFS4_OPEN_CLAIM_NULL:
 			/*
 			 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
 	return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-				&read->rd_stateid,
-				CHECK_FH | RD_STATE, &read->rd_filp))) {
+	if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+				RD_STATE, &read->rd_filp))) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
-		status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file *filp = NULL;
 	u32 *p;
 	__be32 status = nfs_ok;
+	unsigned long cnt;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
-					CHECK_FH | WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
 	if (filp)
 		get_file(filp);
 	nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	write->wr_bytes_written = write->wr_buflen;
+	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	p = (u32 *)write->wr_verifier.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = nfsd_write(rqstp, &cstate->current_fh, filp,
 			write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-			write->wr_buflen, &write->wr_how_written);
+			&cnt, &write->wr_how_written);
 	if (filp)
 		fput(filp);
 
+	write->wr_bytes_written = cnt;
+
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
 	return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
-	    || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+	    || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+	    || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
 		return nfserr_attrnotsupp;
 	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
 	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out_kfree;
 
-	p = buf + 3;
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
 	status = nfserr_not_same;
 	if (ntohl(*p++) != verify->ve_attrlen)
 		goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 	nfsdstats.nfs4_opcount[opnum]++;
 }
 
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
-	if (cstate == NULL)
-		return;
-	fh_put(&cstate->current_fh);
-	fh_put(&cstate->save_fh);
-	BUG_ON(cstate->replay_owner);
-	kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
-	struct nfsd4_compound_state *cstate;
-
-	cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
-	if (cstate == NULL)
-		return NULL;
-	fh_init(&cstate->current_fh, NFS4_FHSIZE);
-	fh_init(&cstate->save_fh, NFS4_FHSIZE);
-	cstate->replay_owner = NULL;
-	return cstate;
-}
-
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 2 << 0,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 3 << 0,	/* ops required first in compound */
+};
 
 struct nfsd4_operation {
 	nfsd4op_func op_func;
 	u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
 	char *op_name;
 };
 
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the uncache rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+			  struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+
+	dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+	/* Encode the replayed sequence operation */
+	BUG_ON(resp->opcnt != 1);
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/*return nfserr_retry_uncached_rep in next operation. */
+	if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+		op = &args->ops[resp->opcnt++];
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+	if (args->minorversion && args->opcnt > 0) {
+		struct nfsd4_op *op = &args->ops[0];
+		return (op->status == nfserr_op_illegal) ||
+		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+	}
+	return true;
+}
+
+/*
  * COMPOUND call.
 */
 static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 {
 	struct nfsd4_op	*op;
 	struct nfsd4_operation *opdesc;
-	struct nfsd4_compound_state *cstate = NULL;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int slack_bytes;
 	__be32 status;
 
 	resp->xbuf = &rqstp->rq_res;
-	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+	resp->p = rqstp->rq_res.head[0].iov_base +
+		rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
 	/* reserve space for: taglen, tag, and opcnt */
 	resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->tag = args->tag;
 	resp->opcnt = 0;
 	resp->rqstp = rqstp;
+	resp->cstate.minorversion = args->minorversion;
+	resp->cstate.replay_owner = NULL;
+	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+	/* Use the deferral mechanism only for NFSv4.0 compounds */
+	rqstp->rq_usedeferral = (args->minorversion == 0);
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
+	if (!nfs41_op_ordering_ok(args)) {
+		op = &args->ops[0];
+		op->status = nfserr_sequence_pos;
+		goto encode_op;
+	}
 
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
 			resp->opcnt, args->opcnt, op->opnum,
 			nfsd4_op_name(op->opnum));
-
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		BUG_ON(op->status == nfs_ok);
 
 encode_op:
+		/* Only from SEQUENCE or CREATE_SESSION */
+		if (resp->cstate.status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			if (nfsd4_not_cached(resp))
+				status = nfsd4_enc_uncached_replay(args, resp);
+			else
+				status = op->status;
+			goto out;
+		}
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
+	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+		status = nfserr_jukebox;
+	}
 
-	cstate_free(cstate);
+	resp->cstate.status = status;
+	fh_put(&resp->cstate.current_fh);
+	fh_put(&resp->cstate.save_fh);
+	BUG_ON(resp->cstate.replay_owner);
 out:
 	nfsd4_release_compoundargs(args);
+	/* Reset deferral mechanism for RPC deferrals */
+	rqstp->rq_usedeferral = 1;
 	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
-		/* unsupported, just for future reference: */
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_PUTPUBFH",
 	},
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_RELEASE_LOCKOWNER",
 	},
+
+	/* NFSv4.1 operations */
+	[OP_EXCHANGE_ID] = {
+		.op_func = (nfsd4op_func)nfsd4_exchange_id,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_EXCHANGE_ID",
+	},
+	[OP_CREATE_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_create_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_CREATE_SESSION",
+	},
+	[OP_DESTROY_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_DESTROY_SESSION",
+	},
+	[OP_SEQUENCE] = {
+		.op_func = (nfsd4op_func)nfsd4_sequence,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_SEQUENCE",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
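
nfsd4_ops[] is a designated-initializer dispatch table: the opcode indexes the array, and op_flags is consulted before the handler runs (for instance ALLOWED_AS_FIRST_OP in the NFSv4.1 ordering check above). A self-contained C sketch of the pattern, with opcodes, flags and handlers invented for illustration:

#include <stdio.h>

enum { OP_A, OP_B, OP_MAX };
#define ALLOWED_AS_FIRST_OP 0x1

typedef int (*op_func)(void);

struct operation {
	op_func func;
	unsigned int flags;
	const char *name;
};

static int do_a(void) { return 0; }
static int do_b(void) { return 0; }

static const struct operation ops[] = {
	[OP_A] = { do_a, ALLOWED_AS_FIRST_OP, "OP_A" },
	[OP_B] = { do_b, 0, "OP_B" },
};

static int dispatch_first(int opnum)
{
	const struct operation *op = &ops[opnum];
	if (!(op->flags & ALLOWED_AS_FIRST_OP)) {
		printf("%s may not start a compound\n", op->name);
		return -1;	/* analogous to nfserr_sequence_pos */
	}
	return op->func();	/* flags checked before dispatch */
}

int main(void)
{
	dispatch_first(OP_B);	/* rejected */
	dispatch_first(OP_A);	/* runs */
	return 0;
}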
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
 
 typedef int (recdir_func)(struct dentry *, struct dentry *);
 
-struct dentry_list {
-	struct dentry *dentry;
+struct name_list {
+	char name[HEXDIR_LEN];
 	struct list_head list;
 };
 
-struct dentry_list_arg {
-	struct list_head dentries;
-	struct dentry *parent;
-};
-
 static int
-nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct dentry_list_arg *dla = arg;
-	struct list_head *dentries = &dla->dentries;
-	struct dentry *parent = dla->parent;
-	struct dentry *dentry;
-	struct dentry_list *child;
+	struct list_head *names = arg;
+	struct name_list *entry;
 
-	if (name && isdotent(name, namlen))
+	if (namlen != HEXDIR_LEN - 1)
 		return 0;
-	dentry = lookup_one_len(name, parent, namlen);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	child = kmalloc(sizeof(*child), GFP_KERNEL);
-	if (child == NULL)
+	entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+	if (entry == NULL)
 		return -ENOMEM;
-	child->dentry = dentry;
-	list_add(&child->list, dentries);
+	memcpy(entry->name, name, HEXDIR_LEN - 1);
+	entry->name[HEXDIR_LEN - 1] = '\0';
+	list_add(&entry->list, names);
 	return 0;
 }
 
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
 	const struct cred *original_cred;
 	struct file *filp;
-	struct dentry_list_arg dla = {
-		.parent = dir,
-	};
-	struct list_head *dentries = &dla.dentries;
-	struct dentry_list *child;
+	LIST_HEAD(names);
+	struct name_list *entry;
+	struct dentry *dentry;
 	int status;
 
 	if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
-	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
-	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		status = f(dir, child->dentry);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+
+		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+		if (IS_ERR(dentry)) {
+			status = PTR_ERR(dentry);
+			goto out;
+		}
+		status = f(dir, dentry);
+		dput(dentry);
 		if (status)
 			goto out;
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 out:
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 	nfs4_reset_creds(original_cred);
 	return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
 	int status;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name))
+	/* note: we currently use this path only for minorversion 0 */
+	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
 	status = nfsd4_clear_clid_dir(parent, child);
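
The namelist rewrite above splits the recovery-directory scan into two phases: the readdir callback only copies fixed-width names, and the dentry lookups happen afterwards, outside the readdir context. A userspace sketch of the same two-phase shape; the list node layout and names below are illustrative only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_LEN 33	/* HEXDIR_LEN-style fixed-width name + NUL */

struct name_entry {
	char name[NAME_LEN];
	struct name_entry *next;
};

static int collect_name(struct name_entry **head, const char *name)
{
	struct name_entry *e = malloc(sizeof(*e));
	if (!e)
		return -1;
	snprintf(e->name, sizeof(e->name), "%s", name);
	e->next = *head;	/* phase 1: just record the name */
	*head = e;
	return 0;
}

int main(void)
{
	struct name_entry *head = NULL, *e;
	const char *dirents[] = { "client1", "client2" };

	for (size_t i = 0; i < 2; i++)
		collect_name(&head, dirents[i]);
	while ((e = head) != NULL) {	/* phase 2: process, then free */
		printf("processing %s\n", e->name);
		head = e->next;
		free(e);
	}
	return 0;
}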
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
 static u32 nfs4_init;
 static stateid_t zerostateid;		/* bits all 0 */
 static stateid_t onestateid;		/* bits all 1 */
+static u64 current_sessionid = 1;
 
 #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
 /* forward declarations */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static void nfs4_set_recdir(char *recdir);
 
-/* Locking:
- *
- * client_mutex:
- * 	protects clientid_hashtbl[], clientstr_hashtbl[],
- * 	unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
 static DEFINE_MUTEX(client_mutex);
 
+/*
+ * Currently used for the del_recall_lru and file hash table. In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
 static struct kmem_cache *stateowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
-static void
-free_nfs4_file(struct kref *kref)
-{
-	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}
-
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	kref_put(&fi->fi_ref, free_nfs4_file);
+	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+		list_del(&fi->fi_hash);
+		spin_unlock(&recall_lock);
+		iput(fi->fi_inode);
+		kmem_cache_free(file_slab, fi);
+	}
 }
 
 static inline void
 get_nfs4_file(struct nfs4_file *fi)
 {
-	kref_get(&fi->fi_ref);
+	atomic_inc(&fi->fi_ref);
 }
 
 static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
-	dp->dl_fhlen = current_fh->fh_handle.fh_size;
-	memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
-	       current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make rooom for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */
+	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+	/* Use the client's maxops if possible */
+	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+	session->se_fmaxops = fchan->maxops;
+
+	/* try to use the client requested number of slots */
+	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+	/* FIXME: Error means no more DRC pages so the server should
+	 * recover pages from existing sessions. For now fail session
+	 * creation.
+	 */
+	status = set_forechannel_maxreqs(fchan);
+
+	session->se_fnumslots = fchan->maxreqs;
+	return status;
+}
+
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+		   struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new, tmp;
+	int idx, status = nfserr_resource, slotsize;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* FIXME: For now, we just accept the client back channel attributes. */
+	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	if (status)
+		goto out;
+
+	/* allocate struct nfsd4_session and slot table in one piece */
+	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+	if (!new)
+		goto out;
+
+	memcpy(new, &tmp, sizeof(*new));
+
+	new->se_client = clp;
+	gen_sessionid(new);
+	idx = hash_sessionid(&new->se_sessionid);
+	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+
+	new->se_flags = cses->flags;
+	kref_init(&new->se_ref);
+	spin_lock(&sessionid_lock);
+	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&sessionid_lock);
+
+	status = nfs_ok;
+out:
+	return status;
+}
+
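+/*
+ * Note on the pattern above: the forechannel attributes are negotiated
+ * into the stack-local tmp first, so a failure in init_forechannel_attrs
+ * costs no allocation; only once tmp.se_fnumslots is known can the
+ * session and its slot table be sized as a single kzalloc, after which
+ * tmp is copied into the new session.
+ */
+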
+/* caller must hold sessionid_lock */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_session *elem;
+	int idx;
+
+	dump_sessionid(__func__, sessionid);
+	idx = hash_sessionid(sessionid);
+	dprintk("%s: idx is %d\n", __func__, idx);
+	/* Search in the appropriate list */
+	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+		dump_sessionid("list traversal", &elem->se_sessionid);
+		if (!memcmp(elem->se_sessionid.data, sessionid->data,
+			    NFS4_MAX_SESSIONID_LEN)) {
+			return elem;
+		}
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	list_del(&ses->se_hash);
+	list_del(&ses->se_perclnt);
+}
+
+static void
+release_session(struct nfsd4_session *ses)
+{
+	spin_lock(&sessionid_lock);
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+	nfsd4_put_session(ses);
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+void
+free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int i;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	for (i = 0; i < ses->se_fnumslots; i++) {
+		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+		nfsd4_release_respages(e->ce_respages, e->ce_resused);
+	}
+	kfree(ses->se_slots);
+	kfree(ses);
+}
+
 static inline void
 renew_client(struct nfs4_client *clp)
 {
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
 {
 	if (clid->cl_boot == boot_time)
 		return 0;
-	dprintk("NFSD stale clientid (%08x/%08x)\n",
-		clid->cl_boot, clid->cl_id);
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, boot_time);
 	return 1;
 }
 
@@ -376,6 +649,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+			       clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_stateowner(sop);
+		release_openowner(sop);
+	}
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		release_session(ses);
 	}
 	put_nfs4_client(clp);
 }
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	return clp;
 }
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
+/*
+ * Return 1 iff clp's clientid establishment method matches the
+ * use_exchange_id parameter. Matching is based on the fact that at
+ * least one of the EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags
+ * must be set for v4.1.
+ *
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+	return use_exchange_id == has_exchange_flags;
+}
+
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+			     bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+			       bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
@@ -685,6 +987,534 @@ out_err:
 	return;
 }
 
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+	resp->cstate.statp = statp;
+}
+
+/*
+ * Dereference the result pages.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+	int i;
+
+	dprintk("--> %s\n", __func__);
+	for (i = 0; i < resused; i++) {
+		if (!respages[i])
+			continue;
+		put_page(respages[i]);
+		respages[i] = NULL;
+	}
+}
+
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		topages[i] = frompages[i];
+		if (!topages[i])
+			continue;
+		get_page(topages[i]);
+	}
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for
+ * part of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_req.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct nfsd4_op *op = &args->ops[resp->opcnt];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/* Don't cache a failed OP_SEQUENCE. */
+	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+		return;
+
+	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+	entry->ce_opcnt = resp->opcnt;
+	entry->ce_status = resp->cstate.status;
+
+	/*
+	 * Don't need a page to cache just the sequence operation - the slot
+	 * does this for us!
+	 */
+
+	if (nfsd4_not_cached(resp)) {
+		entry->ce_resused = 0;
+		entry->ce_rpchdrlen = 0;
+		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		return;
+	}
+	entry->ce_resused = rqstp->rq_resused;
+	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+			 entry->ce_resused);
+	entry->ce_datav.iov_base = resp->cstate.statp;
+	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]));
+	/* Current request rpc header length */
+	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * We keep the rpc header, but take the nfs reply from the replycache.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+			struct nfsd4_cache_entry *entry)
+{
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct kvec *resv = &resp->rqstp->rq_res.head[0];
+	int len;
+
+	/* Current request rpc header length */
+	len = (char *)resp->cstate.statp -
+			(char *)page_address(rqstp->rq_respages[0]);
+	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+			entry->ce_datav.iov_len);
+		return 0;
+	}
+	/* copy the cached reply nfsd data past the current rpc header */
+	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+	       entry->ce_datav.iov_len);
+	resv->iov_len = len + entry->ce_datav.iov_len;
+	return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page. Replace any further replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	__be32 status;
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/*
+	 * If this is just the sequence operation, we did not keep
+	 * a page in the cache entry because we can just use the
+	 * slot info stored in struct nfsd4_sequence that was checked
+	 * against the slot in nfsd4_sequence().
+	 *
+	 * This occurs when seq->cachethis is FALSE, or when the client
+	 * session inactivity timer fires and a solo sequence operation
+	 * is sent (lease renewal).
+	 */
+	if (seq && nfsd4_not_cached(resp)) {
+		seq->maxslots = resp->cstate.session->se_fnumslots;
+		return nfs_ok;
+	}
+
+	if (!nfsd41_copy_replay_data(resp, entry)) {
+		/*
+		 * Not enough room to use the replay rpc header, send the
+		 * cached header. Release all the allocated result pages.
+		 */
+		svc_free_res_pages(resp->rqstp);
+		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+				 entry->ce_resused);
+	} else {
+		/* Release all but the first allocated result page */
+
+		resp->rqstp->rq_resused--;
+		svc_free_res_pages(resp->rqstp);
+
+		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+				 &entry->ce_respages[1],
+				 entry->ce_resused - 1);
+	}
+
+	resp->rqstp->rq_resused = entry->ce_resused;
+	resp->opcnt = entry->ce_opcnt;
+	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+	status = entry->ce_status;
+
+	return status;
+}
+
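+/*
+ * To summarize the two replay paths above: when the cached nfs reply
+ * fits in the first page behind the current rpc header, that header is
+ * kept and only the remaining result pages are swapped for cached ones;
+ * otherwise every result page is released and the cached pages, which
+ * carry the original header, are reused wholesale.
+ */
+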
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	/* pNFS is not supported */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *unconf, *conf, *new;
+	int status;
+	unsigned int strhashval;
+	char dname[HEXDIR_LEN];
+	nfs4_verifier verf = exid->verifier;
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		" ip_addr=%u flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		ip_addr, exid->flags, exid->spa_how);
+
+	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+		return nfserr_inval;
+
+	/* Currently only support SP4_NONE */
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	default:
+		BUG();			/* checked by xdr code */
+	case SP4_MACH_CRED:
+		return nfserr_serverfault;	/* no excuse :-/ */
+	}
+
+	status = nfs4_make_rec_clidname(dname, &exid->clname);
+
+	if (status)
+		goto error;
+
+	strhashval = clientstr_hashval(dname);
+
+	nfs4_lock_state();
+	status = nfs_ok;
+
+	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	if (conf) {
+		if (!same_verf(&verf, &conf->cl_verifier)) {
+			/* 18.35.4 case 8 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* Client reboot: destroy old state */
+			expire_client(conf);
+			goto out_new;
+		}
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			/* 18.35.4 case 9 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_perm;
+				goto out;
+			}
+			expire_client(conf);
+			goto out_new;
+		}
+		if (ip_addr != conf->cl_addr &&
+		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+			/* Client collision. 18.35.4 case 3 */
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+		/*
+		 * Set bit when the owner id and verifier map to an already
+		 * confirmed client id (18.35.3).
+		 */
+		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+		/*
+		 * Falling into 18.35.4 case 2, possible router replay.
+		 * Leave confirmed record intact and return same result.
+		 */
+		copy_verf(conf, &verf);
+		new = conf;
+		goto out_copy;
+	} else {
+		/* 18.35.4 case 7 */
+		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+			status = nfserr_noent;
+			goto out;
+		}
+	}
+
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
+	if (unconf) {
+		/*
+		 * Possible retry or client restart. Per 18.35.4 case 4,
+		 * a new unconfirmed record should be generated regardless
+		 * of whether any properties have changed.
+		 */
+		expire_client(unconf);
+	}
+
+out_new:
+	/* Normal case */
+	new = create_client(exid->clname, dname);
+	if (new == NULL) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	copy_verf(new, &verf);
+	copy_cred(&new->cl_cred, &rqstp->rq_cred);
+	new->cl_addr = ip_addr;
+	gen_clid(new);
+	gen_confirm(new);
+	add_to_unconfirmed(new, strhashval);
+out_copy:
+	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+	exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+	new->cl_slot.sl_seqid = 0;
+	exid->seqid = 1;
+	nfsd4_set_ex_flags(new, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	nfs4_unlock_state();
+error:
+	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+	return status;
+}
+
+static int
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+	dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+		slot->sl_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot->sl_inuse) {
+		if (seqid == slot->sl_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Normal */
+	if (likely(seqid == slot->sl_seqid + 1))
+		return nfs_ok;
+	/* Replay */
+	if (seqid == slot->sl_seqid)
+		return nfserr_replay_cache;
+	/* Wraparound */
+	if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+		return nfs_ok;
+	/* Misordered replay or misordered new request */
+	return nfserr_seq_misordered;
+}
+
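+/*
+ * In other words: while sl_inuse is set, a duplicate of the in-flight
+ * seqid draws nfserr_jukebox and any other value is misordered.  Once
+ * the slot is free, sl_seqid + 1 is the normal next request, a repeat
+ * of sl_seqid is answered from the reply cache, seqid 1 right after an
+ * sl_seqid of 0xffffffff is the wraparound case, and everything else
+ * is misordered.
+ */
+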
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *conf, *unconf;
+	struct nfsd4_slot *slot = NULL;
+	int status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&cr_ses->clientid);
+	conf = find_confirmed_client(&cr_ses->clientid);
+
+	if (conf) {
+		slot = &conf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status == nfserr_replay_cache) {
+			dprintk("Got a create_session replay! seqid= %d\n",
+				slot->sl_seqid);
+			cstate->slot = slot;
+			cstate->status = status;
+			/* Return the cached reply status */
+			status = nfsd4_replay_cache_entry(resp, NULL);
+			goto out;
+		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			dprintk("Sequence misordered!\n");
+			dprintk("Expected seqid= %d but got seqid= %d\n",
+				slot->sl_seqid, cr_ses->seqid);
+			goto out;
+		}
+		conf->cl_slot.sl_seqid++;
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    (ip_addr != unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+
+		slot = &unconf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out;
+		}
+
+		slot->sl_seqid++; /* from 0 to 1 */
+		move_to_confirmed(unconf);
+
+		/*
+		 * We do not support RDMA or persistent sessions
+		 */
+		cr_ses->flags &= ~SESSION4_PERSIST;
+		cr_ses->flags &= ~SESSION4_RDMA;
+
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	status = alloc_init_session(rqstp, conf, cr_ses);
+	if (status)
+		goto out;
+
+	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cr_ses->seqid = slot->sl_seqid;
+
+	slot->sl_inuse = true;
+	cstate->slot = slot;
+	/* Ensure a page is used for the cache */
+	slot->sl_cache_entry.ce_cachethis = 1;
+out:
+	nfs4_unlock_state();
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	u32 status = nfserr_badsession;
+
+	/* Notes:
+	 * - The confirmed nfs4_client->cl_sessionid holds the destroyed
+	 *   sessionid
+	 * - Should we return nfserr_back_chan_busy if waiting for
+	 *   callbacks on to-be-destroyed session?
+	 * - Do we need to clear any callback info from previous session?
+	 */
+
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&sessionid_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	if (!ses) {
+		spin_unlock(&sessionid_lock);
+		goto out;
+	}
+
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+
+	/* wait for callbacks */
+	shutdown_callback_client(ses->se_client);
+	nfsd4_put_session(ses);
+	status = nfs_ok;
+out:
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_session *session;
+	struct nfsd4_slot *slot;
+	int status;
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	spin_lock(&sessionid_lock);
+	status = nfserr_badsession;
+	session = find_in_sessionid_hashtbl(&seq->sessionid);
+	if (!session)
+		goto out;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fnumslots)
+		goto out;
+
+	slot = &session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+	status = check_slot_seqid(seq->seqid, slot);
+	if (status == nfserr_replay_cache) {
+		cstate->slot = slot;
+		cstate->session = session;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_svc_encode_compoundres processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto replay_cache;
+	}
+	if (status)
+		goto out;
+
+	/* Success! bump slot seqid */
+	slot->sl_inuse = true;
+	slot->sl_seqid = seq->seqid;
+	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	/* Always set the cache entry cachethis for solo sequence */
+	if (nfsd4_is_solo_sequence(resp))
+		slot->sl_cache_entry.ce_cachethis = 1;
+
+	cstate->slot = slot;
+	cstate->session = session;
+
+replay_cache:
+	/* Renew the clientid on success and on replay.
+	 * Hold a session reference until done processing the compound:
+	 * nfsd4_put_session called only if the cstate slot is set.
+	 */
+	renew_client(session->se_client);
+	nfsd4_get_session(session);
+out:
+	spin_unlock(&sessionid_lock);
+	dprintk("%s: return %d\n", __func__, ntohl(status));
+	return status;
+}
+
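+/*
+ * Slot lifecycle note: a successful new SEQUENCE marks its slot
+ * sl_inuse and records seq->cachethis, so a retransmission arriving
+ * before the reply has been cached draws nfserr_jukebox from
+ * check_slot_seqid above.  The session reference taken here is dropped
+ * when the compound is done: nfsd4_put_session is called only if the
+ * cstate slot was set.
+ */
+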
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_str(dname, strhashval, false);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
-		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
-				|| conf->cl_addr != sin->sin_addr.s_addr) {
-			dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
-				&conf->cl_addr);
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			dprintk("NFSD: setclientid: string in use by client"
+				" at %pI4\n", &conf->cl_addr);
 			goto out;
 		}
 	}
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			unsigned int hash =
 				clientstr_hashval(unconf->cl_recdir);
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
-							    hash);
+							    hash, false);
 			if (conf) {
 				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
-		kref_init(&fp->fi_ref);
+		atomic_set(&fp->fi_ref, 1);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
+		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners); */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_stateowner(lock_sop);
-	}
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	if (sop->so_is_open_owner)
-		list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_entry(sop->so_stateids.next,
-				struct nfs4_stateid, st_perstateowner);
-		if (sop->so_is_open_owner)
-			release_stateid(stp, OPEN_STATE);
-		else
-			release_stateid(stp, LOCK_STATE);
-	}
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
-	unhash_stateowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
-}
-
 static inline void
 init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+		  &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
-	struct file *filp = stp->st_vfs_file;
-
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-	if (flags & OPEN_STATE) {
-		release_stateid_lockowners(stp);
-		stp->st_vfs_file = NULL;
-		nfsd_close(filp);
-	} else if (flags & LOCK_STATE)
-		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 	struct nfs4_file *fp;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
+			spin_unlock(&recall_lock);
 			return fp;
 		}
 	}
+	spin_unlock(&recall_lock);
 	return NULL;
 }
 
-static inline int access_valid(u32 x)
+static inline int access_valid(u32 x, u32 minorversion)
 {
-	if (x < NFS4_SHARE_ACCESS_READ)
+	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
 		return 0;
-	if (x > NFS4_SHARE_ACCESS_BOTH)
+	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	x &= ~NFS4_SHARE_ACCESS_MASK;
+	if (minorversion && x) {
+		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+			return 0;
+		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
			return 0;
+		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+	}
+	if (x)
 		return 0;
 	return 1;
 }
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 
 
 __be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
 			return nfserr_expired;
 		goto renew;
 	}
+	/* When sessions are used, skip open sequenceid processing */
+	if (nfsd4_has_session(cstate))
+		goto renew;
 	if (!sop->so_confirmed) {
 		/* Replace unconfirmed owners without checking for replay. */
 		clp = sop->so_client;
-		release_stateowner(sop);
+		release_openowner(sop);
 		open->op_stateowner = NULL;
 		goto renew;
 	}
@@ -1709,6 +2498,7 @@ out:
 __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	__be32 status;
 
 	status = nfserr_inval;
-	if (!access_valid(open->op_share_access)
+	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
 			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		init_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
-			release_stateid(stp, OPEN_STATE);
+			release_open_stateid(stp);
 			goto out;
 		}
+		if (nfsd4_has_session(&resp->cstate))
+			update_stateid(&stp->st_stateid);
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_stateowner->so_confirmed = 1;
+
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
 	 * OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
 	 * To finish the open response, we just need to set the rflags.
 	 */
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed)
+	if (!open->op_stateowner->so_confirmed &&
+	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		release_stateowner(sop);
+		release_openowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
 static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
-	/* Trying to call delegreturn with a special stateid? Yuch: */
-	if (!(flags & (RD_STATE | WR_STATE)))
-		return nfserr_bad_stateid;
-	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
 	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existence of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
  * that are not able to provide mandatory locking.
  */
 static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
 {
-	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& mandatory_lock(inode);
+	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
 {
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if ((flags & HAS_SESSION) && in->si_generation == 0)
+		goto out;
+
 	/* If the client sends us a stateid from the future, it's buggy: */
 	if (in->si_generation > ref->si_generation)
 		return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
 	 */
 	if (in->si_generation < ref->si_generation)
 		return nfserr_old_stateid;
+out:
 	return nfs_ok;
 }
 
+static int is_delegation_stateid(stateid_t *stateid)
+{
+	return stateid->si_fileid == 0;
+}
+
 /*
 * Checks for stateid operations
 */
 __be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	stateid_t *stidp;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	__be32 status;
 
-	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
-		stateid->si_boot, stateid->si_stateownerid,
-		stateid->si_fileid, stateid->si_generation);
 	if (filpp)
 		*filpp = NULL;
 
-	if (io_during_grace_disallowed(ino, flags))
+	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	/* STALE STATEID */
 	status = nfserr_stale_stateid;
 	if (STALE_STATEID(stateid))
 		goto out;
 
-	/* BAD STATEID */
 	status = nfserr_bad_stateid;
-	if (!stateid->si_fileid) { /* delegation stateid */
-		if(!(dp = find_delegation_stateid(ino, stateid))) {
-			dprintk("NFSD: delegation stateid not found\n");
+	if (is_delegation_stateid(stateid)) {
+		dp = find_delegation_stateid(ino, stateid);
+		if (!dp)
 			goto out;
-		}
-		stidp = &dp->dl_stateid;
+		status = check_stateid_generation(stateid, &dp->dl_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		renew_client(dp->dl_client);
+		if (filpp)
+			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
-		if (!(stp = find_stateid(stateid, flags))) {
-			dprintk("NFSD: open or lock stateid not found\n");
+		stp = find_stateid(stateid, flags);
+		if (!stp)
 			goto out;
-		}
-		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
 			goto out;
-		stidp = &stp->st_stateid;
-	}
-	status = check_stateid_generation(stateid, stidp);
-	if (status)
-		goto out;
-	if (stp) {
-		if ((status = nfs4_check_openmode(stp,flags)))
+		status = check_stateid_generation(stateid, &stp->st_stateid,
						  flags);
+		if (status)
 			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
+			goto out;
 		renew_client(stp->st_stateowner->so_client);
 		if (filpp)
 			*filpp = stp->st_vfs_file;
-	} else {
-		if ((status = nfs4_check_delegmode(dp, flags)))
-			goto out;
-		renew_client(dp->dl_client);
-		if (flags & DELEG_RET)
-			unhash_delegation(dp);
-		if (filpp)
-			*filpp = dp->dl_vfs_file;
 	}
 	status = nfs_ok;
 out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
  * Checks for sequence id mutating operations.
  */
 static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, int flags,
+			 struct nfs4_stateowner **sopp,
+			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 
 	if (STALE_STATEID(stateid))
 		return nfserr_stale_stateid;
+
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	/*
 	 * We return BAD_STATEID if filehandle doesn't match stateid,
 	 * the confirmed flag is incorrectly set, or the generation
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	if (lock->lk_is_new) {
 		if (!sop->so_is_open_owner)
 			return nfserr_bad_stateid;
-		if (!same_clid(&clp->cl_clientid, lockclid))
-			return nfserr_bad_stateid;
+		if (!(flags & HAS_SESSION) &&
+		    !same_clid(&clp->cl_clientid, lockclid))
+			return nfserr_bad_stateid;
 		/* stp is the open stateid */
 		status = nfs4_check_openmode(stp, lkflg);
 		if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	 * For the moment, we ignore the possibility of
 	 * generation number wraparound.
 	 */
-	if (seqid != sop->so_seqid)
+	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
 		goto check_replay;
 
 	if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		       " confirmed yet!\n");
 		return nfserr_bad_stateid;
 	}
-	status = check_stateid_generation(stateid, &stp->st_stateid);
+	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
 	if (status)
 		return status;
 	renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					CONFIRM | OPEN_STATE,
 					&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access)
+	if (!access_valid(od->od_share_access, cstate->minorversion)
 			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					od->od_seqid,
 					&od->od_stateid,
 					OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					close->cl_seqid,
 					&close->cl_stateid,
 					OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
 	/* release_stateid() calls nfsd_close() if needed */
-	release_stateid(stp, OPEN_STATE);
+	release_open_stateid(stp);
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_delegreturn *dr)
 {
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct inode *inode;
 	__be32 status;
+	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
-		goto out;
+		return status;
+	inode = cstate->current_fh.fh_dentry->d_inode;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&dr->dr_stateid, DELEG_RET, NULL);
-	nfs4_unlock_state();
+	status = nfserr_bad_stateid;
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		goto out;
+	status = nfserr_stale_stateid;
+	if (STALE_STATEID(stateid))
+		goto out;
+	status = nfserr_bad_stateid;
+	if (!is_delegation_stateid(stateid))
+		goto out;
+	dp = find_delegation_stateid(inode, stateid);
+	if (!dp)
+		goto out;
+	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	if (status)
+		goto out;
+	renew_client(dp->dl_client);
+
+	unhash_delegation(dp);
 out:
+	nfs4_unlock_state();
+
 	return status;
 }
 
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_file *fp;
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lock->lk_new_clientid))
+	if (!nfsd4_has_session(cstate) &&
+	    STALE_CLIENTID(&lock->lk_new_clientid))
 		goto out;
 
 	/* validate and update open stateid and open seqid */
-	status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	status = nfs4_preprocess_seqid_op(cstate,
 			        lock->lk_new_open_seqid,
 		                &lock->lk_new_open_stateid,
 				OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
 				       &lock->lk_old_lock_stateid,
 				       LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
-		release_stateowner(lock_sop);
+		release_lockowner(lock_sop);
 	if (lock->lk_replay_owner) {
 		nfs4_get_stateowner(lock->lk_replay_owner);
 		cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lockt->lt_clientid))
+	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					locku->lu_seqid,
 					&locku->lu_stateid,
 					LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		/* unhash_stateowner deletes so_perclient only
 		 * for openowners. */
 		list_del(&sop->so_perclient);
-		release_stateowner(sop);
+		release_lockowner(sop);
 	}
 out:
 	nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
 }
 
 int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 {
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval);
+	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
 	return clp ? 1 : 0;
 }
 
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
 		INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
 		INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
 	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&sessionid_hashtbl[i]);
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
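NFS4_CREATE_EXCLUSIVE4_1 (rejected above when minorversion is 0) carries both the classic exclusive-create verifier and a createattrs fattr, which is why it is decoded against nfsd41_ex_attrmask rather than the ordinary writeable mask. Reduced to predicates, the wire content of the four createhow4 arms (names local to this sketch):

    enum createmode { CM_UNCHECKED, CM_GUARDED, CM_EXCLUSIVE, CM_EXCLUSIVE4_1 };

    /* which fields follow the mode discriminant on the wire */
    static int carries_fattr(enum createmode m)
    {
        return m == CM_UNCHECKED || m == CM_GUARDED || m == CM_EXCLUSIVE4_1;
    }

    static int carries_verifier(enum createmode m)
    {
        return m == CM_EXCLUSIVE || m == CM_EXCLUSIVE4_1;
    }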
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
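The decoders above skip variable-length fields they deliberately ignore (state-protection algorithm lists, implementation ids, GSS handles) with the recurring pattern READ_BUF(len); p += XDR_QUADLEN(len). XDR pads opaque data to a 4-byte boundary, and XDR_QUADLEN converts a byte count into the number of 32-bit words actually consumed. A compilable restatement of the idiom:

    #include <stdint.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)  /* bytes -> 4-byte XDR units, rounded up */

    /* skip one XDR opaque<>: a 4-byte length followed by padded data
     * (host-order length here; the wire is big-endian) */
    static const uint32_t *xdr_skip_opaque(const uint32_t *p)
    {
        uint32_t len = *p++;

        return p + XDR_QUADLEN(len);
    }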
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
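With a second decoder table registered for minorversion 1, dispatch reduces to picking the table for the compound's minorversion and falling back to the notsupp stub for holes or out-of-range opcodes; the nops field exists so the compound decoder can bounds-check before indexing. A minimal model of that shape (all names local to the sketch):

    typedef int (*dec_fn)(void *argp);

    static int dec_notsupp(void *argp) { (void)argp; return -1; }
    static int dec_access(void *argp)  { (void)argp; return 0; }

    static dec_fn v40_ops[] = { dec_access /* , ... */ };
    static dec_fn v41_ops[] = { dec_access /* , ... */ };

    struct mv_ops { dec_fn *decoders; unsigned nops; };

    static struct mv_ops mv_table[] = {
        [0] = { v40_ops, sizeof(v40_ops) / sizeof(v40_ops[0]) },
        [1] = { v41_ops, sizeof(v41_ops) / sizeof(v41_ops[0]) },
    };

    static int decode_op(unsigned minor, unsigned opnum, void *argp)
    {
        if (minor >= sizeof(mv_table) / sizeof(mv_table[0]))
            return -1;                            /* unknown minorversion */
        if (opnum >= mv_table[minor].nops || !mv_table[minor].decoders[opnum])
            return dec_notsupp(argp);             /* hole or out of range */
        return mv_table[minor].decoders[opnum](argp);
    }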
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
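The encoder now emits the shortest bitmap that covers the highest nonzero word: three words when bmval2 is set, otherwise two, otherwise one, matching what the decode side accepts. The same logic as a standalone helper (host-order for brevity; the kernel's WRITE32 handles byte order):

    #include <stdint.h>

    /* write a bitmap4 with the minimum word count; returns the new cursor */
    static uint32_t *encode_bitmap3(uint32_t *p, uint32_t w0, uint32_t w1, uint32_t w2)
    {
        uint32_t len = w2 ? 3 : (w1 ? 2 : 1);

        *p++ = len;
        *p++ = w0;
        if (len >= 2)
            *p++ = w1;
        if (len == 3)
            *p++ = w2;
        return p;
    }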
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2943}
2573 2944
2574static __be32 2945static __be32
2946nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2947 struct nfsd4_exchange_id *exid)
2948{
2949 ENCODE_HEAD;
2950 char *major_id;
2951 char *server_scope;
2952 int major_id_sz;
2953 int server_scope_sz;
2954 uint64_t minor_id = 0;
2955
2956 if (nfserr)
2957 return nfserr;
2958
2959 major_id = utsname()->nodename;
2960 major_id_sz = strlen(major_id);
2961 server_scope = utsname()->nodename;
2962 server_scope_sz = strlen(server_scope);
2963
2964 RESERVE_SPACE(
2965 8 /* eir_clientid */ +
2966 4 /* eir_sequenceid */ +
2967 4 /* eir_flags */ +
2968 4 /* spr_how (SP4_NONE) */ +
2969 8 /* so_minor_id */ +
2970 4 /* so_major_id.len */ +
2971 (XDR_QUADLEN(major_id_sz) * 4) +
2972 4 /* eir_server_scope.len */ +
2973 (XDR_QUADLEN(server_scope_sz) * 4) +
2974 4 /* eir_server_impl_id.count (0) */);
2975
2976 WRITEMEM(&exid->clientid, 8);
2977 WRITE32(exid->seqid);
2978 WRITE32(exid->flags);
2979
2980 /* state_protect4_r. Currently only support SP4_NONE */
2981 BUG_ON(exid->spa_how != SP4_NONE);
2982 WRITE32(exid->spa_how);
2983
2984 /* The server_owner struct */
2985 WRITE64(minor_id); /* Minor id */
2986 /* major id */
2987 WRITE32(major_id_sz);
2988 WRITEMEM(major_id, major_id_sz);
2989
2990 /* Server scope */
2991 WRITE32(server_scope_sz);
2992 WRITEMEM(server_scope, server_scope_sz);
2993
2994 /* Implementation id */
2995 WRITE32(0); /* zero length nfs_impl_id4 array */
2996 ADJUST_ARGS();
2997 return 0;
2998}
2999
3000static __be32
3001nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3002 struct nfsd4_create_session *sess)
3003{
3004 ENCODE_HEAD;
3005
3006 if (nfserr)
3007 return nfserr;
3008
3009 RESERVE_SPACE(24);
3010 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3011 WRITE32(sess->seqid);
3012 WRITE32(sess->flags);
3013 ADJUST_ARGS();
3014
3015 RESERVE_SPACE(28);
3016 WRITE32(0); /* headerpadsz */
3017 WRITE32(sess->fore_channel.maxreq_sz);
3018 WRITE32(sess->fore_channel.maxresp_sz);
3019 WRITE32(sess->fore_channel.maxresp_cached);
3020 WRITE32(sess->fore_channel.maxops);
3021 WRITE32(sess->fore_channel.maxreqs);
3022 WRITE32(sess->fore_channel.nr_rdma_attrs);
3023 ADJUST_ARGS();
3024
3025 if (sess->fore_channel.nr_rdma_attrs) {
3026 RESERVE_SPACE(4);
3027 WRITE32(sess->fore_channel.rdma_attrs);
3028 ADJUST_ARGS();
3029 }
3030
3031 RESERVE_SPACE(28);
3032 WRITE32(0); /* headerpadsz */
3033 WRITE32(sess->back_channel.maxreq_sz);
3034 WRITE32(sess->back_channel.maxresp_sz);
3035 WRITE32(sess->back_channel.maxresp_cached);
3036 WRITE32(sess->back_channel.maxops);
3037 WRITE32(sess->back_channel.maxreqs);
3038 WRITE32(sess->back_channel.nr_rdma_attrs);
3039 ADJUST_ARGS();
3040
3041 if (sess->back_channel.nr_rdma_attrs) {
3042 RESERVE_SPACE(4);
3043 WRITE32(sess->back_channel.rdma_attrs);
3044 ADJUST_ARGS();
3045 }
3046 return 0;
3047}
3048
3049static __be32
3050nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3051 struct nfsd4_destroy_session *destroy_session)
3052{
3053 return nfserr;
3054}
3055
3056__be32
3057nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3058 struct nfsd4_sequence *seq)
3059{
3060 ENCODE_HEAD;
3061
3062 if (nfserr)
3063 return nfserr;
3064
3065 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3066 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3067 WRITE32(seq->seqid);
3068 WRITE32(seq->slotid);
3069 WRITE32(seq->maxslots);
3070 /*
3071 * FIXME: for now:
3072 * target_maxslots = maxslots
3073 * status_flags = 0
3074 */
3075 WRITE32(seq->maxslots);
3076 WRITE32(0);
3077
3078 ADJUST_ARGS();
3079 return 0;
3080}
3081
3082static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3083nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3084{
2577 return nfserr; 3085 return nfserr;
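The RESERVE_SPACE call in nfsd4_encode_exchange_id sums the fixed-size reply fields plus two XDR-padded strings (major id and server scope, both taken from the node name). The same arithmetic as a checkable function, reusing the XDR_QUADLEN rounding rule:

    #include <stddef.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)

    /* bytes needed for an EXCHANGE_ID reply with SP4_NONE and an empty
     * nfs_impl_id4 array, mirroring the RESERVE_SPACE sum above */
    static size_t exid_reply_bytes(size_t major_id_sz, size_t server_scope_sz)
    {
        return 8                                      /* eir_clientid */
             + 4                                      /* eir_sequenceid */
             + 4                                      /* eir_flags */
             + 4                                      /* spr_how (SP4_NONE) */
             + 8                                      /* so_minor_id */
             + 4 + 4 * XDR_QUADLEN(major_id_sz)       /* so_major_id */
             + 4 + 4 * XDR_QUADLEN(server_scope_sz)   /* eir_server_scope */
             + 4;                                     /* eir_server_impl_id count (0) */
    }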
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3087
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3088typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3089
 3090/*
 3091 * Note: the nfsd4_enc_ops vector is shared by v4.0 and v4.1,
 3092 * since obsolete ops are already filtered out during decoding,
 3093 * so there is no need to do it again here.
 3094 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3095static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3096 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3097 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3130 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3131 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3132 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3133
3134 /* NFSv4.1 operations */
3135 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3136 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3137 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3138 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3139 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3140 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3141 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3142 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3143 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3144 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3145 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3147 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3148 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3149 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3152 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3153 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3154};
2621 3155
3156/*
3157 * Calculate the total amount of memory that the compound response has taken
3158 * after encoding the current operation.
3159 *
3160 * pad: add on 8 bytes for the next operation's op_code and status so that
3161 * there is room to cache a failure on the next operation.
3162 *
3163 * Compare this length to the session se_fmaxresp_cached.
3164 *
3165 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3166 * will be at least a page and will therefore hold the xdr_buf head.
3167 */
3168static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3169{
3170 int status = 0;
3171 struct xdr_buf *xb = &resp->rqstp->rq_res;
3172 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3173 struct nfsd4_session *session = NULL;
3174 struct nfsd4_slot *slot = resp->cstate.slot;
3175 u32 length, tlen = 0, pad = 8;
3176
3177 if (!nfsd4_has_session(&resp->cstate))
3178 return status;
3179
3180 session = resp->cstate.session;
3181 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3182 return status;
3183
3184 if (resp->opcnt >= args->opcnt)
3185 pad = 0; /* this is the last operation */
3186
3187 if (xb->page_len == 0) {
3188 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3189 } else {
3190 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3191 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3192
3193 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3194 }
3195 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3196 length, xb->page_len, tlen, pad);
3197
3198 if (length <= session->se_fmaxresp_cached)
3199 return status;
3200 else
3201 return nfserr_rep_too_big_to_cache;
3202}
3203
2622void 3204void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3205nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3206{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3217 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3218 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3219 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3220 /* nfsd4_check_drc_limit guarantees enough room for error status */
3221 if (!op->status && nfsd4_check_drc_limit(resp))
3222 op->status = nfserr_rep_too_big_to_cache;
2638status: 3223status:
2639 /* 3224 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3225 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3320 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3321 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3322 BUG_ON(iov->iov_len > PAGE_SIZE);
3323 if (nfsd4_has_session(&resp->cstate)) {
3324 if (resp->cstate.status == nfserr_replay_cache &&
3325 !nfsd4_not_cached(resp)) {
3326 iov->iov_len = resp->cstate.iovlen;
3327 } else {
3328 nfsd4_store_cache_entry(resp);
3329 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3330 resp->cstate.slot->sl_inuse = 0;
3331 }
3332 if (resp->cstate.session)
3333 nfsd4_put_session(resp->cstate.session);
3334 }
2738 return 1; 3335 return 1;
2739} 3336}
2740 3337
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
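__write_versions now accepts "+4.1"/"-4.1" style tokens alongside the bare major numbers: a '.' after the major number routes the token to nfsd_minorversion(), minor versions below major 4 are rejected, and minor 0 is rejected because plain "4" already controls it. A userspace sketch of the token grammar (strtol standing in for simple_strtol):

    #include <stdlib.h>

    /* parse one versions token: [+|-]major[.minor]; returns 0 on success */
    static int parse_vers_token(const char *tok, long *major,
                                unsigned long *minor, int *enable)
    {
        char *end;

        *enable = (*tok != '-');
        if (*tok == '+' || *tok == '-')
            tok++;

        *major = strtol(tok, &end, 0);
        *minor = 0;
        if (*end == '.') {
            if (*major < 4)
                return -1;          /* only NFSv4 has minor versions */
            *minor = strtoul(end + 1, NULL, 0);
            if (*minor == 0)
                return -1;          /* 4.0 is toggled by plain "4" */
        }
        return 0;
    }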
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index bc3567bab8c4..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
 226 * Each session guarantees a negotiated per-slot memory cache for replies,
 227 * which in turn consumes memory beyond what a v2/v3/v4.0 server needs. A
 228 * dedicated NFSv4.1 server might want to use more memory for a DRC than a
 229 * machine running multiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC which varies
 232 * according to the machine's free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
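NFSD_DRC_SIZE_SHIFT of 7 caps the session reply cache at 1/128 (about 0.8%) of the free buffer pages at service-creation time. A back-of-envelope check, assuming 4 KiB pages:

    #include <stdio.h>

    #define NFSD_DRC_SIZE_SHIFT 7    /* cap = free buffer pages / 128 */

    int main(void)
    {
        unsigned long free_pages = 262144;    /* 1 GiB of 4 KiB pages */
        unsigned long drc_pages = free_pages >> NFSD_DRC_SIZE_SHIFT;

        printf("DRC cap: %lu pages (%lu KiB)\n", drc_pages, drc_pages * 4);
        /* prints: DRC cap: 2048 pages (8192 KiB) */
        return 0;
    }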
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -403,7 +428,6 @@ static int
403nfsd(void *vrqstp) 428nfsd(void *vrqstp)
404{ 429{
405 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 430 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
406 struct fs_struct *fsp;
407 int err, preverr = 0; 431 int err, preverr = 0;
408 432
409 /* Lock module and set up kernel thread */ 433 /* Lock module and set up kernel thread */
@@ -412,13 +436,11 @@ nfsd(void *vrqstp)
412 /* At this point, the thread shares current->fs 436 /* At this point, the thread shares current->fs
413 * with the init process. We need to create files with a 437 * with the init process. We need to create files with a
414 * umask of 0 instead of init's umask. */ 438 * umask of 0 instead of init's umask. */
415 fsp = copy_fs_struct(current->fs); 439 if (unshare_fs_struct() < 0) {
416 if (!fsp) {
417 printk("Unable to start nfsd thread: out of memory\n"); 440 printk("Unable to start nfsd thread: out of memory\n");
418 goto out; 441 goto out;
419 } 442 }
420 exit_fs(current); 443
421 current->fs = fsp;
422 current->fs->umask = 0; 444 current->fs->umask = 0;
423 445
424 /* 446 /*
@@ -463,8 +485,6 @@ nfsd(void *vrqstp)
463 continue; 485 continue;
464 } 486 }
465 487
466 update_thread_usage(atomic_read(&nfsd_busy));
467 atomic_inc(&nfsd_busy);
468 488
469 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
470 exp_readlock(); 490 exp_readlock();
@@ -473,8 +493,6 @@ nfsd(void *vrqstp)
473 493
474 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
475 exp_readunlock(); 495 exp_readunlock();
476 update_thread_usage(atomic_read(&nfsd_busy));
477 atomic_dec(&nfsd_busy);
478 } 496 }
479 497
480 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -542,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
542 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
543 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
544 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
545 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
546 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
547 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -573,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
573 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
574 return 1; 596 return 1;
575} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 366 }
367 367
368 /* Revoke setuid/setgid on chown */ 368 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 369 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 370 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
371 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 372 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 373 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 374 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 961static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 962nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 963 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 964 unsigned long *cnt, int *stablep)
964{ 965{
965 struct svc_export *exp; 966 struct svc_export *exp;
966 struct dentry *dentry; 967 struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 975 err = nfserr_perm;
975 976
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 977 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 978 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 979 goto out;
979#endif 980#endif
980 981
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1010 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1011 set_fs(oldfs);
1011 if (host_err >= 0) { 1012 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1013 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1014 fsnotify_modify(file->f_path.dentry);
1014 } 1015 }
1015 1016
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1055 }
1055 1056
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1057 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1058 if (host_err >= 0) {
1058 err = 0; 1059 err = 0;
1059 else 1060 *cnt = host_err;
1061 } else
1060 err = nfserrno(host_err); 1062 err = nfserrno(host_err);
1061out: 1063out:
1062 return err; 1064 return err;
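Passing cnt by pointer lets nfsd_vfs_write report the byte count vfs_writev actually returned, so the io_write statistics and the caller's reply no longer assume the whole request was written. A userspace analogue of the in/out convention:

    #include <sys/uio.h>
    #include <errno.h>

    /* cnt is in/out: bytes requested on entry, bytes written on success */
    static int write_vec(int fd, const struct iovec *vec, int vlen,
                         unsigned long *cnt)
    {
        ssize_t n = writev(fd, vec, vlen);

        if (n < 0)
            return -errno;
        *cnt = (unsigned long)n;    /* may be shorter than requested */
        return 0;
    }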
@@ -1098,7 +1100,7 @@ out:
1098 */ 1100 */
1099__be32 1101__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1102nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1103 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1104 int *stablep)
1103{ 1105{
1104 __be32 err = 0; 1106 __be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1181 return 0;
1180} 1182}
1181 1183
 1184/* An HPUX client sometimes creates a file in mode 000 and sets its size to 0.
 1185 * Setting the size to 0 may fail on some file systems, because the
 1186 * permission check requires WRITE permission while the mode is 000.
 1187 * We ignore the resize (to 0) on a freshly created file, since its size
 1188 * is already 0 after creation.
 1189 *
 1190 * Call this only after vfs_create() has been called.
 1191 */
1192static void
1193nfsd_check_ignore_resizing(struct iattr *iap)
1194{
1195 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1196 iap->ia_valid &= ~ATTR_SIZE;
1197}
1198
1182/* 1199/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1200 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1201 * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1291 switch (type) {
1275 case S_IFREG: 1292 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1293 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1294 if (!host_err)
1295 nfsd_check_ignore_resizing(iap);
1277 break; 1296 break;
1278 case S_IFDIR: 1297 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1298 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1446 /* setattr will sync the child (or not) */
1428 } 1447 }
1429 1448
1449 nfsd_check_ignore_resizing(iap);
1450
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1451 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1452 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1453 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
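nilfs_palloc_init_blockgroup derives the allocator geometry from the block size: the groups_count computation above implies one bitmap bit per entry (so 8 x blocksize entries per group), each group needs its entry blocks plus one bitmap block, and a descriptor block fronts each run of groups. A worked example, assuming 4 KiB blocks, a hypothetical 128-byte entry size, and a group descriptor holding only the pg_nfrees word seen here (4 bytes):

    #include <stdio.h>

    int main(void)
    {
        unsigned long blkbits = 12;                /* 4 KiB blocks */
        unsigned long blksize = 1UL << blkbits;
        unsigned long entry_sz = 128;              /* assumed entry size */

        unsigned long entries_per_group = blksize * 8;  /* bits in one bitmap block */
        unsigned long entries_per_block = blksize / entry_sz;
        unsigned long groups_per_desc = blksize / 4;    /* 4-byte group descriptor */
        unsigned long blocks_per_group =
            (entries_per_group + entries_per_block - 1) / entries_per_block + 1;
        unsigned long blocks_per_desc_block =
            groups_per_desc * blocks_per_group + 1;

        printf("%lu entries/group, %lu blocks/group, %lu blocks/desc block\n",
               entries_per_group, blocks_per_group, blocks_per_desc_block);
        /* 32768 entries/group, 1025 blocks/group, 1049601 blocks/desc block */
        return 0;
    }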
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
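nilfs_palloc_find_available_slot starts at the caller's preferred bit, wraps around the group, skips all-ones words quickly, and claims the first zero bit with an atomic test-and-set under the block-group lock. Stripped of the locking and the word-at-a-time fast path, the search reduces to this non-atomic sketch:

    #include <limits.h>

    #define BITS_PER_ULONG  (sizeof(unsigned long) * CHAR_BIT)

    static int test_and_set_bit_ul(unsigned long *map, int pos)
    {
        unsigned long mask = 1UL << (pos % BITS_PER_ULONG);
        unsigned long *word = map + pos / BITS_PER_ULONG;
        int was_set = (*word & mask) != 0;

        *word |= mask;
        return was_set;
    }

    /* scan bsize bits from target, wrapping; claim and return the first
     * zero bit, or -1 when the group is full (-ENOSPC above) */
    static int find_and_claim(unsigned long *map, int bsize, int target)
    {
        int i;

        for (i = 0; i < bsize; i++) {
            int pos = (target + i) % bsize;

            if (!test_and_set_bit_ul(map, pos))
                return pos;
        }
        return -1;
    }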
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392 printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
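/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_palloc_freev() frees @nitems entries listed
 * in @entry_nrs; each pass of the outer loop clears one run of entries
 * that share a block group, so the descriptor and bitmap blocks are read
 * and dirtied once per group rather than once per entry.  Callers
 * therefore benefit from passing entry numbers grouped by block group.
 */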
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35 return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
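/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  The declarations above form a
 * prepare/commit/abort triple; a hypothetical caller allocating a single
 * entry (the helper name is an assumption for illustration; pr_entry_nr
 * doubles as the search hint, as the prepare routine shows) might look
 * like this:
 */
static int nilfs_palloc_alloc_entry_sketch(struct inode *inode, __u64 *nrp)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = 0 };
	int err;

	err = nilfs_palloc_prepare_alloc_entry(inode, &req);
	if (err)
		return err;	/* e.g. -ENOSPC when every group is full */
	/*
	 * The entry block for req.pr_entry_nr would be initialized here;
	 * on failure, nilfs_palloc_abort_alloc_entry(inode, &req) rolls
	 * the reservation back instead of committing it.
	 */
	nilfs_palloc_commit_alloc_entry(inode, &req);
	*nrp = req.pr_entry_nr;
	return 0;
}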
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..24638e059bf3
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,783 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed to by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
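/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  When the per-type bop_check_insert hook reports
 * that one more record would overflow the current representation (a
 * positive return value), nilfs_bmap_do_insert() gathers the existing
 * key/pointer pairs and converts the bmap from the small direct form to
 * the large B-tree form while inserting, setting the NILFS_BMAP_LARGE
 * flag on success.
 */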
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
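/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  Truncation walks backwards: it repeatedly looks up
 * the current last key and deletes it until the last key drops below
 * @key.  -ENOENT from bop_last_key means the map has run empty and is
 * treated as success.
 */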
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect dirty buffers held under a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bh: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bh.
325 *
326 * Return Value: On success, 0 is returned and the buffer head of a newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the place pointed to by @bh and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to
382 * test and clear the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
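/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_bmap_data_get_key() recovers the file-block
 * key of a data buffer from its page-cache position: the page index
 * yields the first block of the page, and walking the page's buffer ring
 * counts the buffer's offset within the page.
 */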
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
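/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  The helper below spreads the allocation targets of
 * different inodes across a DAT block group by striping on the inode
 * number, dividing each group into NILFS_BMAP_GROUP_DIV slices.
 */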
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691/**
692 * nilfs_bmap_read - read a bmap from an inode
693 * @bmap: bmap
694 * @raw_inode: on-disk inode
695 *
696 * Description: nilfs_bmap_read() initializes @bmap from @raw_inode (zero-filling the bmap data when @raw_inode is NULL).
697 *
698 * Return Value: On success, 0 is returned. On error, the following negative
699 * error code is returned.
700 *
701 * %-ENOMEM - Insufficient amount of memory available.
702 */
703int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
704{
705 if (raw_inode == NULL)
706 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
707 else
708 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
709
710 init_rwsem(&bmap->b_sem);
711 bmap->b_state = 0;
712 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
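	/*
	 * Editor's note: descriptive comment added for this writeup, not
	 * part of the original patch.  The pointer-operation table is
	 * chosen per inode: the DAT uses the "p" ops, which hand out
	 * pointers from a plain counter instead of going through the DAT
	 * itself; the cpfile and sufile use the "vmdt" ops, identical to
	 * the "v" ops except that commit_end calls nilfs_dat_commit_end()
	 * with its last argument set to 1; every other inode uses the
	 * DAT-backed "v" ops.
	 */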
713 switch (bmap->b_inode->i_ino) {
714 case NILFS_DAT_INO:
715 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
716 bmap->b_last_allocated_key = 0; /* XXX: use macro */
717 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
718 break;
719 case NILFS_CPFILE_INO:
720 case NILFS_SUFILE_INO:
721 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
722 bmap->b_last_allocated_key = 0; /* XXX: use macro */
723 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
724 break;
725 default:
726 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
727 bmap->b_last_allocated_key = 0; /* XXX: use macro */
728 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
729 break;
730 }
731
732 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
733 nilfs_btree_init(bmap,
734 NILFS_BMAP_LARGE_LOW,
735 NILFS_BMAP_LARGE_HIGH) :
736 nilfs_direct_init(bmap,
737 NILFS_BMAP_SMALL_LOW,
738 NILFS_BMAP_SMALL_HIGH);
739}
740
741/**
742 * nilfs_bmap_write - write back a bmap to an inode
743 * @bmap: bmap
744 * @raw_inode: on-disk inode
745 *
746 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
747 */
748void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
749{
750 down_write(&bmap->b_sem);
751 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
752 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
753 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
754 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
755
756 up_write(&bmap->b_sem);
757}
758
759void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
760{
761 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
762 init_rwsem(&bmap->b_sem);
763 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
764 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
765 bmap->b_last_allocated_key = 0;
766 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
767 bmap->b_state = 0;
768 nilfs_btree_init_gc(bmap);
769}
770
771void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
772{
773 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
774 init_rwsem(&gcbmap->b_sem);
775 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
776}
777
778void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
779{
780 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
781 init_rwsem(&bmap->b_sem);
782 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
783}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
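/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  The internal
 * prepare/commit/abort update helpers declared above are meant to bracket
 * the rewiring of a pointer like a small transaction; a hypothetical
 * wrapper (the name is an assumption for illustration) might look like
 * this:
 */
static int nilfs_bmap_update_ptr_sketch(struct nilfs_bmap *bmap,
					union nilfs_bmap_ptr_req *oldreq,
					union nilfs_bmap_ptr_req *newreq)
{
	int err;

	err = nilfs_bmap_prepare_update(bmap, oldreq, newreq);
	if (err)
		return err;
	/*
	 * The caller would switch the affected slot over to
	 * newreq->bpr_ptr here; on failure it would call
	 * nilfs_bmap_abort_update(bmap, oldreq, newreq) instead.
	 */
	nilfs_bmap_commit_update(bmap, oldreq, newreq);
	return 0;
}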
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * union nilfs_bmap_union - in-core bmap representations
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53 btnc->host = NULL; /* can safely set to host inode ? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
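/*
 * Editor's note: descriptive comment added for this writeup, not part of
 * the original patch.  nilfs_btnode_submit_block() grabs (or creates) the
 * node buffer for @blocknr and, unless the buffer is already valid,
 * submits a read.  When @pblocknr is zero and the cache is not the DAT's,
 * @blocknr is treated as a virtual block number and translated through
 * the DAT first; -EEXIST is used internally to signal a cache hit that
 * needs no I/O.
 */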
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing the buffer if the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key
184 * prepare to move the contents of the block at the old key to the new key.
185 * The old buffer is not removed, but may be reused as the new buffer.
186 * May return -ENOMEM on memory allocation failure, or -EIO on a disk
187 * read error.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload for the kernels older
207 * than 2.6.23, because it is not exported for modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223 /*
224 * Note: page->index will not change to newkey until
225 * nilfs_btnode_commit_change_key() is called.
226 * To protect the page in this intermediate state, the page lock
227 * is held.
228 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key
256 * commit the change_key operation prepared by prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key
298 * abort the change_key operation prepared by prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
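/*
 * Editor's note: the block below is an illustrative sketch added for this
 * writeup, not part of the original patch.  Re-keying a cached B-tree
 * node follows the same prepare/commit/abort discipline as the
 * allocators; a hypothetical caller (the helper name is an assumption for
 * illustration) might look like this:
 */
static int nilfs_btnode_rekey_sketch(struct address_space *btnc,
				     struct buffer_head *bh,
				     __u64 oldkey, __u64 newkey)
{
	struct nilfs_btnode_chkey_ctxt ctxt = {
		.oldkey = oldkey,
		.newkey = newkey,
		.bh = bh,
	};
	int err;

	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
	if (err)
		return err;	/* -ENOMEM or -EIO */
	nilfs_btnode_commit_change_key(btnc, &ctxt);
	/* ctxt.bh now refers to the buffer cached under @newkey */
	return 0;
}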
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
 * @bp_ctxt: change-key context used when a node buffer moves in the cache
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
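/*
 * Binary-search @node for @key.  Returns nonzero iff the key was found
 * exactly.  *indexp receives the matching slot on success; otherwise it
 * is adjusted to the child slot to descend into (internal nodes) or to
 * the insertion point (lowest node level).
 */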
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
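/*
 * Walk from the root down to @minlevel, recording the buffer head and
 * child index visited at each level in @path.  Returns 0 and the final
 * pointer via *ptrp when @key exists, -ENOENT otherwise; even on
 * -ENOENT the recorded path marks where an insertion would go.
 */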
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
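/*
 * When the smallest key of a node changes, the copy of that key held by
 * each ancestor at index 0 must be updated as well; the loop stops at
 * the first ancestor where the changed entry is not the leftmost one.
 */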
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
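/*
 * Insertion into a full node with a non-full left sibling: shift
 * children left so the pair is balanced (counting the pending entry),
 * then insert.  If the insertion point is among the shifted children,
 * the path is redirected to the left sibling first.
 */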
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
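/*
 * Classic node split: move the upper half of a full node into the new
 * sibling, insert into whichever half now covers the insertion index,
 * and hand the sibling's first key plus the newly reserved block
 * pointer up to the parent level via *keyp / *ptrp.
 */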
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
799
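/*
 * Grow the tree by one level: every child of the root is moved into a
 * freshly allocated block, which becomes the root's only child, and the
 * root's level is raised by one.
 */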
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
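/*
 * Allocation hint: prefer the pointer just before the insertion point
 * (the left neighbour at the lowest node level), falling back to the
 * parent node's own pointer, so that newly allocated blocks tend to
 * land near related ones.
 */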
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
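/*
 * First phase of the two-phase insert: walk up from the leaf reserving
 * new pointers and choosing a fixup per level -- plain insert when the
 * node has room, carry into a non-full left or right sibling, split
 * into a new block, or, at the root, grow the tree -- without modifying
 * any node yet.  Commit replays the recorded bp_op handlers in order;
 * on error every reservation made so far is aborted.
 */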
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
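	/* ptr must be a pointer to a buffer head. */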
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1283
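/*
 * Deletion mirror of the insert preparation: per level, delete in place
 * if the node stays above the minimum, otherwise borrow entries from a
 * roomy sibling or concatenate with it; when the root's last remaining
 * child becomes small enough, the tree is shrunk by one level.
 */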
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1434
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
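/*
 * Decide whether deleting @key would leave the mapping small enough to
 * convert back to a direct bmap: true when @key is the current maximum
 * key and the next largest key fits below the low-key threshold.
 */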
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
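/*
 * Conversion from a direct mapping reserves one pointer (@dreq) for the
 * new data block and, when the old entries no longer fit in the root
 * node, a second one (@nreq) for a level-1 child node block.
 */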
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585 /* for data */
1586 /* cannot find near ptr */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619 return ret;
1620
1621}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert a direct bmap to a B-tree and insert a new entry
1691 * @bmap: bmap whose mapping is converted
1692 * @key: key of the entry being inserted
1693 * @ptr: buffer head of the new block, cast to __u64
1694 * @keys: array of keys already held by the direct mapping
1695 * @ptrs: array of pointers already held by the direct mapping
1696 * @n: number of entries in @keys and @ptrs
1697 * @low: lower bound of the key range
1698 * @high: upper bound of the key range
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
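/*
 * Virtual-block update: reserve a new virtual block number for a node
 * being rewritten (requesting the old pointer plus one) and, for btnode
 * buffers, stage the page-cache key change; the commit and abort
 * helpers below finish or roll back both steps.
 */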
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1832
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
1929 __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
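/*
 * Collect the dirty btnode buffers from the page cache, bucketed by
 * B-tree level and kept sorted by first key within each level, then
 * splice the buckets onto @listp from the lowest level up so that
 * children precede their parents.
 */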
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
2009
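/*
 * At segment construction time the real disk block number becomes
 * known: record it in the parent's pointer slot (renaming the btnode in
 * the page cache when the buffer is a B-tree node block) and emit the
 * on-disk binfo describing the block.
 */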
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
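/*
 * Note: the DAT file is the only B-tree whose pointers are raw disk
 * block numbers, so it gets the "p" (physical) operations; every other
 * inode stores virtual block numbers that the DAT translates, hence the
 * "v" ops. That is why the switch above keys on NILFS_DAT_INO.
 */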
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
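/*
 * Worked example (illustrative node size, not part of the original
 * header): for a hypothetical nodesize of 4096 bytes,
 * sizeof(struct nilfs_btree_node) == 8 and the extra pad is 8, so
 *
 *	NILFS_BTREE_NODE_NCHILDREN_MAX(4096) == (4096 - 8 - 8) / 16 == 255
 *	NILFS_BTREE_NODE_NCHILDREN_MIN(4096) == (255 - 1) / 2 + 1 == 128
 *
 * i.e. a 4 KiB node holds at most 255 (dkey, dptr) pairs and is kept
 * at least about half full.
 */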
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..e90b60dfced9
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,925 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
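/*
 * Example with illustrative values (assuming mi_first_entry_offset == 1
 * and 8 checkpoints per block): for cno == 10, tcno == 10 + 1 - 1 == 10,
 * so nilfs_cpfile_get_blkoff() yields 10 / 8 == 1 and
 * nilfs_cpfile_get_offset() yields 10 % 8 == 2; checkpoint 10 lives in
 * slot 2 of file block 1.
 */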
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the places pointed to by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
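/*
 * Usage sketch (hypothetical caller): the checkpoint returned above is
 * kmapped, so every successful call must be paired with
 * nilfs_cpfile_put_checkpoint() on the same buffer head:
 *
 *	struct nilfs_checkpoint *cp;
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
 *	if (!err) {
 *		... examine or update *cp here ...
 *		nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
 *	}
 */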
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the period from @start to @end, excluding @end itself. The checkpoints
267 * which have already been deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_sem;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_sem;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364 brelse(header_bh);
365
366 out_sem:
367 up_write(&NILFS_MDT(cpfile)->mi_sem);
368 return ret;
369}
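/*
 * Example (illustrative numbers): the range is half-open, so deleting
 * checkpoints 5 through 9 inclusive is
 *
 *	err = nilfs_cpfile_delete_checkpoints(cpfile, 5, 10);
 *
 * Checkpoints inside the range that were already deleted are skipped.
 */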
370
371static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
372 struct nilfs_checkpoint *cp,
373 struct nilfs_cpinfo *ci)
374{
375 ci->ci_flags = le32_to_cpu(cp->cp_flags);
376 ci->ci_cno = le64_to_cpu(cp->cp_cno);
377 ci->ci_create = le64_to_cpu(cp->cp_create);
378 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
379 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
380 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
381 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
382}
383
384static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
385 struct nilfs_cpinfo *ci, size_t nci)
386{
387 struct nilfs_checkpoint *cp;
388 struct buffer_head *bh;
389 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
390 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
391 void *kaddr;
392 int n, ret;
393 int ncps, i;
394
395 if (cno == 0)
396 return -ENOENT; /* checkpoint number 0 is invalid */
397 down_read(&NILFS_MDT(cpfile)->mi_sem);
398
399 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
400 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
401 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
402 if (ret < 0) {
403 if (ret != -ENOENT)
404 goto out;
405 continue; /* skip hole */
406 }
407
408 kaddr = kmap_atomic(bh->b_page, KM_USER0);
409 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
410 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
411 if (!nilfs_checkpoint_invalid(cp))
412 nilfs_cpfile_checkpoint_to_cpinfo(
413 cpfile, cp, &ci[n++]);
414 }
415 kunmap_atomic(kaddr, KM_USER0);
416 brelse(bh);
417 }
418
419 ret = n;
420 if (n > 0)
421 *cnop = ci[n - 1].ci_cno + 1;
422
423 out:
424 up_read(&NILFS_MDT(cpfile)->mi_sem);
425 return ret;
426}
427
428static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
429 struct nilfs_cpinfo *ci, size_t nci)
430{
431 struct buffer_head *bh;
432 struct nilfs_cpfile_header *header;
433 struct nilfs_checkpoint *cp;
434 __u64 curr = *cnop, next;
435 unsigned long curr_blkoff, next_blkoff;
436 void *kaddr;
437 int n = 0, ret;
438
439 down_read(&NILFS_MDT(cpfile)->mi_sem);
440
441 if (curr == 0) {
442 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
443 if (ret < 0)
444 goto out;
445 kaddr = kmap_atomic(bh->b_page, KM_USER0);
446 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
447 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
448 kunmap_atomic(kaddr, KM_USER0);
449 brelse(bh);
450 if (curr == 0) {
451 ret = 0;
452 goto out;
453 }
454 } else if (unlikely(curr == ~(__u64)0)) {
455 ret = 0;
456 goto out;
457 }
458
459 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
460 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
461 if (unlikely(ret < 0)) {
462 if (ret == -ENOENT)
463 ret = 0; /* No snapshots (started from a hole block) */
464 goto out;
465 }
466 kaddr = kmap_atomic(bh->b_page, KM_USER0);
467 while (n < nci) {
468 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
469 curr = ~(__u64)0; /* Terminator */
470 if (unlikely(nilfs_checkpoint_invalid(cp) ||
471 !nilfs_checkpoint_snapshot(cp)))
472 break;
473 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
474 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
475 if (next == 0)
476			break; /* reached the end of the snapshot list */
477
478 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
479 if (curr_blkoff != next_blkoff) {
480 kunmap_atomic(kaddr, KM_USER0);
481 brelse(bh);
482 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
483 0, &bh);
484 if (unlikely(ret < 0)) {
485 WARN_ON(ret == -ENOENT);
486 goto out;
487 }
488 kaddr = kmap_atomic(bh->b_page, KM_USER0);
489 }
490 curr = next;
491 curr_blkoff = next_blkoff;
492 }
493 kunmap_atomic(kaddr, KM_USER0);
494 brelse(bh);
495 *cnop = curr;
496 ret = n;
497
498 out:
499 up_read(&NILFS_MDT(cpfile)->mi_sem);
500 return ret;
501}
502
503/**
504 * nilfs_cpfile_get_cpinfo - get information on checkpoints
505 * @cpfile: inode of checkpoint file
506 * @cnop: place to pass a start checkpoint number and to receive the next one
507 * @mode: NILFS_CHECKPOINT or NILFS_SNAPSHOT
508 * @ci: array of nilfs_cpinfo structures to be filled
509 * @nci: maximum number of entries that fit in @ci
510 */
511ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
512 struct nilfs_cpinfo *ci, size_t nci)
513{
514 switch (mode) {
515 case NILFS_CHECKPOINT:
516 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
517 case NILFS_SNAPSHOT:
518 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
519 default:
520 return -EINVAL;
521 }
522}
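/*
 * Usage sketch (hypothetical caller, illustrative buffer size): @cnop
 * acts as a cursor, so all checkpoints can be walked in batches:
 *
 *	struct nilfs_cpinfo ci[16];
 *	__u64 cno = 1;
 *	ssize_t n;
 *
 *	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
 *					    ci, 16)) > 0) {
 *		... consume ci[0] .. ci[n - 1] ...
 *	}
 */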
523
524/**
525 * nilfs_cpfile_delete_checkpoint - delete a checkpoint
526 * @cpfile: inode of checkpoint file
527 * @cno: checkpoint number to be deleted
528 */
529int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
530{
531 struct nilfs_cpinfo ci;
532 __u64 tcno = cno;
533 ssize_t nci;
534 int ret;
535
536 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
537 if (nci < 0)
538 return nci;
539 else if (nci == 0 || ci.ci_cno != cno)
540 return -ENOENT;
541
542	/* cannot delete the latest checkpoint or snapshots */
543 ret = nilfs_cpinfo_snapshot(&ci);
544 if (ret < 0)
545 return ret;
546 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
547 return -EPERM;
548
549 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
550}
551
552static struct nilfs_snapshot_list *
553nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
554 __u64 cno,
555 struct buffer_head *bh,
556 void *kaddr)
557{
558 struct nilfs_cpfile_header *header;
559 struct nilfs_checkpoint *cp;
560 struct nilfs_snapshot_list *list;
561
562 if (cno != 0) {
563 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
564 list = &cp->cp_snapshot_list;
565 } else {
566 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
567 list = &header->ch_snapshot_list;
568 }
569 return list;
570}
571
572static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
573{
574 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
575 struct nilfs_cpfile_header *header;
576 struct nilfs_checkpoint *cp;
577 struct nilfs_snapshot_list *list;
578 __u64 curr, prev;
579 unsigned long curr_blkoff, prev_blkoff;
580 void *kaddr;
581 int ret;
582
583 if (cno == 0)
584 return -ENOENT; /* checkpoint number 0 is invalid */
585 down_write(&NILFS_MDT(cpfile)->mi_sem);
586
587 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
588 if (ret < 0)
589 goto out_sem;
590 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
591 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
592 if (nilfs_checkpoint_invalid(cp)) {
593 ret = -ENOENT;
594 kunmap_atomic(kaddr, KM_USER0);
595 goto out_cp;
596 }
597 if (nilfs_checkpoint_snapshot(cp)) {
598 ret = 0;
599 kunmap_atomic(kaddr, KM_USER0);
600 goto out_cp;
601 }
602 kunmap_atomic(kaddr, KM_USER0);
603
604 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
605 if (ret < 0)
606 goto out_cp;
607 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
608 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
609 list = &header->ch_snapshot_list;
610 curr_bh = header_bh;
611 get_bh(curr_bh);
612 curr = 0;
613 curr_blkoff = 0;
614 prev = le64_to_cpu(list->ssl_prev);
615 while (prev > cno) {
616 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
617 curr = prev;
618 if (curr_blkoff != prev_blkoff) {
619 kunmap_atomic(kaddr, KM_USER0);
620 brelse(curr_bh);
621 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
622 0, &curr_bh);
623 if (ret < 0)
624 goto out_header;
625 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
626 }
627 curr_blkoff = prev_blkoff;
628 cp = nilfs_cpfile_block_get_checkpoint(
629 cpfile, curr, curr_bh, kaddr);
630 list = &cp->cp_snapshot_list;
631 prev = le64_to_cpu(list->ssl_prev);
632 }
633 kunmap_atomic(kaddr, KM_USER0);
634
635 if (prev != 0) {
636 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
637 &prev_bh);
638 if (ret < 0)
639 goto out_curr;
640 } else {
641 prev_bh = header_bh;
642 get_bh(prev_bh);
643 }
644
645 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
646 list = nilfs_cpfile_block_get_snapshot_list(
647 cpfile, curr, curr_bh, kaddr);
648 list->ssl_prev = cpu_to_le64(cno);
649 kunmap_atomic(kaddr, KM_USER0);
650
651 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
652 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
653 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
654 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
655 nilfs_checkpoint_set_snapshot(cp);
656 kunmap_atomic(kaddr, KM_USER0);
657
658 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
659 list = nilfs_cpfile_block_get_snapshot_list(
660 cpfile, prev, prev_bh, kaddr);
661 list->ssl_next = cpu_to_le64(cno);
662 kunmap_atomic(kaddr, KM_USER0);
663
664 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
665 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
666 le64_add_cpu(&header->ch_nsnapshots, 1);
667 kunmap_atomic(kaddr, KM_USER0);
668
669 nilfs_mdt_mark_buffer_dirty(prev_bh);
670 nilfs_mdt_mark_buffer_dirty(curr_bh);
671 nilfs_mdt_mark_buffer_dirty(cp_bh);
672 nilfs_mdt_mark_buffer_dirty(header_bh);
673 nilfs_mdt_mark_dirty(cpfile);
674
675 brelse(prev_bh);
676
677 out_curr:
678 brelse(curr_bh);
679
680 out_header:
681 brelse(header_bh);
682
683 out_cp:
684 brelse(cp_bh);
685
686 out_sem:
687 up_write(&NILFS_MDT(cpfile)->mi_sem);
688 return ret;
689}
690
691static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
692{
693 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
694 struct nilfs_cpfile_header *header;
695 struct nilfs_checkpoint *cp;
696 struct nilfs_snapshot_list *list;
697 __u64 next, prev;
698 void *kaddr;
699 int ret;
700
701 if (cno == 0)
702 return -ENOENT; /* checkpoint number 0 is invalid */
703 down_write(&NILFS_MDT(cpfile)->mi_sem);
704
705 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
706 if (ret < 0)
707 goto out_sem;
708 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
709 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
710 if (nilfs_checkpoint_invalid(cp)) {
711 ret = -ENOENT;
712 kunmap_atomic(kaddr, KM_USER0);
713 goto out_cp;
714 }
715 if (!nilfs_checkpoint_snapshot(cp)) {
716 ret = 0;
717 kunmap_atomic(kaddr, KM_USER0);
718 goto out_cp;
719 }
720
721 list = &cp->cp_snapshot_list;
722 next = le64_to_cpu(list->ssl_next);
723 prev = le64_to_cpu(list->ssl_prev);
724 kunmap_atomic(kaddr, KM_USER0);
725
726 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
727 if (ret < 0)
728 goto out_cp;
729 if (next != 0) {
730 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
731 &next_bh);
732 if (ret < 0)
733 goto out_header;
734 } else {
735 next_bh = header_bh;
736 get_bh(next_bh);
737 }
738 if (prev != 0) {
739 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
740 &prev_bh);
741 if (ret < 0)
742 goto out_next;
743 } else {
744 prev_bh = header_bh;
745 get_bh(prev_bh);
746 }
747
748 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
749 list = nilfs_cpfile_block_get_snapshot_list(
750 cpfile, next, next_bh, kaddr);
751 list->ssl_prev = cpu_to_le64(prev);
752 kunmap_atomic(kaddr, KM_USER0);
753
754 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
755 list = nilfs_cpfile_block_get_snapshot_list(
756 cpfile, prev, prev_bh, kaddr);
757 list->ssl_next = cpu_to_le64(next);
758 kunmap_atomic(kaddr, KM_USER0);
759
760 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
761 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
762 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
763 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
764 nilfs_checkpoint_clear_snapshot(cp);
765 kunmap_atomic(kaddr, KM_USER0);
766
767 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
768 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
769 le64_add_cpu(&header->ch_nsnapshots, -1);
770 kunmap_atomic(kaddr, KM_USER0);
771
772 nilfs_mdt_mark_buffer_dirty(next_bh);
773 nilfs_mdt_mark_buffer_dirty(prev_bh);
774 nilfs_mdt_mark_buffer_dirty(cp_bh);
775 nilfs_mdt_mark_buffer_dirty(header_bh);
776 nilfs_mdt_mark_dirty(cpfile);
777
778 brelse(prev_bh);
779
780 out_next:
781 brelse(next_bh);
782
783 out_header:
784 brelse(header_bh);
785
786 out_cp:
787 brelse(cp_bh);
788
789 out_sem:
790 up_write(&NILFS_MDT(cpfile)->mi_sem);
791 return ret;
792}
793
794/**
795 * nilfs_cpfile_is_snapshot - determine whether a checkpoint is a snapshot
796 * @cpfile: inode of checkpoint file
797 * @cno: checkpoint number
798 *
799 * Description: nilfs_cpfile_is_snapshot() tests if @cno is a snapshot.
800 *
801 * Return Value: On success, 1 is returned if the checkpoint specified by
802 * @cno is a snapshot, or 0 if not. On error, one of the following negative
803 * error codes is returned.
804 *
805 * %-EIO - I/O error.
806 *
807 * %-ENOMEM - Insufficient amount of memory available.
808 *
809 * %-ENOENT - No such checkpoint.
810 */
811int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
812{
813 struct buffer_head *bh;
814 struct nilfs_checkpoint *cp;
815 void *kaddr;
816 int ret;
817
818 if (cno == 0)
819 return -ENOENT; /* checkpoint number 0 is invalid */
820 down_read(&NILFS_MDT(cpfile)->mi_sem);
821
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
823 if (ret < 0)
824 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh);
830
831 out:
832 up_read(&NILFS_MDT(cpfile)->mi_sem);
833 return ret;
834}
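/*
 * Example (hypothetical caller): note the tristate return, so the error
 * check must precede the boolean test:
 *
 *	ret = nilfs_cpfile_is_snapshot(cpfile, cno);
 *	if (ret < 0)
 *		return ret;
 *	if (ret)
 *		... @cno is a snapshot ...
 */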
835
836/**
837 * nilfs_cpfile_change_cpmode - change checkpoint mode
838 * @cpfile: inode of checkpoint file
839 * @cno: checkpoint number
840 * @mode: new mode of checkpoint
841 *
842 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
843 * checkpoint specified by @cno. @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
844 *
845 * Return Value: On success, 0 is returned. On error, one of the following
846 * negative error codes is returned.
847 *
848 * %-EIO - I/O error.
849 *
850 * %-ENOMEM - Insufficient amount of memory available.
851 *
852 * %-ENOENT - No such checkpoint.
853 */
854int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
855{
856 struct the_nilfs *nilfs;
857 int ret;
858
859 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
860
861 switch (mode) {
862 case NILFS_CHECKPOINT:
863 /*
864 * Check for protecting existing snapshot mounts:
865 * bd_mount_sem is used to make this operation atomic and
866 * exclusive with a new mount job. Though it doesn't cover
867 * umount, it's enough for the purpose.
868 */
869 down(&nilfs->ns_bdev->bd_mount_sem);
870 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
871 /* Current implementation does not have to protect
872 plain read-only mounts since they are exclusive
873 with a read/write mount and are protected from the
874 cleaner. */
875 ret = -EBUSY;
876 } else
877 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
878 up(&nilfs->ns_bdev->bd_mount_sem);
879 return ret;
880 case NILFS_SNAPSHOT:
881 return nilfs_cpfile_set_snapshot(cpfile, cno);
882 default:
883 return -EINVAL;
884 }
885}
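/*
 * Example (hypothetical caller): turning checkpoint @cno into a snapshot
 * and back again:
 *
 *	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_SNAPSHOT);
 *	...
 *	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_CHECKPOINT);
 *
 * The second call fails with -EBUSY while the snapshot is mounted.
 */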
886
887/**
888 * nilfs_cpfile_get_stat - get checkpoint statistics
889 * @cpfile: inode of checkpoint file
890 * @cpstat: pointer to a structure of checkpoint statistics
891 *
892 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
893 *
894 * Return Value: On success, 0 is returned, and checkpoint statistics are
895 * stored in the place pointed to by @cpstat. On error, one of the following
896 * negative error codes is returned.
897 *
898 * %-EIO - I/O error.
899 *
900 * %-ENOMEM - Insufficient amount of memory available.
901 */
902int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
903{
904 struct buffer_head *bh;
905 struct nilfs_cpfile_header *header;
906 void *kaddr;
907 int ret;
908
909 down_read(&NILFS_MDT(cpfile)->mi_sem);
910
911 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
912 if (ret < 0)
913 goto out_sem;
914 kaddr = kmap_atomic(bh->b_page, KM_USER0);
915 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
916 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
917 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
918 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
919 kunmap_atomic(kaddr, KM_USER0);
920 brelse(bh);
921
922 out_sem:
923 up_read(&NILFS_MDT(cpfile)->mi_sem);
924 return ret;
925}
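/*
 * Usage sketch (hypothetical caller):
 *
 *	struct nilfs_cpstat cpstat;
 *
 *	if (!nilfs_cpfile_get_stat(cpfile, &cpstat))
 *		printk(KERN_DEBUG "cno=%llu ncps=%llu nsss=%llu\n",
 *		       (unsigned long long)cpstat.cs_cno,
 *		       (unsigned long long)cpstat.cs_ncps,
 *		       (unsigned long long)cpstat.cs_nsss);
 */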
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
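/*
 * The start/end updates above follow a prepare/commit/abort protocol.
 * A hypothetical caller assigning a disk block to a virtual block looks
 * like this (write_ok and new_blocknr are illustrative names):
 *
 *	struct nilfs_palloc_req req = { .pr_entry_nr = vblocknr };
 *
 *	err = nilfs_dat_prepare_start(dat, &req);
 *	if (!err) {
 *		if (write_ok)
 *			nilfs_dat_commit_start(dat, &req, new_blocknr);
 *		else
 *			nilfs_dat_abort_start(dat, &req);
 *	}
 */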
248
249/**
250 * nilfs_dat_mark_dirty - mark a DAT entry dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() marks the entry of @vblocknr dirty.
255 *
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - The virtual block numbers have not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
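/*
 * Example (hypothetical caller): after the garbage collector copies the
 * block behind vblocknr to a new location, the DAT entry is repointed:
 *
 *	err = nilfs_dat_move(dat, vblocknr, new_blocknr);
 *
 * (new_blocknr is illustrative; it must be a valid, nonzero disk block.)
 */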
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed to by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
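/*
 * Example (hypothetical caller): resolving a virtual block number before
 * issuing I/O:
 *
 *	sector_t pbn;
 *
 *	err = nilfs_dat_translate(dat, vblocknr, &pbn);
 *	if (!err)
 *		... read or write physical block pbn ...
 *	else if (err == -ENOENT)
 *		... no disk block is assigned to vblocknr ...
 */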
393
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410		/* first and last virtual block numbers in this block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
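/*
 * Usage sketch (hypothetical caller, illustrative values): @vinfo is
 * pre-filled with the virtual block numbers to query, preferably in
 * ascending order so entries sharing an entry block are resolved with a
 * single block read:
 *
 *	struct nilfs_vinfo vinfo[2] = {
 *		{ .vi_vblocknr = 10 },
 *		{ .vi_vblocknr = 11 },
 *	};
 *
 *	n = nilfs_dat_get_vinfo(dat, vinfo, 2);
 *	... on success n == 2 and vi_start, vi_end and vi_blocknr are set ...
 */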
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
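/*
 * Example (illustrative sizes): with i_size == 10000 and 4 KiB pages,
 * dir_pages() == 3; nilfs_last_byte(inode, 0) and nilfs_last_byte(inode, 1)
 * both return PAGE_CACHE_SIZE, while nilfs_last_byte(inode, 2) returns
 * 10000 - 8192 == 1808.
 */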
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193		    "entry in directory #%lu spans the page boundary, "
194		    "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
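/*
 * Worked example: a regular file has (mode & S_IFMT) == S_IFREG
 * (0100000 octal, 0x8000), so (mode & S_IFMT) >> S_SHIFT == 8 and
 * nilfs_type_by_mode[8] == NILFS_FT_REG_FILE; nilfs_filetype_table
 * maps that back to DT_REG when readdir fills in d_type.
 */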
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_dir). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367	/* OFFSET_CACHE */
368	*res_page = NULL;
369
370	if (npages == 0)
371		goto out;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402				    "dir %lu size %lld exceeds block count %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set the first fragment of directory.
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
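/*
 * Resulting layout of the first chunk, assuming the usual ext2-style
 * record length formula where NILFS_DIR_REC_LEN(1) == 16:
 *
 *	offset  0: "."  rec_len = 16, inode = this directory
 *	offset 16: ".." rec_len = chunk_size - 16, inode = parent
 *
 * The ".." record absorbs the remainder of the chunk, so later
 * nilfs_add_link() calls can split it to place new entries.
 */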
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
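/*
 * Arithmetic sketch: if NILFS_BMAP_SIZE covers seven __le64 slots (the
 * bmap area embedded in the on-disk inode), one slot is consumed by the
 * nilfs_direct_node header, so NILFS_DIRECT_NBLOCKS == 6 and the direct
 * mapping addresses file blocks 0 through NILFS_DIRECT_KEY_MAX == 5;
 * files that outgrow this range are converted to a B-tree mapping (see
 * nilfs_direct_check_insert and nilfs_direct_delete_and_convert).
 */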
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34	 * This is the only entry point that can catch write and sync
35	 * timing for both data blocks and intermediate blocks.
36	 *
37	 * This function should be revised once the writeback function
38	 * is implemented.
39	 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
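/*
 * A minimal sketch of the transaction bracket used above, the standard
 * NILFS write-path pattern; do_update() is a hypothetical placeholder
 * for the caller's own modification step:
 *
 *	struct nilfs_transaction_info ti;
 *	int err = nilfs_transaction_begin(sb, &ti, 1);
 *	if (unlikely(err))
 *		return err;	(may fail with -ENOSPC, never -ENOMEM)
 *	err = do_update();
 *	if (unlikely(err))
 *		nilfs_transaction_abort(sb);
 *	else
 *		nilfs_transaction_commit(sb);	(never fails)
 *	return err;
 */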
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file implements the cache of on-disk blocks to be moved in
27 * garbage collection. The disk blocks are held with dummy inodes
28 * (called gcinodes), and this file provides the lookup function for
29 * the dummy inodes and their buffer read function.
30 *
31 * Since NILFS2 keeps multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) takes a checkpoint number argument as well
36 * as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes are released each time
39 * after they are copied to a new log. Dirty blocks made on the
40 * current generation never overlap the blocks to be moved by GC,
41 * because the dirty blocks make a new generation; rather, they must be
42 * written individually.
43 */
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
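/*
 * A minimal sketch of how the helpers above combine in the GC read
 * protocol (error handling elided; gc_inode, blkoff, pbn and vbn stand
 * for the caller's values):
 *
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_gccache_submit_read_data(gc_inode, blkoff, pbn, vbn,
 *					     &bh);
 *	if (!err) {
 *		err = nilfs_gccache_wait_and_mark_dirty(bh);
 *		(here -EEXIST means the block was already dirty)
 *		brelse(bh);
 *	}
 */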
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head contains newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and a buffer_head
40 * pointer that contains the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No free inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56	req.pr_entry_nr = 0;	/* 0 means: find a free inode from the
57				   beginning of a group. dull code!! */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
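/*
 * A minimal sketch of the intended map/unmap pairing when reading one
 * raw inode from the ifile (error handling elided):
 *
 *	struct buffer_head *ibh;
 *	struct nilfs_inode *raw_inode;
 *
 *	if (nilfs_ifile_get_inode_block(ifile, ino, &ibh) == 0) {
 *		raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
 *		... read fields of *raw_inode ...
 *		nilfs_ifile_unmap_inode(ifile, ino, ibh);
 *		brelse(ibh);
 *	}
 */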
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode - inode struct of the target file
39 * @blkoff - file block number
40 * @bh_result - buffer head to be mapped on
41 * @create - indicate whether allocating the block or not when it has not
42 * been allocated yet.
43 *
44 * This function does not issue actual read request of the specified data
45 * block. It is done by VFS.
46 * Bulk read for direct-io is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be made more detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* disk block number must be
103 changed to the proper value later */
104 } else if (ret == -ENOENT) {
105 /* "not found" is not an error here (e.g. a hole); return
106 without setting the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
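/*
 * A minimal sketch of the get_block contract above from a caller's point
 * of view; demo_probe_block() is illustrative only and not part of this
 * patch. On return, bh_result is either mapped to a disk block or left
 * unmapped for a hole.
 */
static int demo_probe_block(struct inode *inode, sector_t blkoff)
{
	struct buffer_head bh;
	int err;

	memset(&bh, 0, sizeof(bh));
	bh.b_size = 1 << inode->i_blkbits;	/* map at most one block */
	err = nilfs_get_block(inode, blkoff, &bh, 0);	/* create == 0 */
	if (err)
		return err;
	return buffer_mapped(&bh);	/* 0: hole, 1: mapped */
}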
115
116/**
117 * nilfs_readpage() - implement the readpage() method of the nilfs_aops
118 * address_space_operations
119 * @file: file struct of the file to be read
120 * @page: the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement the readpages() method of the nilfs_aops
129 * address_space_operations
130 * @file: file struct of the file to be read
131 * @mapping: address_space struct used for reading multiple pages
132 * @pages: the pages to be read
133 * @nr_pages: number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
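/*
 * The nr_dirty computation above is simply blocks-per-page. A worked
 * example with assumed sizes (not from this patch): with 4 KiB pages
 * (PAGE_SHIFT == 12) and 1 KiB blocks (i_blkbits == 10),
 * nr_dirty = 1 << (12 - 10) = 4, i.e. four buffers are accounted to the
 * dirty-file bookkeeping for each newly dirtied page.
 */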
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276 /* the reference count of i_bh is inherited from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323 goto failed_acl; /* never occurs. When nilfs_init_acl() is
324 supported, proper cancellation of the
325 jobs above should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
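/*
 * A hypothetical caller sketch for nilfs_new_inode(), which follows the
 * usual ERR_PTR() convention; demo_create() is illustrative only.
 */
static int demo_create(struct inode *dir, int mode)
{
	struct inode *inode = nilfs_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);
	/* ... add a directory entry and instantiate the dentry ... */
	iput(inode);
	return 0;
}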
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351 /* XXX: check the error code? Is there anything we can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436 NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
437 NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 err = nilfs_read_inode_common(inode, raw_inode);
440 if (err) goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
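/*
 * nilfs_iget() never returns NULL; failures come back as ERR_PTR()
 * values. A hedged usage sketch; demo_open_ino() is illustrative only.
 */
static int demo_open_ino(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = nilfs_iget(sb, ino);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* -ENOMEM or a read error */
	/* ... use the fully read-in, unlocked inode ... */
	iput(inode);
	return 0;
}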
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518 /* When extending the inode format, nilfs->ns_inode_size should be
519 checked before filling in the appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538 /* XXX: calling with has_bmap = 0 is a workaround to avoid a
539 bmap deadlock; it delays the update of i_bmap until just
540 before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
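/*
 * The loop above trims the bmap from the tail in bounded steps so that a
 * huge truncate cannot stall the system. A worked pass with assumed
 * numbers: truncating to from == 0 when the last key is 40000 proceeds
 * roughly as b = 40000 - 16384 = 23616, then about 7232, then 0, i.e. at
 * most NILFS_MAX_TRUNCATE_BLOCKS blocks per nilfs_bmap_truncate() call,
 * with memory pressure relaxed between passes.
 */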
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* This may construct a logical segment and fail in sync mode,
609 but truncate has no return value to propagate the error. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* This may construct a logical segment and fail in sync mode,
636 but delete_inode has no return value to propagate the error. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671 /* ii->i_bh is protected by s_inode_lock, which is taken here */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730 return -EINVAL; /* NILFS_I_DIRTY may remain while the
731 inode is being freed */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on the given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies its data to the corresponding inode entry in the
768 * inode block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..108d281ebca5
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,654 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/nilfs2_fs.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "bmap.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35
36
37static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
38 struct nilfs_argv *argv, int dir,
39 ssize_t (*dofunc)(struct the_nilfs *,
40 __u64 *, int,
41 void *, size_t, size_t))
42{
43 void *buf;
44 void __user *base = (void __user *)(unsigned long)argv->v_base;
45 size_t maxmembs, total, n;
46 ssize_t nr;
47 int ret, i;
48 __u64 pos, ppos;
49
50 if (argv->v_nmembs == 0)
51 return 0;
52
53 if (argv->v_size > PAGE_SIZE)
54 return -EINVAL;
55
56 buf = (void *)__get_free_pages(GFP_NOFS, 0);
57 if (unlikely(!buf))
58 return -ENOMEM;
59 maxmembs = PAGE_SIZE / argv->v_size;
60
61 ret = 0;
62 total = 0;
63 pos = argv->v_index;
64 for (i = 0; i < argv->v_nmembs; i += n) {
65 n = (argv->v_nmembs - i < maxmembs) ?
66 argv->v_nmembs - i : maxmembs;
67 if ((dir & _IOC_WRITE) &&
68 copy_from_user(buf, base + argv->v_size * i,
69 argv->v_size * n)) {
70 ret = -EFAULT;
71 break;
72 }
73 ppos = pos;
74 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
75 n);
76 if (nr < 0) {
77 ret = nr;
78 break;
79 }
80 if ((dir & _IOC_READ) &&
81 copy_to_user(base + argv->v_size * i, buf,
82 argv->v_size * nr)) {
83 ret = -EFAULT;
84 break;
85 }
86 total += nr;
87 if ((size_t)nr < n)
88 break;
89 if (pos == ppos)
90 pos += n;
91 }
92 argv->v_nmembs = total;
93
94 free_pages((unsigned long)buf, 0);
95 return ret;
96}
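/*
 * The copy loop above batches entries through one page. A worked example
 * with assumed sizes (not from this patch): with PAGE_SIZE == 4096 and
 * argv->v_size == 32, maxmembs == 128, so a request with
 * argv->v_nmembs == 300 is served in batches of 128, 128 and 44 entries.
 */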
97
98static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
99 unsigned int cmd, void __user *argp)
100{
101 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
102 struct nilfs_transaction_info ti;
103 struct nilfs_cpmode cpmode;
104 int ret;
105
106 if (!capable(CAP_SYS_ADMIN))
107 return -EPERM;
108 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
109 return -EFAULT;
110
111 nilfs_transaction_begin(inode->i_sb, &ti, 0);
112 ret = nilfs_cpfile_change_cpmode(
113 cpfile, cpmode.cm_cno, cpmode.cm_mode);
114 if (unlikely(ret < 0)) {
115 nilfs_transaction_abort(inode->i_sb);
116 return ret;
117 }
118 nilfs_transaction_commit(inode->i_sb); /* never fails */
119 return ret;
120}
121
122static int
123nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
124 unsigned int cmd, void __user *argp)
125{
126 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
127 struct nilfs_transaction_info ti;
128 __u64 cno;
129 int ret;
130
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133 if (copy_from_user(&cno, argp, sizeof(cno)))
134 return -EFAULT;
135
136 nilfs_transaction_begin(inode->i_sb, &ti, 0);
137 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
138 if (unlikely(ret < 0)) {
139 nilfs_transaction_abort(inode->i_sb);
140 return ret;
141 }
142 nilfs_transaction_commit(inode->i_sb); /* never fails */
143 return ret;
144}
145
146static ssize_t
147nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
148 void *buf, size_t size, size_t nmembs)
149{
150 return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
151 nmembs);
152}
153
154static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp,
155 unsigned int cmd, void __user *argp)
156{
157 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
158 struct nilfs_argv argv;
159 int ret;
160
161 if (copy_from_user(&argv, argp, sizeof(argv)))
162 return -EFAULT;
163
164 down_read(&nilfs->ns_segctor_sem);
165 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
166 nilfs_ioctl_do_get_cpinfo);
167 up_read(&nilfs->ns_segctor_sem);
168 if (ret < 0)
169 return ret;
170
171 if (copy_to_user(argp, &argv, sizeof(argv)))
172 ret = -EFAULT;
173 return ret;
174}
175
176static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
177 unsigned int cmd, void __user *argp)
178{
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 struct nilfs_cpstat cpstat;
181 int ret;
182
183 down_read(&nilfs->ns_segctor_sem);
184 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
185 up_read(&nilfs->ns_segctor_sem);
186 if (ret < 0)
187 return ret;
188
189 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
190 ret = -EFAULT;
191 return ret;
192}
193
194static ssize_t
195nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
196 void *buf, size_t size, size_t nmembs)
197{
198 return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
199}
200
201static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp,
202 unsigned int cmd, void __user *argp)
203{
204 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
205 struct nilfs_argv argv;
206 int ret;
207
208 if (copy_from_user(&argv, argp, sizeof(argv)))
209 return -EFAULT;
210
211 down_read(&nilfs->ns_segctor_sem);
212 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
213 nilfs_ioctl_do_get_suinfo);
214 up_read(&nilfs->ns_segctor_sem);
215 if (ret < 0)
216 return ret;
217
218 if (copy_to_user(argp, &argv, sizeof(argv)))
219 ret = -EFAULT;
220 return ret;
221}
222
223static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
224 unsigned int cmd, void __user *argp)
225{
226 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
227 struct nilfs_sustat sustat;
228 int ret;
229
230 down_read(&nilfs->ns_segctor_sem);
231 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
232 up_read(&nilfs->ns_segctor_sem);
233 if (ret < 0)
234 return ret;
235
236 if (copy_to_user(argp, &sustat, sizeof(sustat)))
237 ret = -EFAULT;
238 return ret;
239}
240
241static ssize_t
242nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs)
244{
245 return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
246}
247
248static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp,
249 unsigned int cmd, void __user *argp)
250{
251 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
252 struct nilfs_argv argv;
253 int ret;
254
255 if (copy_from_user(&argv, argp, sizeof(argv)))
256 return -EFAULT;
257
258 down_read(&nilfs->ns_segctor_sem);
259 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
260 nilfs_ioctl_do_get_vinfo);
261 up_read(&nilfs->ns_segctor_sem);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static ssize_t
271nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
272 void *buf, size_t size, size_t nmembs)
273{
274 struct inode *dat = nilfs_dat_inode(nilfs);
275 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
276 struct nilfs_bdesc *bdescs = buf;
277 int ret, i;
278
279 for (i = 0; i < nmembs; i++) {
280 ret = nilfs_bmap_lookup_at_level(bmap,
281 bdescs[i].bd_offset,
282 bdescs[i].bd_level + 1,
283 &bdescs[i].bd_blocknr);
284 if (ret < 0) {
285 if (ret != -ENOENT)
286 return ret;
287 bdescs[i].bd_blocknr = 0;
288 }
289 }
290 return nmembs;
291}
292
293static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
294 unsigned int cmd, void __user *argp)
295{
296 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
297 struct nilfs_argv argv;
298 int ret;
299
300 if (copy_from_user(&argv, argp, sizeof(argv)))
301 return -EFAULT;
302
303 down_read(&nilfs->ns_segctor_sem);
304 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
305 nilfs_ioctl_do_get_bdescs);
306 up_read(&nilfs->ns_segctor_sem);
307 if (ret < 0)
308 return ret;
309
310 if (copy_to_user(argp, &argv, sizeof(argv)))
311 ret = -EFAULT;
312 return ret;
313}
314
315static int nilfs_ioctl_move_inode_block(struct inode *inode,
316 struct nilfs_vdesc *vdesc,
317 struct list_head *buffers)
318{
319 struct buffer_head *bh;
320 int ret;
321
322 if (vdesc->vd_flags == 0)
323 ret = nilfs_gccache_submit_read_data(
324 inode, vdesc->vd_offset, vdesc->vd_blocknr,
325 vdesc->vd_vblocknr, &bh);
326 else
327 ret = nilfs_gccache_submit_read_node(
328 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
329
330 if (unlikely(ret < 0)) {
331 if (ret == -ENOENT)
332 printk(KERN_CRIT
333 "%s: invalid virtual block address (%s): "
334 "ino=%llu, cno=%llu, offset=%llu, "
335 "blocknr=%llu, vblocknr=%llu\n",
336 __func__, vdesc->vd_flags ? "node" : "data",
337 (unsigned long long)vdesc->vd_ino,
338 (unsigned long long)vdesc->vd_cno,
339 (unsigned long long)vdesc->vd_offset,
340 (unsigned long long)vdesc->vd_blocknr,
341 (unsigned long long)vdesc->vd_vblocknr);
342 return ret;
343 }
344 bh->b_private = vdesc;
345 list_add_tail(&bh->b_assoc_buffers, buffers);
346 return 0;
347}
348
349static ssize_t
350nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags,
351 void *buf, size_t size, size_t nmembs)
352{
353 struct inode *inode;
354 struct nilfs_vdesc *vdesc;
355 struct buffer_head *bh, *n;
356 LIST_HEAD(buffers);
357 ino_t ino;
358 __u64 cno;
359 int i, ret;
360
361 for (i = 0, vdesc = buf; i < nmembs; ) {
362 ino = vdesc->vd_ino;
363 cno = vdesc->vd_cno;
364 inode = nilfs_gc_iget(nilfs, ino, cno);
365 if (unlikely(inode == NULL)) {
366 ret = -ENOMEM;
367 goto failed;
368 }
369 do {
370 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
371 &buffers);
372 if (unlikely(ret < 0))
373 goto failed;
374 vdesc++;
375 } while (++i < nmembs &&
376 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
377 }
378
379 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
380 ret = nilfs_gccache_wait_and_mark_dirty(bh);
381 if (unlikely(ret < 0)) {
382 if (ret == -EEXIST) {
383 vdesc = bh->b_private;
384 printk(KERN_CRIT
385 "%s: conflicting %s buffer: "
386 "ino=%llu, cno=%llu, offset=%llu, "
387 "blocknr=%llu, vblocknr=%llu\n",
388 __func__,
389 vdesc->vd_flags ? "node" : "data",
390 (unsigned long long)vdesc->vd_ino,
391 (unsigned long long)vdesc->vd_cno,
392 (unsigned long long)vdesc->vd_offset,
393 (unsigned long long)vdesc->vd_blocknr,
394 (unsigned long long)vdesc->vd_vblocknr);
395 }
396 goto failed;
397 }
398 list_del_init(&bh->b_assoc_buffers);
399 bh->b_private = NULL;
400 brelse(bh);
401 }
402 return nmembs;
403
404 failed:
405 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
406 list_del_init(&bh->b_assoc_buffers);
407 bh->b_private = NULL;
408 brelse(bh);
409 }
410 return ret;
411}
412
413static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
414 struct nilfs_argv *argv,
415 int dir)
416{
417 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
418 nilfs_ioctl_do_move_blocks);
419}
420
421static ssize_t
422nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp,
423 int flags, void *buf, size_t size,
424 size_t nmembs)
425{
426 struct inode *cpfile = nilfs->ns_cpfile;
427 struct nilfs_period *periods = buf;
428 int ret, i;
429
430 for (i = 0; i < nmembs; i++) {
431 ret = nilfs_cpfile_delete_checkpoints(
432 cpfile, periods[i].p_start, periods[i].p_end);
433 if (ret < 0)
434 return ret;
435 }
436 return nmembs;
437}
438
439static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
440 struct nilfs_argv *argv,
441 int dir)
442{
443 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
444 nilfs_ioctl_do_delete_checkpoints);
445}
446
447static ssize_t
448nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags,
449 void *buf, size_t size, size_t nmembs)
450{
451 int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv,
458 int dir)
459{
460 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
461 nilfs_ioctl_do_free_vblocknrs);
462}
463
464static ssize_t
465nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp,
466 int flags, void *buf, size_t size,
467 size_t nmembs)
468{
469 struct inode *dat = nilfs_dat_inode(nilfs);
470 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
471 struct nilfs_bdesc *bdescs = buf;
472 int ret, i;
473
474 for (i = 0; i < nmembs; i++) {
475 /* XXX: use a macro or an inline function to check liveness */
476 ret = nilfs_bmap_lookup_at_level(bmap,
477 bdescs[i].bd_offset,
478 bdescs[i].bd_level + 1,
479 &bdescs[i].bd_blocknr);
480 if (ret < 0) {
481 if (ret != -ENOENT)
482 return ret;
483 bdescs[i].bd_blocknr = 0;
484 }
485 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
486 /* skip dead block */
487 continue;
488 if (bdescs[i].bd_level == 0) {
489 ret = nilfs_mdt_mark_block_dirty(dat,
490 bdescs[i].bd_offset);
491 if (ret < 0) {
492 WARN_ON(ret == -ENOENT);
493 return ret;
494 }
495 } else {
496 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
497 bdescs[i].bd_level);
498 if (ret < 0) {
499 WARN_ON(ret == -ENOENT);
500 return ret;
501 }
502 }
503 }
504 return nmembs;
505}
506
507static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
508 struct nilfs_argv *argv,
509 int dir)
510{
511 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
512 nilfs_ioctl_do_mark_blocks_dirty);
513}
514
515static ssize_t
516nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags,
517 void *buf, size_t size, size_t nmembs)
518{
519 struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
520 int ret;
521
522 if (unlikely(!sbi))
523 return -EROFS;
524 ret = nilfs_segctor_add_segments_to_be_freed(
525 NILFS_SC(sbi), buf, nmembs);
526 nilfs_put_writer(nilfs);
527
528 return (ret < 0) ? ret : nmembs;
529}
530
531static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
532 struct nilfs_argv *argv,
533 int dir)
534{
535 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
536 nilfs_ioctl_do_free_segments);
537}
538
539int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
540 void __user *argp)
541{
542 struct nilfs_argv argv[5];
543 const char *msg;
544 int dir, ret;
545
546 if (copy_from_user(argv, argp, sizeof(argv)))
547 return -EFAULT;
548
549 dir = _IOC_WRITE;
550 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir);
551 if (ret < 0) {
552 msg = "cannot read source blocks";
553 goto failed;
554 }
555 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir);
556 if (ret < 0) {
557 /*
558 * can safely abort because checkpoints can be removed
559 * independently.
560 */
561 msg = "cannot delete checkpoints";
562 goto failed;
563 }
564 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir);
565 if (ret < 0) {
566 /*
567 * can safely abort because DAT file is updated atomically
568 * using a copy-on-write technique.
569 */
570 msg = "cannot delete virtual blocks from DAT file";
571 goto failed;
572 }
573 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir);
574 if (ret < 0) {
575 /*
576 * can safely abort because the operation is nondestructive.
577 */
578 msg = "cannot mark copying blocks dirty";
579 goto failed;
580 }
581 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir);
582 if (ret < 0) {
583 /*
584 * can safely abort because this operation is atomic.
585 */
586 msg = "cannot set segments to be freed";
587 goto failed;
588 }
589 return 0;
590
591 failed:
592 nilfs_remove_all_gcinode(nilfs);
593 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
594 msg, ret);
595 return ret;
596}
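/*
 * Each of the five argv slots above describes one user-space vector. A
 * hypothetical fill of the first slot (the move-blocks vector); the
 * demo_fill_argv() name and its parameters are assumptions, and the
 * other slots follow the same pattern with their own entry types.
 */
static void demo_fill_argv(struct nilfs_argv *mv,
			   struct nilfs_vdesc *vdescs, unsigned int n)
{
	mv->v_base   = (unsigned long)vdescs;	/* buffer with n entries */
	mv->v_nmembs = n;			/* number of entries */
	mv->v_size   = sizeof(struct nilfs_vdesc);
	mv->v_index  = 0;
	mv->v_flags  = 0;
}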
597
598static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
599 unsigned int cmd, void __user *argp)
600{
601 if (!capable(CAP_SYS_ADMIN))
602 return -EPERM;
603 return nilfs_clean_segments(inode->i_sb, argp);
604}
605
606static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
607 unsigned int cmd, void __user *argp)
608{
609 __u64 cno;
610 int ret;
611
612 ret = nilfs_construct_segment(inode->i_sb);
613 if (ret < 0)
614 return ret;
615
616 if (argp != NULL) {
617 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
618 if (copy_to_user(argp, &cno, sizeof(cno)))
619 return -EFAULT;
620 }
621 return 0;
622}
623
624long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
625{
626 struct inode *inode = filp->f_dentry->d_inode;
627 void __user *argp = (void __user *)arg;
628
629 switch (cmd) {
630 case NILFS_IOCTL_CHANGE_CPMODE:
631 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
632 case NILFS_IOCTL_DELETE_CHECKPOINT:
633 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
634 case NILFS_IOCTL_GET_CPINFO:
635 return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp);
636 case NILFS_IOCTL_GET_CPSTAT:
637 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
638 case NILFS_IOCTL_GET_SUINFO:
639 return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_SUSTAT:
641 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
642 case NILFS_IOCTL_GET_VINFO:
643 /* XXX: rename to ??? */
644 return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp);
645 case NILFS_IOCTL_GET_BDESCS:
646 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
647 case NILFS_IOCTL_CLEAN_SEGMENTS:
648 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
649 case NILFS_IOCTL_SYNC:
650 return nilfs_ioctl_sync(inode, filp, cmd, argp);
651 default:
652 return -ENOTTY;
653 }
654}
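/*
 * From user space these commands go through the ordinary ioctl(2) path.
 * A hedged user-space sketch of NILFS_IOCTL_SYNC; fd is assumed to be an
 * open file descriptor on a mounted nilfs2 volume (this is not kernel
 * code):
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nilfs2_fs.h>
 *
 *	__u64 cno;
 *	if (ioctl(fd, NILFS_IOCTL_SYNC, &cno) == 0)
 *		printf("checkpoint %llu written\n",
 *		       (unsigned long long)cno);
 */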
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..47dd815433fd
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,563 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49 /* The caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct nilfs_sb_info *writer = NULL;
81 struct super_block *sb = inode->i_sb;
82 struct nilfs_transaction_info ti;
83 struct buffer_head *bh;
84 int err;
85
86 if (!sb) {
87 writer = nilfs_get_writer(nilfs);
88 if (!writer) {
89 err = -EROFS;
90 goto out;
91 }
92 sb = writer->s_super;
93 }
94
95 nilfs_transaction_begin(sb, &ti, 0);
96
97 err = -ENOMEM;
98 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
99 if (unlikely(!bh))
100 goto failed_unlock;
101
102 err = -EEXIST;
103 if (buffer_uptodate(bh) || buffer_mapped(bh))
104 goto failed_bh;
105#if 0
106 /* The uptodate flag is not protected by the page lock, but
107 the mapped flag is. Thus, we don't have to wait for the buffer. */
108 wait_on_buffer(bh);
109 if (buffer_uptodate(bh))
110 goto failed_bh;
111#endif
112
113 bh->b_bdev = nilfs->ns_bdev;
114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
115 if (likely(!err)) {
116 get_bh(bh);
117 *out_bh = bh;
118 }
119
120 failed_bh:
121 unlock_page(bh->b_page);
122 page_cache_release(bh->b_page);
123 brelse(bh);
124
125 failed_unlock:
126 if (likely(!err))
127 err = nilfs_transaction_commit(sb);
128 else
129 nilfs_transaction_abort(sb);
130 if (writer)
131 nilfs_put_writer(nilfs);
132 out:
133 return err;
134}
135
136static int
137nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
138 int mode, struct buffer_head **out_bh)
139{
140 struct buffer_head *bh;
141 unsigned long blknum = 0;
142 int ret = -ENOMEM;
143
144 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
145 if (unlikely(!bh))
146 goto failed;
147
148 ret = -EEXIST; /* internal code */
149 if (buffer_uptodate(bh))
150 goto out;
151
152 if (mode == READA) {
153 if (!trylock_buffer(bh)) {
154 ret = -EBUSY;
155 goto failed_bh;
156 }
157 } else /* mode == READ */
158 lock_buffer(bh);
159
160 if (buffer_uptodate(bh)) {
161 unlock_buffer(bh);
162 goto out;
163 }
164 if (!buffer_mapped(bh)) { /* unused buffer */
165 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
166 &blknum);
167 if (unlikely(ret)) {
168 unlock_buffer(bh);
169 goto failed_bh;
170 }
171 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
172 bh->b_blocknr = blknum;
173 set_buffer_mapped(bh);
174 }
175
176 bh->b_end_io = end_buffer_read_sync;
177 get_bh(bh);
178 submit_bh(mode, bh);
179 ret = 0;
180 out:
181 get_bh(bh);
182 *out_bh = bh;
183
184 failed_bh:
185 unlock_page(bh->b_page);
186 page_cache_release(bh->b_page);
187 brelse(bh);
188 failed:
189 return ret;
190}
191
192static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
193 struct buffer_head **out_bh)
194{
195 struct buffer_head *first_bh, *bh;
196 unsigned long blkoff;
197 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
198 int err;
199
200 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
201 if (err == -EEXIST) /* internal code */
202 goto out;
203
204 if (unlikely(err))
205 goto failed;
206
207 blkoff = block + 1;
208 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
209 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
210 if (likely(!err || err == -EEXIST))
211 brelse(bh);
212 else if (err != -EBUSY)
213 break; /* abort readahead if bmap lookup failed */
214
215 if (!buffer_locked(first_bh))
216 goto out_no_wait;
217 }
218
219 wait_on_buffer(first_bh);
220
221 out_no_wait:
222 err = -EIO;
223 if (!buffer_uptodate(first_bh))
224 goto failed_bh;
225 out:
226 *out_bh = first_bh;
227 return 0;
228
229 failed_bh:
230 brelse(first_bh);
231 failed:
232 return err;
233}
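/*
 * The readahead policy above in concrete terms (illustrative): a call
 * nilfs_mdt_read_block(inode, 100, &bh) issues a synchronous READ for
 * block 100 plus opportunistic READA for blocks 101..115
 * (NILFS_MDT_MAX_RA_BLOCKS == 15); busy buffers (-EBUSY) are skipped,
 * and readahead stops early if a bmap lookup fails.
 */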
234
235/**
236 * nilfs_mdt_get_block - read or create a buffer on meta data file.
237 * @inode: inode of the meta data file
238 * @blkoff: block offset
239 * @create: create flag
240 * @init_block: initializer used for newly allocated block
241 * @out_bh: output of a pointer to the buffer_head
242 *
243 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
244 * a new buffer if @create is not zero. On success, the returned buffer is
245 * assured to be either existing or newly formatted, under the buffer lock.
246 * @out_bh is set only when zero is returned.
247 *
248 * Return Value: On success, it returns 0. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-ENOMEM - Insufficient memory available.
252 *
253 * %-EIO - I/O error
254 *
255 * %-ENOENT - the specified block does not exist (hole block)
256 *
257 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
258 *
259 * %-EROFS - Read only filesystem (for create mode)
260 */
261int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
262 void (*init_block)(struct inode *,
263 struct buffer_head *, void *),
264 struct buffer_head **out_bh)
265{
266 int ret;
267
268 /* Should be rewritten by merging it with nilfs_mdt_read_block() */
269 retry:
270 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
271 if (!create || ret != -ENOENT)
272 return ret;
273
274 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
275 if (unlikely(ret == -EEXIST)) {
276 /* create = 0; */ /* limit read-create loop retries */
277 goto retry;
278 }
279 return ret;
280}
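/*
 * A hypothetical caller showing both the create flag and the init_block
 * hook; the demo_* names are illustrative only, not part of this patch.
 */
static void demo_init_block(struct inode *inode, struct buffer_head *bh,
			    void *kaddr)
{
	/* the block has already been zeroed; stamp an assumed 8-byte tag */
	memset(kaddr + bh_offset(bh), 0xff, 8);
}

static int demo_touch_block(struct inode *mdt, unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_get_block(mdt, blkoff, 1, demo_init_block, &bh);
	if (err)
		return err;	/* -ENOENT cannot occur when create != 0 */
	brelse(bh);		/* drop the reference handed to us */
	return 0;
}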
281
282/**
283 * nilfs_mdt_delete_block - make a hole on the meta data file.
284 * @inode: inode of the meta data file
285 * @block: block offset
286 *
287 * Return Value: On success, zero is returned.
288 * On error, one of the following negative error codes is returned.
289 *
290 * %-ENOMEM - Insufficient memory available.
291 *
292 * %-EIO - I/O error
293 *
294 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
295 */
296int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
297{
298 struct nilfs_inode_info *ii = NILFS_I(inode);
299 int err;
300
301 err = nilfs_bmap_delete(ii->i_bmap, block);
302 if (likely(!err)) {
303 nilfs_mdt_mark_dirty(inode);
304 nilfs_mdt_forget_block(inode, block);
305 }
306 return err;
307}
308
309/**
310 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
311 * @inode: inode of the meta data file
312 * @block: block offset
313 *
314 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer and
315 * tries to release the page containing the buffer from the page cache.
316 *
317 * Return Value: On success, 0 is returned. On error, one of the following
318 * negative error codes is returned.
319 *
320 * %-EBUSY - page has an active buffer.
321 *
322 * %-ENOENT - page cache has no page addressed by the offset.
323 */
324int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
325{
326 pgoff_t index = (pgoff_t)block >>
327 (PAGE_CACHE_SHIFT - inode->i_blkbits);
328 struct page *page;
329 unsigned long first_block;
330 int ret = 0;
331 int still_dirty;
332
333 page = find_lock_page(inode->i_mapping, index);
334 if (!page)
335 return -ENOENT;
336
337 wait_on_page_writeback(page);
338
339 first_block = (unsigned long)index <<
340 (PAGE_CACHE_SHIFT - inode->i_blkbits);
341 if (page_has_buffers(page)) {
342 struct buffer_head *bh;
343
344 bh = nilfs_page_get_nth_block(page, block - first_block);
345 nilfs_forget_buffer(bh);
346 }
347 still_dirty = PageDirty(page);
348 unlock_page(page);
349 page_cache_release(page);
350
351 if (still_dirty ||
352 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
353 ret = -EBUSY;
354 return ret;
355}
356
357/**
358 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
359 * @inode: inode of the meta data file
360 * @block: block offset
361 *
362 * Return Value: On success, it returns 0. On error, one of the following
363 * negative error codes is returned.
364 *
365 * %-ENOMEM - Insufficient memory available.
366 *
367 * %-EIO - I/O error
368 *
369 * %-ENOENT - the specified block does not exist (hole block)
370 *
371 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
372 */
373int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
374{
375 struct buffer_head *bh;
376 int err;
377
378 err = nilfs_mdt_read_block(inode, block, &bh);
379 if (unlikely(err))
380 return err;
381 nilfs_mark_buffer_dirty(bh);
382 nilfs_mdt_mark_dirty(inode);
383 brelse(bh);
384 return 0;
385}
386
387int nilfs_mdt_fetch_dirty(struct inode *inode)
388{
389 struct nilfs_inode_info *ii = NILFS_I(inode);
390
391 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
392 set_bit(NILFS_I_DIRTY, &ii->i_state);
393 return 1;
394 }
395 return test_bit(NILFS_I_DIRTY, &ii->i_state);
396}
397
398static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{
401 struct inode *inode = container_of(page->mapping,
402 struct inode, i_data);
403 struct super_block *sb = inode->i_sb;
404 struct nilfs_sb_info *writer = NULL;
405 int err = 0;
406
407 redirty_page_for_writepage(wbc, page);
408 unlock_page(page);
409
410 if (page->mapping->assoc_mapping)
411 return 0; /* Do not request flush for shadow page cache */
412 if (!sb) {
413 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
414 if (!writer)
415 return -EROFS;
416 sb = writer->s_super;
417 }
418
419 if (wbc->sync_mode == WB_SYNC_ALL)
420 err = nilfs_construct_segment(sb);
421 else if (wbc->for_reclaim)
422 nilfs_flush_segment(sb, inode->i_ino);
423
424 if (writer)
425 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
426 return err;
427}
428
429
430static struct address_space_operations def_mdt_aops = {
431 .writepage = nilfs_mdt_write_page,
432};
433
434static struct inode_operations def_mdt_iops;
435static struct file_operations def_mdt_fops;
436
437/*
438 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
439 * ifile, or gcinodes. This allows the B-tree code and segment constructor
440 * to treat them like regular files, and this helps to simplify the
441 * implementation.
442 * On the other hand, some of these pseudo inodes are irregular:
443 * they don't have a valid inode->i_sb pointer because their lifetimes are
444 * longer than those of the super block structs; they may persist across
445 * several consecutive mounts/umounts. This needs further discussion.
446 */
447struct inode *
448nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
449 ino_t ino, gfp_t gfp_mask)
450{
451 struct inode *inode = nilfs_alloc_inode(sb);
452
453 if (!inode)
454 return NULL;
455 else {
456 struct address_space * const mapping = &inode->i_data;
457 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
458
459 if (!mi) {
460 nilfs_destroy_inode(inode);
461 return NULL;
462 }
463 mi->mi_nilfs = nilfs;
464 init_rwsem(&mi->mi_sem);
465
466 inode->i_sb = sb; /* sb may be NULL for some meta data files */
467 inode->i_blkbits = nilfs->ns_blocksize_bits;
468 inode->i_flags = 0;
469 atomic_set(&inode->i_count, 1);
470 inode->i_nlink = 1;
471 inode->i_ino = ino;
472 inode->i_mode = S_IFREG;
473 inode->i_private = mi;
474
475#ifdef INIT_UNUSED_INODE_FIELDS
476 atomic_set(&inode->i_writecount, 0);
477 inode->i_size = 0;
478 inode->i_blocks = 0;
479 inode->i_bytes = 0;
480 inode->i_generation = 0;
481#ifdef CONFIG_QUOTA
482 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
483#endif
484 inode->i_pipe = NULL;
485 inode->i_bdev = NULL;
486 inode->i_cdev = NULL;
487 inode->i_rdev = 0;
488#ifdef CONFIG_SECURITY
489 inode->i_security = NULL;
490#endif
491 inode->dirtied_when = 0;
492
493 INIT_LIST_HEAD(&inode->i_list);
494 INIT_LIST_HEAD(&inode->i_sb_list);
495 inode->i_state = 0;
496#endif
497
498 spin_lock_init(&inode->i_lock);
499 mutex_init(&inode->i_mutex);
500 init_rwsem(&inode->i_alloc_sem);
501
502 mapping->host = NULL; /* instead of inode */
503 mapping->flags = 0;
504 mapping_set_gfp_mask(mapping, gfp_mask);
505 mapping->assoc_mapping = NULL;
506 mapping->backing_dev_info = nilfs->ns_bdi;
507
508 inode->i_mapping = mapping;
509 }
510
511 return inode;
512}
513
514struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
515 ino_t ino, gfp_t gfp_mask)
516{
517 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
518
519 if (!inode)
520 return NULL;
521
522 inode->i_op = &def_mdt_iops;
523 inode->i_fop = &def_mdt_fops;
524 inode->i_mapping->a_ops = &def_mdt_aops;
525 return inode;
526}
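/*
 * A hypothetical construction of a meta data inode; the inode number and
 * the entry/header sizes here are assumptions for illustration.
 */
static struct inode *demo_new_mdt(struct the_nilfs *nilfs,
				  struct super_block *sb)
{
	struct inode *inode = nilfs_mdt_new(nilfs, sb, NILFS_IFILE_INO,
					    NILFS_MDT_GFP);

	if (inode)
		nilfs_mdt_set_entry_size(inode, 128, 64);
	return inode;
}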
527
528void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
529 unsigned header_size)
530{
531 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
532
533 mi->mi_entry_size = entry_size;
534 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
535 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
536}
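/*
 * A worked example with assumed sizes: for a 4096-byte block with
 * entry_size == 128 and header_size == 64,
 *	mi_entries_per_block  = 4096 / 128 = 32,
 *	mi_first_entry_offset = DIV_ROUND_UP(64, 128) = 1,
 * so entries start at the second 128-byte slot of each block.
 */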
537
538void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
539{
540 shadow->i_mapping->assoc_mapping = orig->i_mapping;
541 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
542 &NILFS_I(orig)->i_btnode_cache;
543}
544
545void nilfs_mdt_clear(struct inode *inode)
546{
547 struct nilfs_inode_info *ii = NILFS_I(inode);
548
549 invalidate_mapping_pages(inode->i_mapping, 0, -1);
550 truncate_inode_pages(inode->i_mapping, 0);
551
552 nilfs_bmap_clear(ii->i_bmap);
553 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
554}
555
556void nilfs_mdt_destroy(struct inode *inode)
557{
558 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
559
560 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
561 kfree(mdi);
562 nilfs_destroy_inode(inode);
563}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
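/*
 * Link @inode at @dentry and instantiate the dentry; on failure, the
 * link count taken by the caller is dropped and the inode is released.
 */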
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..7558c977db02
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,318 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * NILFS filesystem version
39 */
40#define NILFS_VERSION "2.0.5"
41
42/*
43 * nilfs inode data in memory
44 */
45struct nilfs_inode_info {
46 __u32 i_flags;
47 unsigned long i_state; /* Dynamic state flags */
48 struct nilfs_bmap *i_bmap;
49 union nilfs_bmap_union i_bmap_union;
50 __u64 i_xattr; /* sector_t ??? */
51 __u32 i_dir_start_lookup;
52 __u64 i_cno; /* check point number for GC inode */
53 struct address_space i_btnode_cache;
54 struct list_head i_dirty; /* List for connecting dirty files */
55
56#ifdef CONFIG_NILFS_XATTR
57 /*
58 * Extended attributes can be read independently of the main file
59 * data. Taking i_sem even when reading would cause contention
60 * between readers of EAs and writers of regular file data, so
61 * instead we synchronize on xattr_sem when reading or changing
62 * EAs.
63 */
64 struct rw_semaphore xattr_sem;
65#endif
66#ifdef CONFIG_NILFS_POSIX_ACL
67 struct posix_acl *i_acl;
68 struct posix_acl *i_default_acl;
69#endif
70 struct buffer_head *i_bh; /* i_bh contains a new or dirty
71 disk inode */
72 struct inode vfs_inode;
73};
74
75static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
76{
77 return container_of(inode, struct nilfs_inode_info, vfs_inode);
78}
79
80static inline struct nilfs_inode_info *
81NILFS_BMAP_I(const struct nilfs_bmap *bmap)
82{
83 return container_of((union nilfs_bmap_union *)bmap,
84 struct nilfs_inode_info,
85 i_bmap_union);
86}
87
88static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
89{
90 struct nilfs_inode_info *ii =
91 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
92 return &ii->vfs_inode;
93}
94
95static inline struct inode *NILFS_AS_I(struct address_space *mapping)
96{
97 return (mapping->host) ? :
98 container_of(mapping, struct inode, i_data);
99}
100
101/*
102 * Dynamic state flags of NILFS on-memory inode (i_state)
103 */
104enum {
105 NILFS_I_NEW = 0, /* Inode is newly created */
106 NILFS_I_DIRTY, /* The file is dirty */
107 NILFS_I_QUEUED, /* inode is in dirty_files list */
108 NILFS_I_BUSY, /* inode is grabbed by a segment
109 constructor */
110 NILFS_I_COLLECTED, /* All dirty blocks are collected */
111 NILFS_I_UPDATED, /* The file has been written back */
112 NILFS_I_INODE_DIRTY, /* write_inode is requested */
113 NILFS_I_BMAP, /* has bmap and btnode_cache */
114 NILFS_I_GCINODE, /* inode for GC, on memory only */
115 NILFS_I_GCDAT, /* shadow DAT, on memory only */
116};
117
118/*
119 * Macros to check inode numbers
120 */
121#define NILFS_MDT_INO_BITS \
122 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
123 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
124 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
125
126#define NILFS_SYS_INO_BITS \
127 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
128
129#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
130
131#define NILFS_MDT_INODE(sb, ino) \
132 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
133#define NILFS_VALID_INODE(sb, ino) \
134 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
135
136/**
137 * struct nilfs_transaction_info: context information for synchronization
138 * @ti_magic: Magic number
139 * @ti_save: Backup of journal_info field of task_struct
140 * @ti_flags: Flags
141 * @ti_count: Nest level
142 * @ti_garbage: List of inodes to be put when releasing the semaphore
143 */
144struct nilfs_transaction_info {
145 u32 ti_magic;
146 void *ti_save;
147 /* This should never be used. If this happens,
148 one of the other filesystems has a bug. */
149 unsigned short ti_flags;
150 unsigned short ti_count;
151 struct list_head ti_garbage;
152};
153
154/* ti_magic */
155#define NILFS_TI_MAGIC 0xd9e392fb
156
157/* ti_flags */
158#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
159#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
160 end of transaction. */
161#define NILFS_TI_GC 0x0004 /* GC context */
162#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
163#define NILFS_TI_WRITER 0x0010 /* Constructor context */
164
165
166int nilfs_transaction_begin(struct super_block *,
167 struct nilfs_transaction_info *, int);
168int nilfs_transaction_commit(struct super_block *);
169void nilfs_transaction_abort(struct super_block *);
170
171static inline void nilfs_set_transaction_flag(unsigned int flag)
172{
173 struct nilfs_transaction_info *ti = current->journal_info;
174
175 ti->ti_flags |= flag;
176}
177
178static inline int nilfs_test_transaction_flag(unsigned int flag)
179{
180 struct nilfs_transaction_info *ti = current->journal_info;
181
182 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
183 return 0;
184 return !!(ti->ti_flags & flag);
185}
186
187static inline int nilfs_doing_gc(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_GC);
190}
191
192static inline int nilfs_doing_construction(void)
193{
194 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
195}
196
197static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
198{
199 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
200}
201
202/*
203 * function prototypes
204 */
205#ifdef CONFIG_NILFS_POSIX_ACL
206#error "NILFS: not yet supported POSIX ACL"
207extern int nilfs_permission(struct inode *, int, struct nameidata *);
208extern int nilfs_acl_chmod(struct inode *);
209extern int nilfs_init_acl(struct inode *, struct inode *);
210#else
211#define nilfs_permission NULL
212
213static inline int nilfs_acl_chmod(struct inode *inode)
214{
215 return 0;
216}
217
218static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
219{
220 inode->i_mode &= ~current_umask();
221 return 0;
222}
223#endif
224
225#define NILFS_ATIME_DISABLE
226
227/* dir.c */
228extern int nilfs_add_link(struct dentry *, struct inode *);
229extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
230extern int nilfs_make_empty(struct inode *, struct inode *);
231extern struct nilfs_dir_entry *
232nilfs_find_entry(struct inode *, struct dentry *, struct page **);
233extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
234extern int nilfs_empty_dir(struct inode *);
235extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
236extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
237 struct page *, struct inode *);
238
239/* file.c */
240extern int nilfs_sync_file(struct file *, struct dentry *, int);
241
242/* ioctl.c */
243long nilfs_ioctl(struct file *, unsigned int, unsigned long);
244int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *);
245
246/* inode.c */
247extern struct inode *nilfs_new_inode(struct inode *, int);
248extern void nilfs_free_inode(struct inode *);
249extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
250extern void nilfs_set_inode_flags(struct inode *);
251extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
252extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
253extern struct inode *nilfs_iget(struct super_block *, unsigned long);
254extern void nilfs_update_inode(struct inode *, struct buffer_head *);
255extern void nilfs_truncate(struct inode *);
256extern void nilfs_delete_inode(struct inode *);
257extern int nilfs_setattr(struct dentry *, struct iattr *);
258extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
259 struct buffer_head **);
260extern int nilfs_inode_dirty(struct inode *);
261extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
262 unsigned);
263extern int nilfs_mark_inode_dirty(struct inode *);
264extern void nilfs_dirty_inode(struct inode *);
265
266/* namei.c */
267extern struct dentry *nilfs_get_parent(struct dentry *);
268
269/* super.c */
270extern struct inode *nilfs_alloc_inode(struct super_block *);
271extern void nilfs_destroy_inode(struct inode *);
272extern void nilfs_error(struct super_block *, const char *, const char *, ...)
273 __attribute__ ((format (printf, 3, 4)));
274extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
275 __attribute__ ((format (printf, 3, 4)));
276extern struct nilfs_super_block *
277nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
278extern int nilfs_store_magic_and_option(struct super_block *,
279 struct nilfs_super_block *, char *);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int);
281extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
282extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
283
284/* gcinode.c */
285int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
286 struct buffer_head **);
287int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
288 struct buffer_head **);
289int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
290int nilfs_init_gccache(struct the_nilfs *);
291void nilfs_destroy_gccache(struct the_nilfs *);
292void nilfs_clear_gcinode(struct inode *);
293struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
294void nilfs_remove_all_gcinode(struct the_nilfs *);
295
296/* gcdat.c */
297int nilfs_init_gcdat_inode(struct the_nilfs *);
298void nilfs_commit_gcdat_inode(struct the_nilfs *);
299void nilfs_clear_gcdat_inode(struct the_nilfs *);
300
301/*
302 * Inode and file operations
303 */
304extern struct file_operations nilfs_dir_operations;
305extern struct inode_operations nilfs_file_inode_operations;
306extern struct file_operations nilfs_file_operations;
307extern struct address_space_operations nilfs_aops;
308extern struct inode_operations nilfs_dir_inode_operations;
309extern struct inode_operations nilfs_special_inode_operations;
310extern struct inode_operations nilfs_symlink_inode_operations;
311
312/*
313 * filesystem type
314 */
315extern struct file_system_type nilfs_fs_type;
316
317
318#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..1bfbba9c0e9a
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,540 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
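/*
 * Return the buffer head that covers @block within @page, creating
 * empty buffers on the page first if it has none. The buffer is
 * touched, waited on, and returned with an extra reference taken.
 */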
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since neither the page cache of B-tree node pages nor the data page
62 * cache of pseudo inodes has a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, an old-style mark_buffer_dirty() is used instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95 * A shadow page cache uses assoc_mapping to point to its original
96 * page cache. The following code falls back to the original
97 * cache when the given cache is a shadow and the lookup missed.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
120/**
121 * nilfs_forget_buffer - discard dirty state
122 * @bh: buffer head of the buffer to be discarded; the caller's
123 *      reference to @bh is released on return
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page))
132 __nilfs_clear_page_dirty(page);
133
134 clear_buffer_uptodate(bh);
135 clear_buffer_mapped(bh);
136 bh->b_blocknr = -1;
137 ClearPageUptodate(page);
138 ClearPageMappedToDisk(page);
139 unlock_buffer(bh);
140 brelse(bh);
141}
142
143/**
144 * nilfs_copy_buffer -- copy buffer data and flags
145 * @dbh: destination buffer
146 * @sbh: source buffer
147 */
148void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
149{
150 void *kaddr0, *kaddr1;
151 unsigned long bits;
152 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
153 struct buffer_head *bh;
154
155 kaddr0 = kmap_atomic(spage, KM_USER0);
156 kaddr1 = kmap_atomic(dpage, KM_USER1);
157 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
158 kunmap_atomic(kaddr1, KM_USER1);
159 kunmap_atomic(kaddr0, KM_USER0);
160
161 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
162 dbh->b_blocknr = sbh->b_blocknr;
163 dbh->b_bdev = sbh->b_bdev;
164
165 bh = dbh;
166 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
167 while ((bh = bh->b_this_page) != dbh) {
168 lock_buffer(bh);
169 bits &= bh->b_state;
170 unlock_buffer(bh);
171 }
172 if (bits & (1UL << BH_Uptodate))
173 SetPageUptodate(dpage);
174 else
175 ClearPageUptodate(dpage);
176 if (bits & (1UL << BH_Mapped))
177 SetPageMappedToDisk(dpage);
178 else
179 ClearPageMappedToDisk(dpage);
180}
181
182/**
183 * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
184 * @page: page to be checked
185 *
186 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
187 * Otherwise, it returns a non-zero value.
188 */
189int nilfs_page_buffers_clean(struct page *page)
190{
191 struct buffer_head *bh, *head;
192
193 bh = head = page_buffers(page);
194 do {
195 if (buffer_dirty(bh))
196 return 0;
197 bh = bh->b_this_page;
198 } while (bh != head);
199 return 1;
200}
201
202void nilfs_page_bug(struct page *page)
203{
204 struct address_space *m;
205 unsigned long ino = 0;
206
207 if (unlikely(!page)) {
208 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
209 return;
210 }
211
212 m = page->mapping;
213 if (m) {
214 struct inode *inode = NILFS_AS_I(m);
215 if (inode != NULL)
216 ino = inode->i_ino;
217 }
218 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
219 "mapping=%p ino=%lu\n",
220 page, atomic_read(&page->_count),
221 (unsigned long long)page->index, page->flags, m, ino);
222
223 if (page_has_buffers(page)) {
224 struct buffer_head *bh, *head;
225 int i = 0;
226
227 bh = head = page_buffers(page);
228 do {
229 printk(KERN_CRIT
230 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
231 i++, bh, atomic_read(&bh->b_count),
232 (unsigned long long)bh->b_blocknr, bh->b_state);
233 bh = bh->b_this_page;
234 } while (bh != head);
235 }
236}
237
238/**
239 * nilfs_alloc_private_page - allocate a private page with buffer heads
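 * @bdev: block device to which the allocated buffers are assigned
 * @size: size of each buffer, in bytes
 * @state: initial state bits set on each buffer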
240 *
241 * Return Value: On success, a pointer to the allocated page is returned.
242 * On error, NULL is returned.
243 */
244struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
245 unsigned long state)
246{
247 struct buffer_head *bh, *head, *tail;
248 struct page *page;
249
250 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
251 if (unlikely(!page))
252 return NULL;
253
254 lock_page(page);
255 head = alloc_page_buffers(page, size, 0);
256 if (unlikely(!head)) {
257 unlock_page(page);
258 __free_page(page);
259 return NULL;
260 }
261
262 bh = head;
263 do {
264 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
265 tail = bh;
266 bh->b_bdev = bdev;
267 bh = bh->b_this_page;
268 } while (bh);
269
270 tail->b_this_page = head;
271 attach_page_buffers(page, head);
272
273 return page;
274}
275
276void nilfs_free_private_page(struct page *page)
277{
278 BUG_ON(!PageLocked(page));
279 BUG_ON(page->mapping);
280
281 if (page_has_buffers(page) && !try_to_free_buffers(page))
282 NILFS_PAGE_BUG(page, "failed to free page");
283
284 unlock_page(page);
285 __free_page(page);
286}
287
288/**
289 * nilfs_copy_page -- copy the page with buffers
290 * @dst: destination page
291 * @src: source page
292 * @copy_dirty: flag indicating whether to copy the dirty states of the page's buffer heads
293 *
294 * This function is for both data pages and btnode pages. The dirty flag
295 * must be handled by the caller. The page must not be under I/O.
296 * Both the src and dst pages must be locked.
297 */
298static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
299{
300 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
301 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
302
303 BUG_ON(PageWriteback(dst));
304
305 sbh = sbufs = page_buffers(src);
306 if (!page_has_buffers(dst))
307 create_empty_buffers(dst, sbh->b_size, 0);
308
309 if (copy_dirty)
310 mask |= (1UL << BH_Dirty);
311
312 dbh = dbufs = page_buffers(dst);
313 do {
314 lock_buffer(sbh);
315 lock_buffer(dbh);
316 dbh->b_state = sbh->b_state & mask;
317 dbh->b_blocknr = sbh->b_blocknr;
318 dbh->b_bdev = sbh->b_bdev;
319 sbh = sbh->b_this_page;
320 dbh = dbh->b_this_page;
321 } while (dbh != dbufs);
322
323 copy_highpage(dst, src);
324
325 if (PageUptodate(src) && !PageUptodate(dst))
326 SetPageUptodate(dst);
327 else if (!PageUptodate(src) && PageUptodate(dst))
328 ClearPageUptodate(dst);
329 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
330 SetPageMappedToDisk(dst);
331 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
332 ClearPageMappedToDisk(dst);
333
334 do {
335 unlock_buffer(sbh);
336 unlock_buffer(dbh);
337 sbh = sbh->b_this_page;
338 dbh = dbh->b_this_page;
339 } while (dbh != dbufs);
340}
341
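/*
 * Copy all dirty pages, together with their dirty state, from the
 * source page cache @smap into the destination cache @dmap. Returns 0
 * on success, or -ENOMEM when a destination page cannot be allocated.
 */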
342int nilfs_copy_dirty_pages(struct address_space *dmap,
343 struct address_space *smap)
344{
345 struct pagevec pvec;
346 unsigned int i;
347 pgoff_t index = 0;
348 int err = 0;
349
350 pagevec_init(&pvec, 0);
351repeat:
352 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
353 PAGEVEC_SIZE))
354 return 0;
355
356 for (i = 0; i < pagevec_count(&pvec); i++) {
357 struct page *page = pvec.pages[i], *dpage;
358
359 lock_page(page);
360 if (unlikely(!PageDirty(page)))
361 NILFS_PAGE_BUG(page, "inconsistent dirty state");
362
363 dpage = grab_cache_page(dmap, page->index);
364 if (unlikely(!dpage)) {
365 /* No empty page is added to the page cache */
366 err = -ENOMEM;
367 unlock_page(page);
368 break;
369 }
370 if (unlikely(!page_has_buffers(page)))
371 NILFS_PAGE_BUG(page,
372 "found empty page in dat page cache");
373
374 nilfs_copy_page(dpage, page, 1);
375 __set_page_dirty_nobuffers(dpage);
376
377 unlock_page(dpage);
378 page_cache_release(dpage);
379 unlock_page(page);
380 }
381 pagevec_release(&pvec);
382 cond_resched();
383
384 if (likely(!err))
385 goto repeat;
386 return err;
387}
388
389/**
390 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
391 * @dmap: destination page cache
392 * @smap: source page cache
393 *
394 * No pages must be added to the cache during this process.
395 * This must be ensured by the caller.
396 */
397void nilfs_copy_back_pages(struct address_space *dmap,
398 struct address_space *smap)
399{
400 struct pagevec pvec;
401 unsigned int i, n;
402 pgoff_t index = 0;
403 int err;
404
405 pagevec_init(&pvec, 0);
406repeat:
407 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
408 if (!n)
409 return;
410 index = pvec.pages[n - 1]->index + 1;
411
412 for (i = 0; i < pagevec_count(&pvec); i++) {
413 struct page *page = pvec.pages[i], *dpage;
414 pgoff_t offset = page->index;
415
416 lock_page(page);
417 dpage = find_lock_page(dmap, offset);
418 if (dpage) {
419 /* override existing page on the destination cache */
420 WARN_ON(PageDirty(dpage));
421 nilfs_copy_page(dpage, page, 0);
422 unlock_page(dpage);
423 page_cache_release(dpage);
424 } else {
425 struct page *page2;
426
427 /* move the page to the destination cache */
428 spin_lock_irq(&smap->tree_lock);
429 page2 = radix_tree_delete(&smap->page_tree, offset);
430 WARN_ON(page2 != page);
431
432 smap->nrpages--;
433 spin_unlock_irq(&smap->tree_lock);
434
435 spin_lock_irq(&dmap->tree_lock);
436 err = radix_tree_insert(&dmap->page_tree, offset, page);
437 if (unlikely(err < 0)) {
438 WARN_ON(err == -EEXIST);
439 page->mapping = NULL;
440 page_cache_release(page); /* for cache */
441 } else {
442 page->mapping = dmap;
443 dmap->nrpages++;
444 if (PageDirty(page))
445 radix_tree_tag_set(&dmap->page_tree,
446 offset,
447 PAGECACHE_TAG_DIRTY);
448 }
449 spin_unlock_irq(&dmap->tree_lock);
450 }
451 unlock_page(page);
452 }
453 pagevec_release(&pvec);
454 cond_resched();
455
456 goto repeat;
457}
458
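/*
 * Walk the dirty pages of @mapping and cancel their dirty state,
 * clearing the uptodate and mapped flags of both the pages and their
 * buffers.
 */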
459void nilfs_clear_dirty_pages(struct address_space *mapping)
460{
461 struct pagevec pvec;
462 unsigned int i;
463 pgoff_t index = 0;
464
465 pagevec_init(&pvec, 0);
466
467 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
468 PAGEVEC_SIZE)) {
469 for (i = 0; i < pagevec_count(&pvec); i++) {
470 struct page *page = pvec.pages[i];
471 struct buffer_head *bh, *head;
472
473 lock_page(page);
474 ClearPageUptodate(page);
475 ClearPageMappedToDisk(page);
476 bh = head = page_buffers(page);
477 do {
478 lock_buffer(bh);
479 clear_buffer_dirty(bh);
480 clear_buffer_nilfs_volatile(bh);
481 clear_buffer_uptodate(bh);
482 clear_buffer_mapped(bh);
483 unlock_buffer(bh);
484 bh = bh->b_this_page;
485 } while (bh != head);
486
487 __nilfs_clear_page_dirty(page);
488 unlock_page(page);
489 }
490 pagevec_release(&pvec);
491 cond_resched();
492 }
493}
494
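/*
 * Count the buffers of @page that overlap the byte range [@from, @to)
 * and are not dirty.
 */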
495unsigned nilfs_page_count_clean_buffers(struct page *page,
496 unsigned from, unsigned to)
497{
498 unsigned block_start, block_end;
499 struct buffer_head *bh, *head;
500 unsigned nc = 0;
501
502 for (bh = head = page_buffers(page), block_start = 0;
503 bh != head || !block_start;
504 block_start = block_end, bh = bh->b_this_page) {
505 block_end = block_start + bh->b_size;
506 if (block_end > from && block_start < to && !buffer_dirty(bh))
507 nc++;
508 }
509 return nc;
510}
511
512/*
513 * NILFS2 needs clear_page_dirty() in the following two cases:
514 *
515 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
516 * page dirty flags when it copies back pages from the shadow cache
517 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
518 * (dat->{i_mapping,i_btnode_cache}).
519 *
520 * 2) Some B-tree operations like insertion or deletion may dispose of dirty
521 * buffers, and this requires cancelling the dirty state of their pages.
522 */
523int __nilfs_clear_page_dirty(struct page *page)
524{
525 struct address_space *mapping = page->mapping;
526
527 if (mapping) {
528 spin_lock_irq(&mapping->tree_lock);
529 if (test_bit(PG_dirty, &page->flags)) {
530 radix_tree_tag_clear(&mapping->page_tree,
531 page_index(page),
532 PAGECACHE_TAG_DIRTY);
533 spin_unlock_irq(&mapping->tree_lock);
534 return clear_page_dirty_for_io(page);
535 }
536 spin_unlock_irq(&mapping->tree_lock);
537 return 0;
538 }
539 return TestClearPageDirty(page);
540}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
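/*
 * Return the @count-th buffer head of @page (counting from zero) with
 * an extra reference taken on it.
 */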
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
75
76#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..6ade0963fc1d
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,929 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block (per block) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
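/*
 * Unpack the little-endian fields of an on-disk segment summary into
 * @ssi and derive the number of summary blocks and file blocks of the
 * partial segment.
 */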
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - compute a CRC over a run of consecutive blocks
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of the start block
119 * @sum: place to store the result
120 * @offset: byte offset in the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: disk block number (DBN) of the start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: check only the segment summary CRC, 1: also check the data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
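/*
 * Return a pointer to the next @bytes bytes of segment summary data and
 * advance *@offset past them, reading the following summary block when
 * the current one has been consumed (items are assumed not to straddle
 * block boundaries). Returns NULL on read failure.
 */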
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
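/*
 * Skip @count summary items of @bytes bytes each, reading ahead to the
 * block that contains the next unread item when the run crosses block
 * boundaries.
 */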
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
317
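/*
 * Scan the finfo/binfo entries of one partial segment's summary and
 * queue a nilfs_recovery_block on @head for each data block that may
 * need to be salvaged; node blocks are skipped.
 */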
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_recovery_info *ri)
411{
412 struct list_head *head = &ri->ri_used_segments;
413 struct nilfs_segment_entry *ent, *n;
414 struct inode *sufile = nilfs->ns_sufile;
415 __u64 segnum[4];
416 time_t mtime;
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 /*
426 * Releasing the next segment of the latest super root.
427 * The next segment is invalidated by this recovery.
428 */
429 err = nilfs_sufile_free(sufile, segnum[1]);
430 if (unlikely(err))
431 goto failed;
432
433 err = -ENOMEM;
434 for (i = 1; i < 4; i++) {
435 ent = nilfs_alloc_segment_entry(segnum[i]);
436 if (unlikely(!ent))
437 goto failed;
438 list_add_tail(&ent->list, head);
439 }
440
441 /*
442 * Collecting segments written after the latest super root.
443 * These are marked dirty to avoid being reallocated in the next write.
444 */
445 mtime = get_seconds();
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum == segnum[0]) {
448 list_del(&ent->list);
449 nilfs_free_segment_entry(ent);
450 continue;
451 }
452 err = nilfs_open_segment_entry(ent, sufile);
453 if (unlikely(err))
454 goto failed;
455 if (!nilfs_segment_usage_dirty(ent->raw_su)) {
456 /* make the segment garbage */
457 ent->raw_su->su_nblocks = cpu_to_le32(0);
458 ent->raw_su->su_lastmod = cpu_to_le32(mtime);
459 nilfs_segment_usage_set_dirty(ent->raw_su);
460 }
461 list_del(&ent->list);
462 nilfs_close_segment_entry(ent, sufile);
463 nilfs_free_segment_entry(ent);
464 }
465
466 /* Allocate new segments for recovery */
467 err = nilfs_sufile_alloc(sufile, &segnum[0]);
468 if (unlikely(err))
469 goto failed;
470
471 nilfs->ns_pseg_offset = 0;
472 nilfs->ns_seg_seq = ri->ri_seq + 2;
473 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
474 return 0;
475
476 failed:
477 /* No need to recover sufile because it will be destroyed on error */
478 return err;
479}
480
481static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
482 struct nilfs_recovery_block *rb,
483 struct page *page)
484{
485 struct buffer_head *bh_org;
486 void *kaddr;
487
488 bh_org = sb_bread(sbi->s_super, rb->blocknr);
489 if (unlikely(!bh_org))
490 return -EIO;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
494 kunmap_atomic(kaddr, KM_USER0);
495 brelse(bh_org);
496 return 0;
497}
498
499static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
500 struct list_head *head,
501 unsigned long *nr_salvaged_blocks)
502{
503 struct inode *inode;
504 struct nilfs_recovery_block *rb, *n;
505 unsigned blocksize = sbi->s_super->s_blocksize;
506 struct page *page;
507 loff_t pos;
508 int err = 0, err2 = 0;
509
510 list_for_each_entry_safe(rb, n, head, list) {
511 inode = nilfs_iget(sbi->s_super, rb->ino);
512 if (IS_ERR(inode)) {
513 err = PTR_ERR(inode);
514 inode = NULL;
515 goto failed_inode;
516 }
517
518 pos = rb->blkoff << inode->i_blkbits;
519 page = NULL;
520 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
521 0, &page, NULL, nilfs_get_block);
522 if (unlikely(err))
523 goto failed_inode;
524
525 err = nilfs_recovery_copy_block(sbi, rb, page);
526 if (unlikely(err))
527 goto failed_page;
528
529 err = nilfs_set_file_dirty(sbi, inode, 1);
530 if (unlikely(err))
531 goto failed_page;
532
533 block_write_end(NULL, inode->i_mapping, pos, blocksize,
534 blocksize, page, NULL);
535
536 unlock_page(page);
537 page_cache_release(page);
538
539 (*nr_salvaged_blocks)++;
540 goto next;
541
542 failed_page:
543 unlock_page(page);
544 page_cache_release(page);
545
546 failed_inode:
547 printk(KERN_WARNING
548 "NILFS warning: error recovering data block "
549 "(err=%d, ino=%lu, block-offset=%llu)\n",
550 err, rb->ino, (unsigned long long)rb->blkoff);
551 if (!err2)
552 err2 = err;
553 next:
554 iput(inode); /* iput(NULL) is just ignored */
555 list_del_init(&rb->list);
556 kfree(rb);
557 }
558 return err2;
559}
560
561/**
562 * nilfs_do_roll_forward - salvage logical segments newer than the latest
563 * checkpoint
564 * @nilfs: the_nilfs
565 * @sbi: nilfs_sb_info
566 * @ri: pointer to a nilfs_recovery_info
567 */
568static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
569 struct nilfs_sb_info *sbi,
570 struct nilfs_recovery_info *ri)
571{
572 struct nilfs_segsum_info ssi;
573 sector_t pseg_start;
574 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
575 unsigned long nsalvaged_blocks = 0;
576 u64 seg_seq;
577 __u64 segnum, nextnum = 0;
578 int empty_seg = 0;
579 int err = 0, ret;
580 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
581 enum {
582 RF_INIT_ST,
583 RF_DSYNC_ST, /* scanning data-sync segments */
584 };
585 int state = RF_INIT_ST;
586
587 nilfs_attach_writer(nilfs, sbi);
588 pseg_start = ri->ri_lsegs_start;
589 seg_seq = ri->ri_lsegs_start_seq;
590 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
591 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
592
593 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
594
595 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
596 if (ret) {
597 if (ret == NILFS_SEG_FAIL_IO) {
598 err = -EIO;
599 goto failed;
600 }
601 goto strayed;
602 }
603 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
604 goto confused;
605
606 /* Found a valid partial segment; do recovery actions */
607 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
608 empty_seg = 0;
609 nilfs->ns_ctime = ssi.ctime;
610 if (!(ssi.flags & NILFS_SS_GC))
611 nilfs->ns_nongc_ctime = ssi.ctime;
612
613 switch (state) {
614 case RF_INIT_ST:
615 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
616 goto try_next_pseg;
617 state = RF_DSYNC_ST;
618 /* Fall through */
619 case RF_DSYNC_ST:
620 if (!NILFS_SEG_DSYNC(&ssi))
621 goto confused;
622
623 err = collect_blocks_from_segsum(
624 sbi, pseg_start, &ssi, &dsync_blocks);
625 if (unlikely(err))
626 goto failed;
627 if (NILFS_SEG_LOGEND(&ssi)) {
628 err = recover_dsync_blocks(
629 sbi, &dsync_blocks, &nsalvaged_blocks);
630 if (unlikely(err))
631 goto failed;
632 state = RF_INIT_ST;
633 }
634 break; /* Fall through to try_next_pseg */
635 }
636
637 try_next_pseg:
638 if (pseg_start == ri->ri_lsegs_end)
639 break;
640 pseg_start += ssi.nblocks;
641 if (pseg_start < seg_end)
642 continue;
643 goto feed_segment;
644
645 strayed:
646 if (pseg_start == ri->ri_lsegs_end)
647 break;
648
649 feed_segment:
650			/* Move on to the next full segment */
651 if (empty_seg++)
652 break;
653 seg_seq++;
654 segnum = nextnum;
655 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
656 pseg_start = seg_start;
657 }
658
659 if (nsalvaged_blocks) {
660 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
661 sbi->s_super->s_id, nsalvaged_blocks);
662 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
663 }
664 out:
665 dispose_recovery_list(&dsync_blocks);
666 nilfs_detach_writer(sbi->s_nilfs, sbi);
667 return err;
668
669 confused:
670 err = -EINVAL;
671 failed:
672 printk(KERN_ERR
673 "NILFS (device %s): Error roll-forwarding "
674 "(err=%d, pseg block=%llu). ",
675 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
676 goto out;
677}
678
679static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
680 struct nilfs_sb_info *sbi,
681 struct nilfs_recovery_info *ri)
682{
683 struct buffer_head *bh;
684 int err;
685
686 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
687 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
688 return;
689
690 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
691 BUG_ON(!bh);
692 memset(bh->b_data, 0, bh->b_size);
693 set_buffer_dirty(bh);
694 err = sync_dirty_buffer(bh);
695 if (unlikely(err))
696 printk(KERN_WARNING
697 "NILFS warning: buffer sync write failed during "
698 "post-cleaning of recovery.\n");
699 brelse(bh);
700}
701
702/**
703 * nilfs_recover_logical_segments - salvage logical segments written after
704 * the latest super root
705 * @nilfs: the_nilfs
706 * @sbi: nilfs_sb_info
707 * @ri: pointer to a nilfs_recovery_info struct to store search results.
708 *
709 * Return Value: On success, 0 is returned. On error, one of the following
710 * negative error codes is returned.
711 *
712 * %-EINVAL - Inconsistent filesystem state.
713 *
714 * %-EIO - I/O error
715 *
716 * %-ENOSPC - No space left on device (only in a panic state).
717 *
718 * %-ERESTARTSYS - Interrupted.
719 *
720 * %-ENOMEM - Insufficient memory available.
721 */
722int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
723 struct nilfs_sb_info *sbi,
724 struct nilfs_recovery_info *ri)
725{
726 int err;
727
728 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
729 return 0;
730
731 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
732 if (unlikely(err)) {
733 printk(KERN_ERR
734 "NILFS: error loading the latest checkpoint.\n");
735 return err;
736 }
737
738 err = nilfs_do_roll_forward(nilfs, sbi, ri);
739 if (unlikely(err))
740 goto failed;
741
742 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
743 err = nilfs_prepare_segment_for_recovery(nilfs, ri);
744 if (unlikely(err)) {
745 printk(KERN_ERR "NILFS: Error preparing segments for "
746 "recovery.\n");
747 goto failed;
748 }
749
750 err = nilfs_attach_segment_constructor(sbi);
751 if (unlikely(err))
752 goto failed;
753
754 set_nilfs_discontinued(nilfs);
755 err = nilfs_construct_segment(sbi->s_super);
756 nilfs_detach_segment_constructor(sbi);
757
758 if (unlikely(err)) {
759 printk(KERN_ERR "NILFS: Oops! recovery failed. "
760 "(err=%d)\n", err);
761 goto failed;
762 }
763
764 nilfs_finish_roll_forward(nilfs, sbi, ri);
765 }
766
767 nilfs_detach_checkpoint(sbi);
768 return 0;
769
770 failed:
771 nilfs_detach_checkpoint(sbi);
772 nilfs_mdt_clear(nilfs->ns_cpfile);
773 nilfs_mdt_clear(nilfs->ns_sufile);
774 nilfs_mdt_clear(nilfs->ns_dat);
775 return err;
776}
777
778/**
779 * nilfs_search_super_root - search the latest valid super root
780 * @nilfs: the_nilfs
781 * @sbi: nilfs_sb_info
782 * @ri: pointer to a nilfs_recovery_info struct to store search results.
783 *
784 * nilfs_search_super_root() looks for the latest super-root from a partial
785 * segment pointed to by the superblock. It sets up struct the_nilfs through
786 * this search. It fills nilfs_recovery_info (ri) required for recovery.
787 *
788 * Return Value: On success, 0 is returned. On error, one of the following
789 * negative error codes is returned.
790 *
791 * %-EINVAL - No valid segment found
792 *
793 * %-EIO - I/O error
794 */
795int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
796 struct nilfs_recovery_info *ri)
797{
798 struct nilfs_segsum_info ssi;
799 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
800 sector_t seg_start, seg_end; /* range of full segment (block number) */
801 u64 seg_seq;
802 __u64 segnum, nextnum = 0;
803 __u64 cno;
804 struct nilfs_segment_entry *ent;
805 LIST_HEAD(segments);
806 int empty_seg = 0, scan_newer = 0;
807 int ret;
808
809 pseg_start = nilfs->ns_last_pseg;
810 seg_seq = nilfs->ns_last_seq;
811 cno = nilfs->ns_last_cno;
812 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
813
814 /* Calculate range of segment */
815 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
816
817 for (;;) {
818 /* Load segment summary */
819 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
820 if (ret) {
821 if (ret == NILFS_SEG_FAIL_IO)
822 goto failed;
823 goto strayed;
824 }
825 pseg_end = pseg_start + ssi.nblocks - 1;
826 if (unlikely(pseg_end > seg_end)) {
827 ret = NILFS_SEG_FAIL_CONSISTENCY;
828 goto strayed;
829 }
830
831 /* A valid partial segment */
832 ri->ri_pseg_start = pseg_start;
833 ri->ri_seq = seg_seq;
834 ri->ri_segnum = segnum;
835 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
836 ri->ri_nextnum = nextnum;
837 empty_seg = 0;
838
839 if (!NILFS_SEG_HAS_SR(&ssi)) {
840 if (!scan_newer) {
841 /* This will never happen because a superblock
842 (last_segment) always points to a pseg
843 having a super root. */
844 ret = NILFS_SEG_FAIL_CONSISTENCY;
845 goto failed;
846 }
847 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
848 ri->ri_lsegs_start = pseg_start;
849 ri->ri_lsegs_start_seq = seg_seq;
850 }
851 if (NILFS_SEG_LOGEND(&ssi))
852 ri->ri_lsegs_end = pseg_start;
853 goto try_next_pseg;
854 }
855
856 /* A valid super root was found. */
857 ri->ri_cno = cno++;
858 ri->ri_super_root = pseg_end;
859 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
860
861 nilfs_dispose_segment_list(&segments);
862 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
863 + ssi.nblocks - seg_start;
864 nilfs->ns_seg_seq = seg_seq;
865 nilfs->ns_segnum = segnum;
866 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
867 nilfs->ns_ctime = ssi.ctime;
868 nilfs->ns_nextnum = nextnum;
869
870 if (scan_newer)
871 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
872 else {
873 if (nilfs->ns_mount_state & NILFS_VALID_FS)
874 goto super_root_found;
875 scan_newer = 1;
876 }
877
878 /* reset region for roll-forward */
879 pseg_start += ssi.nblocks;
880 if (pseg_start < seg_end)
881 continue;
882 goto feed_segment;
883
884 try_next_pseg:
885		/* Still on course, or an inconsistent state was found */
886 pseg_start += ssi.nblocks;
887 if (pseg_start < seg_end)
888 continue;
889 goto feed_segment;
890
891 strayed:
892 /* Off the trail */
893 if (!scan_newer)
894 /*
895 * This can happen if a checkpoint was written without
896 * barriers, or as a result of an I/O failure.
897 */
898 goto failed;
899
900 feed_segment:
901		/* Move on to the next full segment */
902 if (empty_seg++)
903 goto super_root_found; /* found a valid super root */
904
905 ent = nilfs_alloc_segment_entry(segnum);
906 if (unlikely(!ent)) {
907 ret = -ENOMEM;
908 goto failed;
909 }
910 list_add_tail(&ent->list, &segments);
911
912 seg_seq++;
913 segnum = nextnum;
914 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
915 pseg_start = seg_start;
916 }
917
918 super_root_found:
919 /* Updating pointers relating to the latest checkpoint */
920 list_splice(&segments, ri->ri_used_segments.prev);
921 nilfs->ns_last_pseg = sr_pseg_start;
922 nilfs->ns_last_seq = nilfs->ns_seg_seq;
923 nilfs->ns_last_cno = ri->ri_cno;
924 return 0;
925
926 failed:
927 nilfs_dispose_segment_list(&segments);
928 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
929}
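/*
 * A minimal sketch of how a mount path might drive the two entry points
 * above; the caller shown here is hypothetical, and the setup of "nilfs"
 * and "sbi" (reading the super block, attaching the sb_info) is assumed
 * to happen elsewhere in the tree:
 *
 *	struct nilfs_recovery_info ri;
 *	int err;
 *
 *	memset(&ri, 0, sizeof(ri));
 *	INIT_LIST_HEAD(&ri.ri_used_segments);
 *
 *	err = nilfs_search_super_root(nilfs, sbi, &ri);
 *	if (!err)
 *		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
 *	nilfs_dispose_segment_list(&ri.ri_used_segments);
 */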
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It covers s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
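/*
 * For example, a mount-option parser might flip a flag with these
 * helpers (NILFS_MOUNT_BARRIER is assumed to be one of the NILFS_MOUNT_*
 * bits defined in include/linux/nilfs2_fs.h):
 *
 *	nilfs_set_opt(sbi, BARRIER);
 *	if (nilfs_test_opt(sbi, BARRIER))
 *		;	/* issue barrier writes */
 *	nilfs_clear_opt(sbi, BARRIER);
 */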
101
102#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
142 * Set up the segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
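/*
 * Note that the first crc32_le() call above starts just past ss_datasum
 * and ss_sumsum, so the checksum fields that lead the on-disk summary
 * header are excluded from the sum that protects them;
 * nilfs_segbuf_fill_in_data_crc() below skips only ss_datasum for the
 * same reason.
 */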
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
256 /* to be detected by submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment.
306 * @sb: super block
307 * @start: beginning disk block number of this BIO.
308 * @nr_vecs: request size of page vector.
309 *
310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
312 * Return Value: On success, pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
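/*
 * GFP_NOWAIT keeps this allocation from sleeping in the write-out path;
 * when a bio of the requested size is unavailable, the loop above
 * retries with progressively halved vector counts, and the caller sees
 * -ENOMEM only if even a single-vector bio cannot be allocated.
 */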
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @wi: nilfs_write_info
415 *
416 * Return Value: On success, 0 is returned. On error, one of the following
417 * negative error codes is returned.
418 *
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
2 * seglist.h - expedient structures and routines to handle lists of segments
3 * (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally.
36 It must be cancelled if the
37 construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..fb70ec3be20e
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2977 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
52 appended in collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
59 SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
60 creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
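/*
 * Like the time_after() helpers for jiffies, these comparisons stay
 * correct across 32-bit wrap-around: nilfs_cnt32_gt(0x00000001,
 * 0xffffffff) is true because the signed difference
 * (__s32)(b) - (__s32)(a) is negative, i.e. 0x00000001 is "after"
 * 0xffffffff.
 */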
117
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
164 * If journal_info field is occupied by other FS,
165 * it is saved and will be restored on
166 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
196 * the segment semaphore, to make a segment construction and write tasks
197 * exclusive. The function is used with nilfs_transaction_commit() in pairs.
198 * The region enclosed by these two functions can be nested. To avoid a
199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information. The struct is initialized and hooked onto the current task
203 * in the outermost call. If a pre-allocated struct is given to @ti, it is
204 * used instead; otherwise a new struct is allocated from the slab cache.
205 *
206 * When @vacancy_check flag is set, this function will check the amount of
207 * free space and will wait for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
210 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
214 * %-ENOSPC - No space left on device
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
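/*
 * A typical caller brackets a file operation with the begin/commit pair;
 * a minimal sketch, assuming an on-stack transaction info and a caller
 * that wants the vacancy check:
 *
 *	struct nilfs_transaction_info ti;
 *	int err;
 *
 *	err = nilfs_transaction_begin(sb, &ti, 1);
 *	if (unlikely(err))
 *		return err;
 *	... modify inodes or blocks ...
 *	return nilfs_transaction_commit(sb);
 *
 * An error in between is unwound with nilfs_transaction_abort(sb)
 * instead of the commit.
 */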
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
253 * in outermost call of this function. If a commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
389
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
451	/* The sizes of finfo and binfo are small enough relative to blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
524 /* Substitution to vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
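/*
 * The "required + 1" test above reserves room for both a possible extra
 * summary block and the payload block itself; when the current segment
 * buffer cannot hold them, the open finfo is closed and collection
 * continues in the next segment buffer.
 */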
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
666
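/*
 * nilfs_lookup_dirty_data_buffers - gather dirty data buffers of @inode
 *
 * Scans the page cache for pages tagged dirty within [@start, @end],
 * attaches buffers to pages that have none yet, and queues up to
 * @nlimit dirty buffers on @listp.  Returns the number of buffers
 * collected.
 */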
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680	 * A valid range is given for syncing data pages. The
681	 * range is rounded to page boundaries; extra dirty
682	 * buffers may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
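/*
 * nilfs_dispose_list - detach inodes from a dirty list and drop them
 *
 * Inodes still marked NILFS_I_DIRTY are re-queued on s_dirty_files
 * unless @force is set.  References are released in batches of
 * SC_N_INODEVEC so that iput() is never called under s_inode_lock.
 */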
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
855	/* The following code duplicates part of cpfile; however, it is
856	   needed to collect the checkpoint even if it was not newly
857	   created */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
909	{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1011	{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
1020	WARN_ON(err); /* never happens */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
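/*
 * nilfs_segctor_apply_buffers - run a collection callback over a
 * buffer list
 *
 * Applies @collect to each buffer on @listp, dropping the list
 * references as it goes.  On failure, or when @collect is NULL, the
 * remaining buffers are simply released.
 */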
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
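/*
 * nilfs_segctor_scan_file - collect the dirty blocks of a single file
 *
 * Gathers data, b-tree node, and bmap buffers of @inode and feeds them
 * to the corresponding callbacks in @sc_ops.  The NILFS_CF_NODE stage
 * flag records that the data pass has already completed, so a
 * collection interrupted by a full segment (-E2BIG) resumes with the
 * node pass.
 */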
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
1101 BUG_ON(!err); /* always receive -E2BIG or true error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
1153 /* always receive -E2BIG or true error if n > rest */
1154 }
1155 return err;
1156}
1157
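/*
 * nilfs_segctor_collect_blocks - state machine of the collection stage
 *
 * Walks through the collection stages (GC inodes, regular files, ifile,
 * cpfile, sufile, DAT, and finally the super root) with intentional
 * fall-through between the switch cases.  The current position is kept
 * in sci->sc_stage so that a run aborted with -E2BIG can be resumed
 * after more segment buffers have been added.
 */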
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
1306
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
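/*
 * nilfs_segctor_begin_construction - set up the first segment buffer
 *
 * Maps the head segment buffer onto the current on-disk position,
 * shifting to the next full segment when fewer than
 * NILFS_PSEG_MIN_BLOCKS remain, and allocates the next segment from
 * the sufile if the current one is the last allocated.  Leftover
 * segment buffers of a previous construction are freed.
 */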
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified with nextnum might be allocated during
1382 * the previous construction, the buffer including its segusage may
1383 * not be dirty. The following call ensures that the buffer is dirty
1384 * and will pin the buffer on memory until the sufile is written.
1385 */
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
1506	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
1527	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
1535	WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
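/*
 * nilfs_segctor_collect - collection retry loop
 *
 * Repeats the block collection, doubling the number of segments added
 * (up to SC_MAX_SEGDELTA) each time the collection overflows with
 * -E2BIG, and restoring the stage saved in prev_stage before retrying.
 * Unused trailing segment buffers are truncated on success.
 */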
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
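/*
 * nilfs_segctor_update_payload_blocknr - assign disk block numbers
 *
 * Walks the payload buffers of @segbuf in parallel with the finfo
 * entries in the segment summary, lets the bmap assign the real block
 * number of each buffer, and writes the resulting binfo through the
 * per-file write_data_binfo/write_node_binfo operations.
 */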
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756	/* For split b-tree node pages, this function may be called
1757	   twice or more.  This check ignores the second and later calls. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
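/*
 * nilfs_segctor_prepare_write - move collected pages into writeback
 *
 * Clears the dirty flag and sets the writeback state on every page
 * that carries segment summary or payload buffers.  Pages that might
 * be modified while under I/O (see nilfs_test_page_to_be_frozen) are
 * frozen by copying their buffers to private clone pages.
 */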
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894 /* For b-tree node pages, this function may be called twice
1895 or more because they might be split in a segment.
1896	   This check ensures that cleanup has been done for all
1897 buffers in a split btnode page. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
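/*
 * nilfs_segctor_complete_write - finish a successful segment write
 *
 * Cleans the dirty state of all written buffers, ends the writeback of
 * their pages, updates the log cursor kept in the_nilfs, and, when a
 * super root was written, advances the checkpoint number and marks the
 * super block dirty for a later update.
 */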
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007 * We assume that the buffers which belong to the same page
2008 * continue over the buffer list.
2009	 * Under this assumption, the last BH of each page is
2010 * identifiable by the discontinuity of bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code of B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045	 * Since pages may continue over multiple segment buffers,
2046	 * the end of the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
2255
2256/**
2257	 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
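/*
 * Synchronous construction requests are serialized with sequence
 * numbers: nilfs_segctor_sync() queues a nilfs_segctor_wait_request on
 * sc_wait_request and sleeps until nilfs_segctor_wakeup() completes
 * every request whose sequence number the daemon has served.
 */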
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412	 * Return Value: On success, 0 is returned. On errors, one of the following
2413	 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449	 * Return Value: On success, 0 is returned. On errors, one of the following
2450	 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
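/*
 * A construction pass driven by the daemon or a cleaner is bracketed
 * by nilfs_segctor_accept(), which snapshots the current request
 * sequence number and stops the timer, and nilfs_segctor_notify(),
 * which clears the served flush/commit requests and wakes up waiters.
 */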
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
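/*
 * Editorial note: sc_flush_request is treated as an inode bitmap --
 * FLUSH_FILE_BIT marks pending data-file flushes and FLUSH_DAT_BIT
 * (keyed to the DAT inode number) marks pending DAT flushes.  A flush
 * presumably writes out dirty blocks without creating a checkpoint
 * (cf. the NILFS_SC_PRIOR_FLUSH flag in segment.h).
 */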
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
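/*
 * Editorial note: after a successful construction, the super block is
 * committed only when a super root was written while the log was marked
 * discontinued; this brings the last-segment position recorded on disk
 * back in sync and lets nilfs_sync_super() clear the discontinued flag.
 */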
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, void __user *argp)
2593{
2594 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2595 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2596 struct the_nilfs *nilfs = sbi->s_nilfs;
2597 struct nilfs_transaction_info ti;
2598 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2599 int err;
2600
2601 if (unlikely(!sci))
2602 return -EROFS;
2603
2604 nilfs_transaction_lock(sbi, &ti, 1);
2605
2606 err = nilfs_init_gcdat_inode(nilfs);
2607 if (unlikely(err))
2608 goto out_unlock;
2609 err = nilfs_ioctl_prepare_clean_segments(nilfs, argp);
2610 if (unlikely(err))
2611 goto out_unlock;
2612
2613 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2614
2615 for (;;) {
2616 nilfs_segctor_accept(sci, &req);
2617 err = nilfs_segctor_construct(sci, &req);
2618 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2619 nilfs_segctor_notify(sci, &req);
2620
2621 if (likely(!err))
2622 break;
2623
2624 nilfs_warning(sb, __func__,
2625 "segment construction failed. (err=%d)", err);
2626 set_current_state(TASK_INTERRUPTIBLE);
2627 schedule_timeout(sci->sc_interval);
2628 }
2629
2630 out_unlock:
2631 nilfs_clear_gcdat_inode(nilfs);
2632 nilfs_transaction_unlock(sbi);
2633 return err;
2634}
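/*
 * Editorial note: nilfs_clean_segments() keeps retrying the
 * construction (sleeping sc_interval jiffies between attempts) until it
 * succeeds; GC inodes whose blocks have been written are torn down
 * after each pass by nilfs_remove_written_gcinodes().
 */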
2635
2636static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2637{
2638 struct nilfs_sb_info *sbi = sci->sc_sbi;
2639 struct nilfs_transaction_info ti;
2640 struct nilfs_segctor_req req = { .mode = mode };
2641
2642 nilfs_transaction_lock(sbi, &ti, 0);
2643
2644 nilfs_segctor_accept(sci, &req);
2645 nilfs_segctor_construct(sci, &req);
2646 nilfs_segctor_notify(sci, &req);
2647
2648 /*
2649 * An unclosed segment should be retried. We do this using sc_timer.
2650 * When sc_timer expires, it triggers a complete construction, which
2651 * closes the current logical segment.
2652 */
2653 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2654 nilfs_segctor_start_timer(sci);
2655
2656 nilfs_transaction_unlock(sbi);
2657}
2658
2659static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2660{
2661 int mode = 0;
2662 int err;
2663
2664 spin_lock(&sci->sc_state_lock);
2665 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2666 SC_FLUSH_DAT : SC_FLUSH_FILE;
2667 spin_unlock(&sci->sc_state_lock);
2668
2669 if (mode) {
2670 err = nilfs_segctor_do_construct(sci, mode);
2671
2672 spin_lock(&sci->sc_state_lock);
2673 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2674 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2675 spin_unlock(&sci->sc_state_lock);
2676 }
2677 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2678}
2679
2680static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2681{
2682 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2683 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2684 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2685 return SC_FLUSH_FILE;
2686 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2687 return SC_FLUSH_DAT;
2688 }
2689 return SC_LSEG_SR;
2690}
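/*
 * Editorial note: a plain flush is chosen unless the current logical
 * segment has stayed unclosed longer than sc_mjcp_freq, and only when
 * the pending requests are purely data-file or purely DAT; any other
 * combination escalates to a full construction with a super root
 * (SC_LSEG_SR).
 */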
2691
2692/**
2693 * nilfs_segctor_thread - main loop of the segment constructor thread.
2694 * @arg: pointer to a struct nilfs_sc_info.
2695 *
2696 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2697 * to execute segment constructions.
2698 */
2699static int nilfs_segctor_thread(void *arg)
2700{
2701 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2702 struct timer_list timer;
2703 int timeout = 0;
2704
2705 init_timer(&timer);
2706 timer.data = (unsigned long)current;
2707 timer.function = nilfs_construction_timeout;
2708 sci->sc_timer = &timer;
2709
2710 /* start sync. */
2711 sci->sc_task = current;
2712 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2713 printk(KERN_INFO
2714 "segctord starting. Construction interval = %lu seconds, "
2715 "CP frequency < %lu seconds\n",
2716 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2717
2718 spin_lock(&sci->sc_state_lock);
2719 loop:
2720 for (;;) {
2721 int mode;
2722
2723 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2724 goto end_thread;
2725
2726 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2727 mode = SC_LSEG_SR;
2728 else if (!sci->sc_flush_request)
2729 break;
2730 else
2731 mode = nilfs_segctor_flush_mode(sci);
2732
2733 spin_unlock(&sci->sc_state_lock);
2734 nilfs_segctor_thread_construct(sci, mode);
2735 spin_lock(&sci->sc_state_lock);
2736 timeout = 0;
2737 }
2738
2739
2740 if (freezing(current)) {
2741 spin_unlock(&sci->sc_state_lock);
2742 refrigerator();
2743 spin_lock(&sci->sc_state_lock);
2744 } else {
2745 DEFINE_WAIT(wait);
2746 int should_sleep = 1;
2747
2748 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2749 TASK_INTERRUPTIBLE);
2750
2751 if (sci->sc_seq_request != sci->sc_seq_done)
2752 should_sleep = 0;
2753 else if (sci->sc_flush_request)
2754 should_sleep = 0;
2755 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2756 should_sleep = time_before(jiffies,
2757 sci->sc_timer->expires);
2758
2759 if (should_sleep) {
2760 spin_unlock(&sci->sc_state_lock);
2761 schedule();
2762 spin_lock(&sci->sc_state_lock);
2763 }
2764 finish_wait(&sci->sc_wait_daemon, &wait);
2765 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2766 time_after_eq(jiffies, sci->sc_timer->expires));
2767 }
2768 goto loop;
2769
2770 end_thread:
2771 spin_unlock(&sci->sc_state_lock);
2772 del_timer_sync(sci->sc_timer);
2773 sci->sc_timer = NULL;
2774
2775 /* end sync. */
2776 sci->sc_task = NULL;
2777 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2778 return 0;
2779}
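/*
 * Editorial note: the daemon loop runs under sc_state_lock.  It drains
 * sync requests (sc_seq_request vs. sc_seq_done) and flush bits, then
 * sleeps on sc_wait_daemon until a new request arrives or the
 * construction timer expires.
 */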
2780
2781static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2782{
2783 struct task_struct *t;
2784
2785 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2786 if (IS_ERR(t)) {
2787 int err = PTR_ERR(t);
2788
2789 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2790 err);
2791 return err;
2792 }
2793 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2794 return 0;
2795}
2796
2797static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2798{
2799 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2800
2801 while (sci->sc_task) {
2802 wake_up(&sci->sc_wait_daemon);
2803 spin_unlock(&sci->sc_state_lock);
2804 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2805 spin_lock(&sci->sc_state_lock);
2806 }
2807}
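/*
 * Editorial note: called with sc_state_lock held (see
 * nilfs_segctor_destroy()); the loop above drops and retakes the lock
 * around wait_event().
 */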
2808
2809static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2810{
2811 sci->sc_seq_done = sci->sc_seq_request;
2812
2813 return nilfs_segctor_start_thread(sci);
2814}
2815
2816/*
2817 * Setup & clean-up functions
2818 */
2819static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2820{
2821 struct nilfs_sc_info *sci;
2822
2823 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2824 if (!sci)
2825 return NULL;
2826
2827 sci->sc_sbi = sbi;
2828 sci->sc_super = sbi->s_super;
2829
2830 init_waitqueue_head(&sci->sc_wait_request);
2831 init_waitqueue_head(&sci->sc_wait_daemon);
2832 init_waitqueue_head(&sci->sc_wait_task);
2833 spin_lock_init(&sci->sc_state_lock);
2834 INIT_LIST_HEAD(&sci->sc_dirty_files);
2835 INIT_LIST_HEAD(&sci->sc_segbufs);
2836 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2837 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2838 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2839
2840 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2841 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2842 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2843
2844 if (sbi->s_interval)
2845 sci->sc_interval = sbi->s_interval;
2846 if (sbi->s_watermark)
2847 sci->sc_watermark = sbi->s_watermark;
2848 return sci;
2849}
2850
2851static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2852{
2853 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2854
2855 /* The segctord thread was stopped and its timer was removed,
2856 but some tasks may remain. */
2857 do {
2858 struct nilfs_sb_info *sbi = sci->sc_sbi;
2859 struct nilfs_transaction_info ti;
2860 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2861
2862 nilfs_transaction_lock(sbi, &ti, 0);
2863 nilfs_segctor_accept(sci, &req);
2864 ret = nilfs_segctor_construct(sci, &req);
2865 nilfs_segctor_notify(sci, &req);
2866 nilfs_transaction_unlock(sbi);
2867
2868 } while (ret && retrycount-- > 0);
2869}
2870
2871/**
2872 * nilfs_segctor_destroy - destroy the segment constructor.
2873 * @sci: nilfs_sc_info
2874 *
2875 * nilfs_segctor_destroy() kills the segctord thread and frees
2876 * the nilfs_sc_info struct.
2877 * Caller must hold the segment semaphore.
2878 */
2879static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2880{
2881 struct nilfs_sb_info *sbi = sci->sc_sbi;
2882 int flag;
2883
2884 up_write(&sbi->s_nilfs->ns_segctor_sem);
2885
2886 spin_lock(&sci->sc_state_lock);
2887 nilfs_segctor_kill_thread(sci);
2888 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2889 || sci->sc_seq_request != sci->sc_seq_done);
2890 spin_unlock(&sci->sc_state_lock);
2891
2892 if (flag || nilfs_segctor_confirm(sci))
2893 nilfs_segctor_write_out(sci);
2894
2895 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2896
2897 if (!list_empty(&sci->sc_dirty_files)) {
2898 nilfs_warning(sbi->s_super, __func__,
2899 "dirty file(s) after the final construction\n");
2900 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2901 }
2902
2903 if (!list_empty(&sci->sc_cleaning_segments))
2904 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2905
2906 WARN_ON(!list_empty(&sci->sc_segbufs));
2907
2908 down_write(&sbi->s_nilfs->ns_segctor_sem);
2909
2910 kfree(sci);
2911}
2912
2913/**
2914 * nilfs_attach_segment_constructor - attach a segment constructor
2915 * @sbi: nilfs_sb_info
2916 *
2917 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2918 * initializes it, and starts the segment constructor.
2919 *
2920 * Return Value: On success, 0 is returned. On error, one of the following
2921 * negative error codes is returned.
2922 *
2923 * %-ENOMEM - Insufficient memory available.
2924 */
2925int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2926{
2927 struct the_nilfs *nilfs = sbi->s_nilfs;
2928 int err;
2929
2930 /* Each field of nilfs_segctor is cleared through the initialization
2931 of the super-block info */
2932 sbi->s_sc_info = nilfs_segctor_new(sbi);
2933 if (!sbi->s_sc_info)
2934 return -ENOMEM;
2935
2936 nilfs_attach_writer(nilfs, sbi);
2937 err = nilfs_segctor_init(NILFS_SC(sbi));
2938 if (err) {
2939 nilfs_detach_writer(nilfs, sbi);
2940 kfree(sbi->s_sc_info);
2941 sbi->s_sc_info = NULL;
2942 }
2943 return err;
2944}
2945
2946/**
2947 * nilfs_detach_segment_constructor - destroy the segment constructor
2948 * @sbi: nilfs_sb_info
2949 *
2950 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2951 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2952 */
2953void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2954{
2955 struct the_nilfs *nilfs = sbi->s_nilfs;
2956 LIST_HEAD(garbage_list);
2957
2958 down_write(&nilfs->ns_segctor_sem);
2959 if (NILFS_SC(sbi)) {
2960 nilfs_segctor_destroy(NILFS_SC(sbi));
2961 sbi->s_sc_info = NULL;
2962 }
2963
2964 /* Force to free the list of dirty files */
2965 spin_lock(&sbi->s_inode_lock);
2966 if (!list_empty(&sbi->s_dirty_files)) {
2967 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2968 nilfs_warning(sbi->s_super, __func__,
2969 "Non empty dirty list after the last "
2970 "segment construction\n");
2971 }
2972 spin_unlock(&sbi->s_inode_lock);
2973 up_write(&nilfs->ns_segctor_sem);
2974
2975 nilfs_dispose_list(sbi, &garbage_list, 1);
2976 nilfs_detach_writer(nilfs, sbi);
2977}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..a98fc1ed0bbb
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,243 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer on the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
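/*
 * For reference: nilfs_segctor_new() converts the second-based defaults
 * above into jiffies, i.e. sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT
 * and sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ.
 */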
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, void __user *);
226
227extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
228 __u64 *, size_t);
229extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
230
231extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
232extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
233
234/* recovery.c */
235extern int nilfs_read_super_root_block(struct super_block *, sector_t,
236 struct buffer_head **, int);
237extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
238 struct nilfs_recovery_info *);
239extern int nilfs_recover_logical_segments(struct the_nilfs *,
240 struct nilfs_sb_info *,
241 struct nilfs_recovery_info *);
242
243#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..c774cf397e2f
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,640 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
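/*
 * Editorial note: do_div() divides its 64-bit argument in place and
 * returns the remainder, so get_blkoff() yields the block index while
 * get_offset() yields the entry offset within that block.
 */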
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96/**
97 * nilfs_sufile_alloc - allocate a segment
98 * @sufile: inode of segment usage file
99 * @segnump: pointer to segment number
100 *
101 * Description: nilfs_sufile_alloc() allocates a clean segment.
102 *
103 * Return Value: On success, 0 is returned and the segment number of the
104 * allocated segment is stored in the place pointed by @segnump. On error, one
105 * of the following negative error codes is returned.
106 *
107 * %-EIO - I/O error.
108 *
109 * %-ENOMEM - Insufficient amount of memory available.
110 *
111 * %-ENOSPC - No clean segment left.
112 */
113int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
114{
115 struct buffer_head *header_bh, *su_bh;
116 struct the_nilfs *nilfs;
117 struct nilfs_sufile_header *header;
118 struct nilfs_segment_usage *su;
119 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
120 __u64 segnum, maxsegnum, last_alloc;
121 void *kaddr;
122 unsigned long nsegments, ncleansegs, nsus;
123 int ret, i, j;
124
125 down_write(&NILFS_MDT(sufile)->mi_sem);
126
127 nilfs = NILFS_MDT(sufile)->mi_nilfs;
128
129 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
130 if (ret < 0)
131 goto out_sem;
132 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
133 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
134 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
135 last_alloc = le64_to_cpu(header->sh_last_alloc);
136 kunmap_atomic(kaddr, KM_USER0);
137
138 nsegments = nilfs_sufile_get_nsegments(sufile);
139 segnum = last_alloc + 1;
140 maxsegnum = nsegments - 1;
141 for (i = 0; i < nsegments; i += nsus) {
142 if (segnum >= nsegments) {
143 /* wrap around */
144 segnum = 0;
145 maxsegnum = last_alloc;
146 }
147 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
148 &su_bh);
149 if (ret < 0)
150 goto out_header;
151 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
152 su = nilfs_sufile_block_get_segment_usage(
153 sufile, segnum, su_bh, kaddr);
154
155 nsus = nilfs_sufile_segment_usages_in_block(
156 sufile, segnum, maxsegnum);
157 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
158 if (!nilfs_segment_usage_clean(su))
159 continue;
160 /* found a clean segment */
161 nilfs_segment_usage_set_dirty(su);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
165 header = nilfs_sufile_block_get_header(
166 sufile, header_bh, kaddr);
167 le64_add_cpu(&header->sh_ncleansegs, -1);
168 le64_add_cpu(&header->sh_ndirtysegs, 1);
169 header->sh_last_alloc = cpu_to_le64(segnum);
170 kunmap_atomic(kaddr, KM_USER0);
171
172 nilfs_mdt_mark_buffer_dirty(header_bh);
173 nilfs_mdt_mark_buffer_dirty(su_bh);
174 nilfs_mdt_mark_dirty(sufile);
175 brelse(su_bh);
176 *segnump = segnum;
177 goto out_header;
178 }
179
180 kunmap_atomic(kaddr, KM_USER0);
181 brelse(su_bh);
182 }
183
184 /* no segments left */
185 ret = -ENOSPC;
186
187 out_header:
188 brelse(header_bh);
189
190 out_sem:
191 up_write(&NILFS_MDT(sufile)->mi_sem);
192 return ret;
193}
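/*
 * Editorial note: the scan above starts just after sh_last_alloc and
 * wraps around the segment space once, so every segment usage entry is
 * examined at most once before -ENOSPC is returned.
 */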
194
195/**
196 * nilfs_sufile_cancel_free - cancel freeing of a segment
197 * @sufile: inode of segment usage file
198 * @segnum: segment number
199 *
200 * Description: nilfs_sufile_cancel_free() cancels the freeing of the
201 * segment specified by @segnum, marking its usage dirty again.
202 * Return Value: On success, 0 is returned. On error, one of the following
203 * negative error codes is returned.
204 *
205 * %-EIO - I/O error.
206 *
207 * %-ENOMEM - Insufficient amount of memory available.
208 */
209int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
210{
211 struct buffer_head *header_bh, *su_bh;
212 struct the_nilfs *nilfs;
213 struct nilfs_sufile_header *header;
214 struct nilfs_segment_usage *su;
215 void *kaddr;
216 int ret;
217
218 down_write(&NILFS_MDT(sufile)->mi_sem);
219
220 nilfs = NILFS_MDT(sufile)->mi_nilfs;
221
222 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
223 if (ret < 0)
224 goto out_sem;
225
226 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
227 if (ret < 0)
228 goto out_header;
229
230 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
231 su = nilfs_sufile_block_get_segment_usage(
232 sufile, segnum, su_bh, kaddr);
233 if (unlikely(!nilfs_segment_usage_clean(su))) {
234 printk(KERN_WARNING "%s: segment %llu must be clean\n",
235 __func__, (unsigned long long)segnum);
236 kunmap_atomic(kaddr, KM_USER0);
237 goto out_su_bh;
238 }
239 nilfs_segment_usage_set_dirty(su);
240 kunmap_atomic(kaddr, KM_USER0);
241
242 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
243 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
244 le64_add_cpu(&header->sh_ncleansegs, -1);
245 le64_add_cpu(&header->sh_ndirtysegs, 1);
246 kunmap_atomic(kaddr, KM_USER0);
247
248 nilfs_mdt_mark_buffer_dirty(header_bh);
249 nilfs_mdt_mark_buffer_dirty(su_bh);
250 nilfs_mdt_mark_dirty(sufile);
251
252 out_su_bh:
253 brelse(su_bh);
254 out_header:
255 brelse(header_bh);
256 out_sem:
257 up_write(&NILFS_MDT(sufile)->mi_sem);
258 return ret;
259}
260
261/**
262 * nilfs_sufile_freev - free segments
263 * @sufile: inode of segment usage file
264 * @segnum: array of segment numbers
265 * @nsegs: number of segments
266 *
267 * Description: nilfs_sufile_freev() frees segments specified by @segnum and
268 * @nsegs, which must have been returned by a previous call to
269 * nilfs_sufile_alloc().
270 *
271 * Return Value: On success, 0 is returned. On error, one of the following
272 * negative error codes is returned.
273 *
274 * %-EIO - I/O error.
275 *
276 * %-ENOMEM - Insufficient amount of memory available.
277 */
278#define NILFS_SUFILE_FREEV_PREALLOC 16
279int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs)
280{
281 struct buffer_head *header_bh, **su_bh,
282 *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC];
283 struct the_nilfs *nilfs;
284 struct nilfs_sufile_header *header;
285 struct nilfs_segment_usage *su;
286 void *kaddr;
287 int ret, i;
288
289 down_write(&NILFS_MDT(sufile)->mi_sem);
290
291 nilfs = NILFS_MDT(sufile)->mi_nilfs;
292
293 /* prepare resources */
294 if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC)
295 su_bh = su_bh_prealloc;
296 else {
297 su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS);
298 if (su_bh == NULL) {
299 ret = -ENOMEM;
300 goto out_sem;
301 }
302 }
303
304 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
305 if (ret < 0)
306 goto out_su_bh;
307 for (i = 0; i < nsegs; i++) {
308 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i],
309 0, &su_bh[i]);
310 if (ret < 0)
311 goto out_bh;
312 }
313
314 /* free segments */
315 for (i = 0; i < nsegs; i++) {
316 kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0);
317 su = nilfs_sufile_block_get_segment_usage(
318 sufile, segnum[i], su_bh[i], kaddr);
319 WARN_ON(nilfs_segment_usage_error(su));
320 nilfs_segment_usage_set_clean(su);
321 kunmap_atomic(kaddr, KM_USER0);
322 nilfs_mdt_mark_buffer_dirty(su_bh[i]);
323 }
324 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
325 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
326 le64_add_cpu(&header->sh_ncleansegs, nsegs);
327 le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs);
328 kunmap_atomic(kaddr, KM_USER0);
329 nilfs_mdt_mark_buffer_dirty(header_bh);
330 nilfs_mdt_mark_dirty(sufile);
331
332 out_bh:
333 for (i--; i >= 0; i--)
334 brelse(su_bh[i]);
335 brelse(header_bh);
336
337 out_su_bh:
338 if (su_bh != su_bh_prealloc)
339 kfree(su_bh);
340
341 out_sem:
342 up_write(&NILFS_MDT(sufile)->mi_sem);
343 return ret;
344}
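/*
 * Editorial note on the unwinding convention: on the error path, i
 * indexes the buffer head that failed, so the loop at out_bh releases
 * exactly the references taken so far (all of them on success, where
 * i == nsegs).
 */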
345
346/**
347 * nilfs_sufile_free - free a segment
348 * @sufile: inode of segment usage file
349 * @segnum: segment number to be freed
350 */
351int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
352{
353 return nilfs_sufile_freev(sufile, &segnum, 1);
354}
355
356/**
357 * nilfs_sufile_get_segment_usage - get a segment usage
358 * @sufile: inode of segment usage file
359 * @segnum: segment number
360 * @sup: pointer to segment usage
361 * @bhp: pointer to buffer head
362 *
363 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
364 * specified by @segnum.
365 *
366 * Return Value: On success, 0 is returned, and the segment usage and the
367 * buffer head of the buffer on which the segment usage is located are stored
368 * in the place pointed by @sup and @bhp, respectively. On error, one of the
369 * following negative error codes is returned.
370 *
371 * %-EIO - I/O error.
372 *
373 * %-ENOMEM - Insufficient amount of memory available.
374 *
375 * %-EINVAL - Invalid segment usage number.
376 */
377int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
378 struct nilfs_segment_usage **sup,
379 struct buffer_head **bhp)
380{
381 struct buffer_head *bh;
382 struct nilfs_segment_usage *su;
383 void *kaddr;
384 int ret;
385
386 /* segnum is 0 origin */
387 if (segnum >= nilfs_sufile_get_nsegments(sufile))
388 return -EINVAL;
389 down_write(&NILFS_MDT(sufile)->mi_sem);
390 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
391 if (ret < 0)
392 goto out_sem;
393 kaddr = kmap(bh->b_page);
394 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
395 if (nilfs_segment_usage_error(su)) {
396 kunmap(bh->b_page);
397 brelse(bh);
398 ret = -EINVAL;
399 goto out_sem;
400 }
401
402 if (sup != NULL)
403 *sup = su;
404 *bhp = bh;
405
406 out_sem:
407 up_write(&NILFS_MDT(sufile)->mi_sem);
408 return ret;
409}
410
411/**
412 * nilfs_sufile_put_segment_usage - put a segment usage
413 * @sufile: inode of segment usage file
414 * @segnum: segment number
415 * @bh: buffer head
416 *
417 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
418 * specified by @segnum. @bh must be the buffer head which has been returned
419 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
420 */
421void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
422 struct buffer_head *bh)
423{
424 kunmap(bh->b_page);
425 brelse(bh);
426}
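/*
 * Editorial note: this get/put pair uses kmap() rather than
 * kmap_atomic(), so callers may sleep while holding the mapping;
 * put_segment_usage() both unmaps the page and drops the buffer
 * reference taken by get_segment_usage().
 */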
427
428/**
429 * nilfs_sufile_get_stat - get segment usage statistics
430 * @sufile: inode of segment usage file
431 * @stat: pointer to a structure of segment usage statistics
432 *
433 * Description: nilfs_sufile_get_stat() returns information about segment
434 * usage.
435 *
436 * Return Value: On success, 0 is returned, and segment usage information is
437 * stored in the place pointed by @stat. On error, one of the following
438 * negative error codes is returned.
439 *
440 * %-EIO - I/O error.
441 *
442 * %-ENOMEM - Insufficient amount of memory available.
443 */
444int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
445{
446 struct buffer_head *header_bh;
447 struct nilfs_sufile_header *header;
448 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
449 void *kaddr;
450 int ret;
451
452 down_read(&NILFS_MDT(sufile)->mi_sem);
453
454 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
455 if (ret < 0)
456 goto out_sem;
457
458 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
459 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
460 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
461 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
462 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
463 sustat->ss_ctime = nilfs->ns_ctime;
464 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
465 spin_lock(&nilfs->ns_last_segment_lock);
466 sustat->ss_prot_seq = nilfs->ns_prot_seq;
467 spin_unlock(&nilfs->ns_last_segment_lock);
468 kunmap_atomic(kaddr, KM_USER0);
469 brelse(header_bh);
470
471 out_sem:
472 up_read(&NILFS_MDT(sufile)->mi_sem);
473 return ret;
474}
475
476/**
477 * nilfs_sufile_get_ncleansegs - get the number of clean segments
478 * @sufile: inode of segment usage file
479 * @nsegsp: pointer to the number of clean segments
480 *
481 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
482 * segments.
483 *
484 * Return Value: On success, 0 is returned and the number of clean segments is
485 * stored in the place pointed by @nsegsp. On error, one of the following
486 * negative error codes is returned.
487 *
488 * %-EIO - I/O error.
489 *
490 * %-ENOMEM - Insufficient amount of memory available.
491 */
492int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
493{
494 struct nilfs_sustat sustat;
495 int ret;
496
497 ret = nilfs_sufile_get_stat(sufile, &sustat);
498 if (ret == 0)
499 *nsegsp = sustat.ss_ncleansegs;
500 return ret;
501}
502
503/**
504 * nilfs_sufile_set_error - mark a segment as erroneous
505 * @sufile: inode of segment usage file
506 * @segnum: segment number
507 *
508 * Description: nilfs_sufile_set_error() marks the segment specified by
509 * @segnum as erroneous. The error segment will never be used again.
510 *
511 * Return Value: On success, 0 is returned. On error, one of the following
512 * negative error codes is returned.
513 *
514 * %-EIO - I/O error.
515 *
516 * %-ENOMEM - Insufficient amount of memory available.
517 *
518 * %-EINVAL - Invalid segment usage number.
519 */
520int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
521{
522 struct buffer_head *header_bh, *su_bh;
523 struct nilfs_segment_usage *su;
524 struct nilfs_sufile_header *header;
525 void *kaddr;
526 int ret;
527
528 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
529 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
530 __func__, (unsigned long long)segnum);
531 return -EINVAL;
532 }
533 down_write(&NILFS_MDT(sufile)->mi_sem);
534
535 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
536 if (ret < 0)
537 goto out_sem;
538 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
539 if (ret < 0)
540 goto out_header;
541
542 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
543 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
544 if (nilfs_segment_usage_error(su)) {
545 kunmap_atomic(kaddr, KM_USER0);
546 brelse(su_bh);
547 goto out_header;
548 }
549
550 nilfs_segment_usage_set_error(su);
551 kunmap_atomic(kaddr, KM_USER0);
552 brelse(su_bh);
553
554 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
555 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
556 le64_add_cpu(&header->sh_ndirtysegs, -1);
557 kunmap_atomic(kaddr, KM_USER0);
558 nilfs_mdt_mark_buffer_dirty(header_bh);
559 nilfs_mdt_mark_buffer_dirty(su_bh);
560 nilfs_mdt_mark_dirty(sufile);
561 brelse(su_bh);
562
563 out_header:
564 brelse(header_bh);
565
566 out_sem:
567 up_write(&NILFS_MDT(sufile)->mi_sem);
568 return ret;
569}
570
571/**
572 * nilfs_sufile_get_suinfo - get segment usage information
573 * @sufile: inode of segment usage file
574 * @segnum: segment number to start looking
575 * @si: array of suinfo
576 * @nsi: size of suinfo array
577 *
578 * Description: nilfs_sufile_get_suinfo() fetches usage information on up
579 * to @nsi segments starting at @segnum and stores it in the @si array.
580 * Return Value: On success, the number of entries stored in @si is
581 * returned. On error, one of the following negative error codes is returned.
582 *
583 * %-EIO - I/O error.
584 *
585 * %-ENOMEM - Insufficient amount of memory available.
586 */
587ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
588 struct nilfs_suinfo *si, size_t nsi)
589{
590 struct buffer_head *su_bh;
591 struct nilfs_segment_usage *su;
592 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
593 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
594 void *kaddr;
595 unsigned long nsegs, segusages_per_block;
596 ssize_t n;
597 int ret, i, j;
598
599 down_read(&NILFS_MDT(sufile)->mi_sem);
600
601 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
602 nsegs = min_t(unsigned long,
603 nilfs_sufile_get_nsegments(sufile) - segnum,
604 nsi);
605 for (i = 0; i < nsegs; i += n, segnum += n) {
606 n = min_t(unsigned long,
607 segusages_per_block -
608 nilfs_sufile_get_offset(sufile, segnum),
609 nsegs - i);
610 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
611 &su_bh);
612 if (ret < 0) {
613 if (ret != -ENOENT)
614 goto out;
615 /* hole */
616 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
617 continue;
618 }
619
620 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
621 su = nilfs_sufile_block_get_segment_usage(
622 sufile, segnum, su_bh, kaddr);
623 for (j = 0; j < n; j++, su = (void *)su + susz) {
624 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
625 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
626 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
627 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
628 if (nilfs_segment_is_active(nilfs, segnum + j))
629 si[i + j].sui_flags |=
630 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
631 }
632 kunmap_atomic(kaddr, KM_USER0);
633 brelse(su_bh);
634 }
635 ret = nsegs;
636
637 out:
638 up_read(&NILFS_MDT(sufile)->mi_sem);
639 return ret;
640}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..d595f33a768d
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,54 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_cancel_free(struct inode *, __u64);
40int nilfs_sufile_freev(struct inode *, __u64 *, size_t);
41int nilfs_sufile_free(struct inode *, __u64);
42int nilfs_sufile_get_segment_usage(struct inode *, __u64,
43 struct nilfs_segment_usage **,
44 struct buffer_head **);
45void nilfs_sufile_put_segment_usage(struct inode *, __u64,
46 struct buffer_head *);
47int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
48int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
49int nilfs_sufile_set_error(struct inode *, __u64);
50ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
51 size_t);
52
53
54#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..e117e1ea9bff
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1323 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_VERSION(NILFS_VERSION);
67MODULE_LICENSE("GPL");
68
69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70static int test_exclusive_mount(struct file_system_type *fs_type,
71 struct block_device *bdev, int flags);
72
73/**
74 * nilfs_error() - report failure condition on a filesystem
75 *
76 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
77 * reporting an error message. It should be called when NILFS detects
78 * inconsistencies or defects in metadata on disk. For recoverable
79 * errors such as a one-off I/O error, nilfs_warning() or the printk()
80 * function should be used instead.
81 *
82 * The segment constructor must not call this function because it can
83 * kill itself.
84 */
85void nilfs_error(struct super_block *sb, const char *function,
86 const char *fmt, ...)
87{
88 struct nilfs_sb_info *sbi = NILFS_SB(sb);
89 va_list args;
90
91 va_start(args, fmt);
92 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
93 vprintk(fmt, args);
94 printk("\n");
95 va_end(args);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 struct the_nilfs *nilfs = sbi->s_nilfs;
99
100 if (!nilfs_test_opt(sbi, ERRORS_CONT))
101 nilfs_detach_segment_constructor(sbi);
102
103 down_write(&nilfs->ns_sem);
104 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
105 nilfs->ns_mount_state |= NILFS_ERROR_FS;
106 nilfs->ns_sbp[0]->s_state |=
107 cpu_to_le16(NILFS_ERROR_FS);
108 nilfs_commit_super(sbi, 1);
109 }
110 up_write(&nilfs->ns_sem);
111
112 if (nilfs_test_opt(sbi, ERRORS_RO)) {
113 printk(KERN_CRIT "Remounting filesystem read-only\n");
114 sb->s_flags |= MS_RDONLY;
115 }
116 }
117
118 if (nilfs_test_opt(sbi, ERRORS_PANIC))
119 panic("NILFS (device %s): panic forced after error\n",
120 sb->s_id);
121}
122
123void nilfs_warning(struct super_block *sb, const char *function,
124 const char *fmt, ...)
125{
126 va_list args;
127
128 va_start(args, fmt);
129 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
130 sb->s_id, function);
131 vprintk(fmt, args);
132 printk("\n");
133 va_end(args);
134}
135
136static struct kmem_cache *nilfs_inode_cachep;
137
138struct inode *nilfs_alloc_inode(struct super_block *sb)
139{
140 struct nilfs_inode_info *ii;
141
142 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
143 if (!ii)
144 return NULL;
145 ii->i_bh = NULL;
146 ii->i_state = 0;
147 ii->vfs_inode.i_version = 1;
148 nilfs_btnode_cache_init(&ii->i_btnode_cache);
149 return &ii->vfs_inode;
150}
151
152void nilfs_destroy_inode(struct inode *inode)
153{
154 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
155}
156
157static void init_once(void *obj)
158{
159 struct nilfs_inode_info *ii = obj;
160
161 INIT_LIST_HEAD(&ii->i_dirty);
162#ifdef CONFIG_NILFS_XATTR
163 init_rwsem(&ii->xattr_sem);
164#endif
165 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
166 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
167 inode_init_once(&ii->vfs_inode);
168}
169
170static int nilfs_init_inode_cache(void)
171{
172 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
173 sizeof(struct nilfs_inode_info),
174 0, SLAB_RECLAIM_ACCOUNT,
175 init_once);
176
177 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
178}
179
180static inline void nilfs_destroy_inode_cache(void)
181{
182 kmem_cache_destroy(nilfs_inode_cachep);
183}
184
185static void nilfs_clear_inode(struct inode *inode)
186{
187 struct nilfs_inode_info *ii = NILFS_I(inode);
188
189#ifdef CONFIG_NILFS_POSIX_ACL
190 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
191 posix_acl_release(ii->i_acl);
192 ii->i_acl = NILFS_ACL_NOT_CACHED;
193 }
194 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
195 posix_acl_release(ii->i_default_acl);
196 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
197 }
198#endif
199 /*
200 * Free resources allocated in nilfs_read_inode(), here.
201 */
202 BUG_ON(!list_empty(&ii->i_dirty));
203 brelse(ii->i_bh);
204 ii->i_bh = NULL;
205
206 if (test_bit(NILFS_I_BMAP, &ii->i_state))
207 nilfs_bmap_clear(ii->i_bmap);
208
209 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
210}
211
212static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
213{
214 struct the_nilfs *nilfs = sbi->s_nilfs;
215 int err;
216 int barrier_done = 0;
217
218 if (nilfs_test_opt(sbi, BARRIER)) {
219 set_buffer_ordered(nilfs->ns_sbh[0]);
220 barrier_done = 1;
221 }
222 retry:
223 set_buffer_dirty(nilfs->ns_sbh[0]);
224 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
225 if (err == -EOPNOTSUPP && barrier_done) {
226 nilfs_warning(sbi->s_super, __func__,
227 "barrier-based sync failed. "
228 "disabling barriers\n");
229 nilfs_clear_opt(sbi, BARRIER);
230 barrier_done = 0;
231 clear_buffer_ordered(nilfs->ns_sbh[0]);
232 goto retry;
233 }
234 if (unlikely(err)) {
235 printk(KERN_ERR
236 "NILFS: unable to write superblock (err=%d)\n", err);
237 if (err == -EIO && nilfs->ns_sbh[1]) {
238 nilfs_fall_back_super_block(nilfs);
239 goto retry;
240 }
241 } else {
242 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
243
244 /*
245 * The latest segment becomes traceable from the position
246 * written in the superblock.
247 */
248 clear_nilfs_discontinued(nilfs);
249
250 /* update GC protection for recent segments */
251 if (nilfs->ns_sbh[1]) {
252 sbp = NULL;
253 if (dupsb) {
254 set_buffer_dirty(nilfs->ns_sbh[1]);
255 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
256 sbp = nilfs->ns_sbp[1];
257 }
258 }
259 if (sbp) {
260 spin_lock(&nilfs->ns_last_segment_lock);
261 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
262 spin_unlock(&nilfs->ns_last_segment_lock);
263 }
264 }
265
266 return err;
267}
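/*
 * Editorial note on the failure handling above: -EOPNOTSUPP from a
 * barrier write disables barriers for this mount and retries without
 * them, while -EIO on the primary super block falls back to the
 * secondary copy when one exists.
 */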
268
269int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
270{
271 struct the_nilfs *nilfs = sbi->s_nilfs;
272 struct nilfs_super_block **sbp = nilfs->ns_sbp;
273 sector_t nfreeblocks;
274 time_t t;
275 int err;
276
277 /* nilfs->sem must be locked by the caller. */
278 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
279 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
280 nilfs_swap_super_block(nilfs);
281 else {
282 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
283 sbi->s_super->s_id);
284 return -EIO;
285 }
286 }
287 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
288 if (unlikely(err)) {
289 printk(KERN_ERR "NILFS: failed to count free blocks\n");
290 return err;
291 }
292 spin_lock(&nilfs->ns_last_segment_lock);
293 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
294 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
295 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
296 spin_unlock(&nilfs->ns_last_segment_lock);
297
298 t = get_seconds();
299 nilfs->ns_sbwtime[0] = t;
300 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
301 sbp[0]->s_wtime = cpu_to_le64(t);
302 sbp[0]->s_sum = 0;
303 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
304 (unsigned char *)sbp[0],
305 nilfs->ns_sbsize));
306 if (dupsb && sbp[1]) {
307 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
308 nilfs->ns_sbwtime[1] = t;
309 }
310 sbi->s_super->s_dirt = 0;
311 return nilfs_sync_super(sbi, dupsb);
312}
313
314static void nilfs_put_super(struct super_block *sb)
315{
316 struct nilfs_sb_info *sbi = NILFS_SB(sb);
317 struct the_nilfs *nilfs = sbi->s_nilfs;
318
319 nilfs_detach_segment_constructor(sbi);
320
321 if (!(sb->s_flags & MS_RDONLY)) {
322 down_write(&nilfs->ns_sem);
323 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
324 nilfs_commit_super(sbi, 1);
325 up_write(&nilfs->ns_sem);
326 }
327
328 nilfs_detach_checkpoint(sbi);
329 put_nilfs(sbi->s_nilfs);
330 sbi->s_super = NULL;
331 sb->s_fs_info = NULL;
332 kfree(sbi);
333}
334
335/**
336 * nilfs_write_super - write super block(s) of NILFS
337 * @sb: super_block
338 *
339 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
340 * clears s_dirt. This function is called in the section protected by
341 * lock_super().
342 *
343 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
344 * of the struct the_nilfs. Lock order must be as follows:
345 *
346 * 1. lock_super()
347 * 2. down_write(&nilfs->ns_sem)
348 *
349 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
350 * of the super block (nilfs->ns_sbp[]).
351 *
352 * In most cases, VFS functions call lock_super() before calling these
353 * methods, so we must be careful not to introduce deadlocks when using
354 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
355 *
356 * Note that the order of lock_kernel() and lock_super() depends on the
357 * VFS context. Also note that lock_kernel() may be nested inside its
358 * own protective section, in which case only the outermost call has an effect.
359 */
360static void nilfs_write_super(struct super_block *sb)
361{
362 struct nilfs_sb_info *sbi = NILFS_SB(sb);
363 struct the_nilfs *nilfs = sbi->s_nilfs;
364
365 down_write(&nilfs->ns_sem);
366 if (!(sb->s_flags & MS_RDONLY)) {
367 struct nilfs_super_block **sbp = nilfs->ns_sbp;
368 u64 t = get_seconds();
369 int dupsb;
370
371 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
372 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
373 up_write(&nilfs->ns_sem);
374 return;
375 }
376 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
377 nilfs_commit_super(sbi, dupsb);
378 }
379 sb->s_dirt = 0;
380 up_write(&nilfs->ns_sem);
381}
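/*
 * Editorial note: super block writeback is throttled here -- the
 * primary copy is skipped while it is younger than NILFS_SB_FREQ
 * seconds (unless the log became discontinued), and the secondary copy
 * is refreshed only once it is older than NILFS_ALTSB_FREQ seconds.
 */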
382
383static int nilfs_sync_fs(struct super_block *sb, int wait)
384{
385 int err = 0;
386
387 /* This function is called when super block should be written back */
388 if (wait)
389 err = nilfs_construct_segment(sb);
390 return err;
391}
392
393int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
394{
395 struct the_nilfs *nilfs = sbi->s_nilfs;
396 struct nilfs_checkpoint *raw_cp;
397 struct buffer_head *bh_cp;
398 int err;
399
400 down_write(&nilfs->ns_sem);
401 list_add(&sbi->s_list, &nilfs->ns_supers);
402 up_write(&nilfs->ns_sem);
403
404 sbi->s_ifile = nilfs_mdt_new(
405 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
406 if (!sbi->s_ifile)
407 return -ENOMEM;
408
409 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
410 if (unlikely(err))
411 goto failed;
412
413 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
414 &bh_cp);
415 if (unlikely(err)) {
416 if (err == -ENOENT || err == -EINVAL) {
417 printk(KERN_ERR
418 "NILFS: Invalid checkpoint "
419 "(checkpoint number=%llu)\n",
420 (unsigned long long)cno);
421 err = -EINVAL;
422 }
423 goto failed;
424 }
425 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
426 if (unlikely(err))
427 goto failed_bh;
428 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
429 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
430
431 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
432 return 0;
433
434 failed_bh:
435 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
436 failed:
437 nilfs_mdt_destroy(sbi->s_ifile);
438 sbi->s_ifile = NULL;
439
440 down_write(&nilfs->ns_sem);
441 list_del_init(&sbi->s_list);
442 up_write(&nilfs->ns_sem);
443
444 return err;
445}
446
447void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
448{
449 struct the_nilfs *nilfs = sbi->s_nilfs;
450
451 nilfs_mdt_clear(sbi->s_ifile);
452 nilfs_mdt_destroy(sbi->s_ifile);
453 sbi->s_ifile = NULL;
454 down_write(&nilfs->ns_sem);
455 list_del_init(&sbi->s_list);
456 up_write(&nilfs->ns_sem);
457}
458
459static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
460{
461 struct the_nilfs *nilfs = sbi->s_nilfs;
462 int err = 0;
463
464 down_write(&nilfs->ns_sem);
465 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
466 nilfs->ns_mount_state |= NILFS_VALID_FS;
467 err = nilfs_commit_super(sbi, 1);
468 if (likely(!err))
469 printk(KERN_INFO "NILFS: recovery complete.\n");
470 }
471 up_write(&nilfs->ns_sem);
472 return err;
473}
474
475static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
476{
477 struct super_block *sb = dentry->d_sb;
478 struct nilfs_sb_info *sbi = NILFS_SB(sb);
479 unsigned long long blocks;
480 unsigned long overhead;
481 unsigned long nrsvblocks;
482 sector_t nfreeblocks;
483 struct the_nilfs *nilfs = sbi->s_nilfs;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489 * The blocks before the first segment and after the last segment
490 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499 * When metadata blocks are distributed outside the segment
500 * structure, we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 return 0;
518}
519
520static struct super_operations nilfs_sops = {
521 .alloc_inode = nilfs_alloc_inode,
522 .destroy_inode = nilfs_destroy_inode,
523 .dirty_inode = nilfs_dirty_inode,
524 /* .write_inode = nilfs_write_inode, */
525 /* .put_inode = nilfs_put_inode, */
526 /* .drop_inode = nilfs_drop_inode, */
527 .delete_inode = nilfs_delete_inode,
528 .put_super = nilfs_put_super,
529 .write_super = nilfs_write_super,
530 .sync_fs = nilfs_sync_fs,
531 /* .write_super_lockfs */
532 /* .unlockfs */
533 .statfs = nilfs_statfs,
534 .remount_fs = nilfs_remount,
535 .clear_inode = nilfs_clear_inode,
536 /* .umount_begin */
537 /* .show_options */
538};
539
540static struct inode *
541nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
542{
543 struct inode *inode;
544
545 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
546 ino != NILFS_SKETCH_INO)
547 return ERR_PTR(-ESTALE);
548
549 inode = nilfs_iget(sb, ino);
550 if (IS_ERR(inode))
551 return ERR_CAST(inode);
552 if (generation && inode->i_generation != generation) {
553 iput(inode);
554 return ERR_PTR(-ESTALE);
555 }
556
557 return inode;
558}
559
560static struct dentry *
561nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
562 int fh_type)
563{
564 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
565 nilfs_nfs_get_inode);
566}
567
568static struct dentry *
569nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct export_operations nilfs_export_ops = {
577 .fh_to_dentry = nilfs_fh_to_dentry,
578 .fh_to_parent = nilfs_fh_to_parent,
579 .get_parent = nilfs_get_parent,
580};
581
582enum {
583 Opt_err_cont, Opt_err_panic, Opt_err_ro,
584 Opt_barrier, Opt_snapshot, Opt_order,
585 Opt_err,
586};
587
588static match_table_t tokens = {
589 {Opt_err_cont, "errors=continue"},
590 {Opt_err_panic, "errors=panic"},
591 {Opt_err_ro, "errors=remount-ro"},
592 {Opt_barrier, "barrier=%s"},
593 {Opt_snapshot, "cp=%u"},
594 {Opt_order, "order=%s"},
595 {Opt_err, NULL}
596};
597
598static int match_bool(substring_t *s, int *result)
599{
600 int len = s->to - s->from;
601
602 if (strncmp(s->from, "on", len) == 0)
603 *result = 1;
604 else if (strncmp(s->from, "off", len) == 0)
605 *result = 0;
606 else
607 return 1;
608 return 0;
609}
610
611static int parse_options(char *options, struct super_block *sb)
612{
613 struct nilfs_sb_info *sbi = NILFS_SB(sb);
614 char *p;
615 substring_t args[MAX_OPT_ARGS];
616 int option;
617
618 if (!options)
619 return 1;
620
621 while ((p = strsep(&options, ",")) != NULL) {
622 int token;
623 if (!*p)
624 continue;
625
626 token = match_token(p, tokens, args);
627 switch (token) {
628 case Opt_barrier:
629 if (match_bool(&args[0], &option))
630 return 0;
631 if (option)
632 nilfs_set_opt(sbi, BARRIER);
633 else
634 nilfs_clear_opt(sbi, BARRIER);
635 break;
636 case Opt_order:
637 if (strcmp(args[0].from, "relaxed") == 0)
638 /* Ordered data semantics */
639 nilfs_clear_opt(sbi, STRICT_ORDER);
640 else if (strcmp(args[0].from, "strict") == 0)
641 /* Strict in-order semantics */
642 nilfs_set_opt(sbi, STRICT_ORDER);
643 else
644 return 0;
645 break;
646 case Opt_err_panic:
647 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
648 break;
649 case Opt_err_ro:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
651 break;
652 case Opt_err_cont:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
654 break;
655 case Opt_snapshot:
656 if (match_int(&args[0], &option) || option <= 0)
657 return 0;
658 if (!(sb->s_flags & MS_RDONLY))
659 return 0;
660 sbi->s_snapshot_cno = option;
661 nilfs_set_opt(sbi, SNAPSHOT);
662 break;
663 default:
664 printk(KERN_ERR
665 "NILFS: Unrecognized mount option \"%s\"\n", p);
666 return 0;
667 }
668 }
669 return 1;
670}
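/*
 * Example (hypothetical device and mount point): a snapshot mount such as
 * "mount -t nilfs2 -o ro,cp=17 /dev/sdb1 /mnt" reaches parse_options()
 * with the option string "cp=17"; the generic "ro" option has already been
 * turned into MS_RDONLY by the VFS, which is why the Opt_snapshot case
 * above can insist on (sb->s_flags & MS_RDONLY).
 */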
671
672static inline void
673nilfs_set_default_options(struct nilfs_sb_info *sbi,
674 struct nilfs_super_block *sbp)
675{
676 sbi->s_mount_opt =
677 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
678}
679
680static int nilfs_setup_super(struct nilfs_sb_info *sbi)
681{
682 struct the_nilfs *nilfs = sbi->s_nilfs;
683 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
684 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
685 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
686
687 /* nilfs->sem must be locked by the caller. */
688 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
689 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
690 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
691 printk(KERN_WARNING
692 "NILFS warning: mounting fs with errors\n");
693#if 0
694 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
695 printk(KERN_WARNING
696 "NILFS warning: maximal mount count reached\n");
697#endif
698 }
699 if (!max_mnt_count)
700 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
701
702 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
703 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
704 sbp->s_mtime = cpu_to_le64(get_seconds());
705 return nilfs_commit_super(sbi, 1);
706}
707
708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
709 u64 pos, int blocksize,
710 struct buffer_head **pbh)
711{
712 unsigned long long sb_index = pos;
713 unsigned long offset;
714
715 offset = do_div(sb_index, blocksize);
716 *pbh = sb_bread(sb, sb_index);
717 if (!*pbh)
718 return NULL;
719 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
720}
721
722int nilfs_store_magic_and_option(struct super_block *sb,
723 struct nilfs_super_block *sbp,
724 char *data)
725{
726 struct nilfs_sb_info *sbi = NILFS_SB(sb);
727
728 sb->s_magic = le16_to_cpu(sbp->s_magic);
729
730 /* FS independent flags */
731#ifdef NILFS_ATIME_DISABLE
732 sb->s_flags |= MS_NOATIME;
733#endif
734
735 nilfs_set_default_options(sbi, sbp);
736
737 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
738 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
739 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
740 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
741
742 return !parse_options(data, sb) ? -EINVAL : 0 ;
743}
744
745/**
746 * nilfs_fill_super() - initialize a super block instance
747 * @sb: super_block
748 * @data: mount options
749 * @silent: silent mode flag
750 * @nilfs: the_nilfs struct
751 *
752 * This function is called exclusively with bd_mount_sem held.
753 * So, the recovery process is protected from other simultaneous mounts.
754 */
755static int
756nilfs_fill_super(struct super_block *sb, void *data, int silent,
757 struct the_nilfs *nilfs)
758{
759 struct nilfs_sb_info *sbi;
760 struct inode *root;
761 __u64 cno;
762 int err;
763
764 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
765 if (!sbi)
766 return -ENOMEM;
767
768 sb->s_fs_info = sbi;
769
770 get_nilfs(nilfs);
771 sbi->s_nilfs = nilfs;
772 sbi->s_super = sb;
773
774 err = init_nilfs(nilfs, sbi, (char *)data);
775 if (err)
776 goto failed_sbi;
777
778 spin_lock_init(&sbi->s_inode_lock);
779 INIT_LIST_HEAD(&sbi->s_dirty_files);
780 INIT_LIST_HEAD(&sbi->s_list);
781
782 /*
783 * The following initialization is redundant because the
784 * nilfs_sb_info structure has already been zeroed on allocation.
785 * We keep it anyway to make the intent explicit and to stay
786 * ready for future changes.
787 */
788 get_random_bytes(&sbi->s_next_generation,
789 sizeof(sbi->s_next_generation));
790 spin_lock_init(&sbi->s_next_gen_lock);
791
792 sb->s_op = &nilfs_sops;
793 sb->s_export_op = &nilfs_export_ops;
794 sb->s_root = NULL;
795 sb->s_time_gran = 1;
796
797 if (!nilfs_loaded(nilfs)) {
798 err = load_nilfs(nilfs, sbi);
799 if (err)
800 goto failed_sbi;
801 }
802 cno = nilfs_last_cno(nilfs);
803
804 if (sb->s_flags & MS_RDONLY) {
805 if (nilfs_test_opt(sbi, SNAPSHOT)) {
806 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
807 sbi->s_snapshot_cno);
808 if (err < 0)
809 goto failed_sbi;
810 if (!err) {
811 printk(KERN_ERR
812 "NILFS: The specified checkpoint is "
813 "not a snapshot "
814 "(checkpoint number=%llu).\n",
815 (unsigned long long)sbi->s_snapshot_cno);
816 err = -EINVAL;
817 goto failed_sbi;
818 }
819 cno = sbi->s_snapshot_cno;
820 } else
821 /* Read-only mount */
822 sbi->s_snapshot_cno = cno;
823 }
824
825 err = nilfs_attach_checkpoint(sbi, cno);
826 if (err) {
827 printk(KERN_ERR "NILFS: error loading a checkpoint"
828 " (checkpoint number=%llu).\n", (unsigned long long)cno);
829 goto failed_sbi;
830 }
831
832 if (!(sb->s_flags & MS_RDONLY)) {
833 err = nilfs_attach_segment_constructor(sbi);
834 if (err)
835 goto failed_checkpoint;
836 }
837
838 root = nilfs_iget(sb, NILFS_ROOT_INO);
839 if (IS_ERR(root)) {
840 printk(KERN_ERR "NILFS: get root inode failed\n");
841 err = PTR_ERR(root);
842 goto failed_segctor;
843 }
844 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
845 iput(root);
846 printk(KERN_ERR "NILFS: corrupt root inode.\n");
847 err = -EINVAL;
848 goto failed_segctor;
849 }
850 sb->s_root = d_alloc_root(root);
851 if (!sb->s_root) {
852 iput(root);
853 printk(KERN_ERR "NILFS: get root dentry failed\n");
854 err = -ENOMEM;
855 goto failed_segctor;
856 }
857
858 if (!(sb->s_flags & MS_RDONLY)) {
859 down_write(&nilfs->ns_sem);
860 nilfs_setup_super(sbi);
861 up_write(&nilfs->ns_sem);
862 }
863
864 err = nilfs_mark_recovery_complete(sbi);
865 if (unlikely(err)) {
866 printk(KERN_ERR "NILFS: recovery failed.\n");
867 goto failed_root;
868 }
869
870 return 0;
871
872 failed_root:
873 dput(sb->s_root);
874 sb->s_root = NULL;
875
876 failed_segctor:
877 nilfs_detach_segment_constructor(sbi);
878
879 failed_checkpoint:
880 nilfs_detach_checkpoint(sbi);
881
882 failed_sbi:
883 put_nilfs(nilfs);
884 sb->s_fs_info = NULL;
885 kfree(sbi);
886 return err;
887}
888
889static int nilfs_remount(struct super_block *sb, int *flags, char *data)
890{
891 struct nilfs_sb_info *sbi = NILFS_SB(sb);
892 struct nilfs_super_block *sbp;
893 struct the_nilfs *nilfs = sbi->s_nilfs;
894 unsigned long old_sb_flags;
895 struct nilfs_mount_options old_opts;
896 int err;
897
898 old_sb_flags = sb->s_flags;
899 old_opts.mount_opt = sbi->s_mount_opt;
900 old_opts.snapshot_cno = sbi->s_snapshot_cno;
901
902 if (!parse_options(data, sb)) {
903 err = -EINVAL;
904 goto restore_opts;
905 }
906 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
907
908 if ((*flags & MS_RDONLY) &&
909 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
910 printk(KERN_WARNING "NILFS (device %s): couldn't "
911 "remount to a different snapshot. \n",
912 sb->s_id);
913 err = -EINVAL;
914 goto restore_opts;
915 }
916
917 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
918 goto out;
919 if (*flags & MS_RDONLY) {
920 /* Shutting down the segment constructor */
921 nilfs_detach_segment_constructor(sbi);
922 sb->s_flags |= MS_RDONLY;
923
924 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
925 /* nilfs_set_opt(sbi, SNAPSHOT); */
926
927 /*
928 * Remounting a valid RW partition RDONLY, so set
929 * the RDONLY flag and then mark the partition as valid again.
930 */
931 down_write(&nilfs->ns_sem);
932 sbp = nilfs->ns_sbp[0];
933 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
934 (nilfs->ns_mount_state & NILFS_VALID_FS))
935 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
936 sbp->s_mtime = cpu_to_le64(get_seconds());
937 nilfs_commit_super(sbi, 1);
938 up_write(&nilfs->ns_sem);
939 } else {
940 /*
941 * Mounting a RDONLY partition read-write, so reread and
942 * store the current valid flag. (It may have been changed
943 * by fsck since we originally mounted the partition.)
944 */
945 down(&sb->s_bdev->bd_mount_sem);
946 /* Check existing RW-mount */
947 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
948 printk(KERN_WARNING "NILFS (device %s): couldn't "
949 "remount because a RW-mount exists.\n",
950 sb->s_id);
951 err = -EBUSY;
952 goto rw_remount_failed;
953 }
954 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
955 printk(KERN_WARNING "NILFS (device %s): couldn't "
956 "remount because the current RO-mount is not "
957 "the latest one.\n",
958 sb->s_id);
959 err = -EINVAL;
960 goto rw_remount_failed;
961 }
962 sb->s_flags &= ~MS_RDONLY;
963 nilfs_clear_opt(sbi, SNAPSHOT);
964 sbi->s_snapshot_cno = 0;
965
966 err = nilfs_attach_segment_constructor(sbi);
967 if (err)
968 goto rw_remount_failed;
969
970 down_write(&nilfs->ns_sem);
971 nilfs_setup_super(sbi);
972 up_write(&nilfs->ns_sem);
973
974 up(&sb->s_bdev->bd_mount_sem);
975 }
976 out:
977 return 0;
978
979 rw_remount_failed:
980 up(&sb->s_bdev->bd_mount_sem);
981 restore_opts:
982 sb->s_flags = old_sb_flags;
983 sbi->s_mount_opt = old_opts.mount_opt;
984 sbi->s_snapshot_cno = old_opts.snapshot_cno;
985 return err;
986}
987
988struct nilfs_super_data {
989 struct block_device *bdev;
990 __u64 cno;
991 int flags;
992};
993
994/**
995 * nilfs_identify - pre-read mount options needed to identify mount instance
996 * @data: mount options
997 * @sd: nilfs_super_data
998 */
999static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1000{
1001 char *p, *options = data;
1002 substring_t args[MAX_OPT_ARGS];
1003 int option, token;
1004 int ret = 0;
1005
1006 do {
1007 p = strsep(&options, ",");
1008 if (p != NULL && *p) {
1009 token = match_token(p, tokens, args);
1010 if (token == Opt_snapshot) {
1011 if (!(sd->flags & MS_RDONLY))
1012 ret++;
1013 else {
1014 ret = match_int(&args[0], &option);
1015 if (!ret) {
1016 if (option > 0)
1017 sd->cno = option;
1018 else
1019 ret++;
1020 }
1021 }
1022 }
1023 if (ret)
1024 printk(KERN_ERR
1025 "NILFS: invalid mount option: %s\n", p);
1026 }
1027 if (!options)
1028 break;
1029 BUG_ON(options == data);
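		/* strsep() replaced this ',' with a '\0'; restore it so that
		   parse_options() can re-parse the whole string later */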
1030 *(options - 1) = ',';
1031 } while (!ret);
1032 return ret;
1033}
1034
1035static int nilfs_set_bdev_super(struct super_block *s, void *data)
1036{
1037 struct nilfs_super_data *sd = data;
1038
1039 s->s_bdev = sd->bdev;
1040 s->s_dev = s->s_bdev->bd_dev;
1041 return 0;
1042}
1043
1044static int nilfs_test_bdev_super(struct super_block *s, void *data)
1045{
1046 struct nilfs_super_data *sd = data;
1047
1048 return s->s_bdev == sd->bdev;
1049}
1050
1051static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1052{
1053 struct nilfs_super_data *sd = data;
1054 int ret;
1055
1056 if (s->s_bdev != sd->bdev)
1057 return 0;
1058
1059 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1060 return 1; /* Reuse an old R/W-mode super_block */
1061
1062 if (s->s_flags & sd->flags & MS_RDONLY) {
1063 if (down_read_trylock(&s->s_umount)) {
1064 ret = s->s_root &&
1065 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1066 up_read(&s->s_umount);
1067 /*
1068 * This path is locked with sb_lock by sget().
1069 * So, drop_super() causes deadlock.
1070 */
1071 return ret;
1072 }
1073 }
1074 return 0;
1075}
1076
1077static int
1078nilfs_get_sb(struct file_system_type *fs_type, int flags,
1079 const char *dev_name, void *data, struct vfsmount *mnt)
1080{
1081 struct nilfs_super_data sd;
1082 struct super_block *s, *s2;
1083 struct the_nilfs *nilfs = NULL;
1084 int err, need_to_close = 1;
1085
1086 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1087 if (IS_ERR(sd.bdev))
1088 return PTR_ERR(sd.bdev);
1089
1090 /*
1091 * To look up a mount instance with the sget() VFS routine, NILFS
1092 * needs more information than normal filesystems to identify the
1093 * instance: for snapshot mounts, not only the mount type (ro-mount
1094 * or rw-mount) but also a checkpoint number is required.
1095 * These values are passed to sget() through a nilfs_super_data struct.
1096 */
1097 sd.cno = 0;
1098 sd.flags = flags;
1099 if (nilfs_identify((char *)data, &sd)) {
1100 err = -EINVAL;
1101 goto failed;
1102 }
1103
1104 /*
1105 * once the super is inserted into the list by sget, s_umount
1106 * will protect the lockfs code from trying to start a snapshot
1107 * while we are mounting
1108 */
1109 down(&sd.bdev->bd_mount_sem);
1110 if (!sd.cno &&
1111 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1112 err = (err < 0) ? : -EBUSY;
1113 goto failed_unlock;
1114 }
1115
1116 /*
1117 * Phase-1: search any existent instance and get the_nilfs
1118 */
1119 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1120 if (IS_ERR(s))
1121 goto error_s;
1122
1123 if (!s->s_root) {
1124 err = -ENOMEM;
1125 nilfs = alloc_nilfs(sd.bdev);
1126 if (!nilfs)
1127 goto cancel_new;
1128 } else {
1129 struct nilfs_sb_info *sbi = NILFS_SB(s);
1130
1131 /*
1132 * s_umount protects super_block from unmount process;
1133 * It covers pointers of nilfs_sb_info and the_nilfs.
1134 */
1135 nilfs = sbi->s_nilfs;
1136 get_nilfs(nilfs);
1137 up_write(&s->s_umount);
1138
1139 /*
1140 * Phase-2: search specified snapshot or R/W mode super_block
1141 */
1142 if (!sd.cno)
1143 /* trying to get the latest checkpoint. */
1144 sd.cno = nilfs_last_cno(nilfs);
1145
1146 s2 = sget(fs_type, nilfs_test_bdev_super2,
1147 nilfs_set_bdev_super, &sd);
1148 deactivate_super(s);
1149 /*
1150 * deactivate_super() invokes close_bdev_exclusive() only via
1151 * kill_block_super(). Here, s is an existent mount, so we still
1152 * need one more close_bdev_exclusive() call.
1153 */
1154 s = s2;
1155 if (IS_ERR(s))
1156 goto error_s;
1157 }
1158
1159 if (!s->s_root) {
1160 char b[BDEVNAME_SIZE];
1161
1162 s->s_flags = flags;
1163 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1164 sb_set_blocksize(s, block_size(sd.bdev));
1165
1166 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1167 if (err)
1168 goto cancel_new;
1169
1170 s->s_flags |= MS_ACTIVE;
1171 need_to_close = 0;
1172 } else if (!(s->s_flags & MS_RDONLY)) {
1173 err = -EBUSY;
1174 }
1175
1176 up(&sd.bdev->bd_mount_sem);
1177 put_nilfs(nilfs);
1178 if (need_to_close)
1179 close_bdev_exclusive(sd.bdev, flags);
1180 simple_set_mnt(mnt, s);
1181 return 0;
1182
1183 error_s:
1184 up(&sd.bdev->bd_mount_sem);
1185 if (nilfs)
1186 put_nilfs(nilfs);
1187 close_bdev_exclusive(sd.bdev, flags);
1188 return PTR_ERR(s);
1189
1190 failed_unlock:
1191 up(&sd.bdev->bd_mount_sem);
1192 failed:
1193 close_bdev_exclusive(sd.bdev, flags);
1194
1195 return err;
1196
1197 cancel_new:
1198 /* Abandoning the newly allocated superblock */
1199 up(&sd.bdev->bd_mount_sem);
1200 if (nilfs)
1201 put_nilfs(nilfs);
1202 up_write(&s->s_umount);
1203 deactivate_super(s);
1204 /*
1205 * deactivate_super() invokes close_bdev_exclusive().
1206 * We must finish all post-cleaning before this call;
1207 * put_nilfs() and unlocking bd_mount_sem need the block device.
1208 */
1209 return err;
1210}
1211
1212static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1213{
1214 struct nilfs_super_data *sd = data;
1215 int ret;
1216
1217 if (s->s_bdev != sd->bdev)
1218 return 0;
1219 if (down_read_trylock(&s->s_umount)) {
1220 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1221 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1222 up_read(&s->s_umount);
1223 if (ret)
1224 return 0; /* ignore snapshot mounts */
1225 }
1226 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1227}
1228
1229static int __false_bdev_super(struct super_block *s, void *data)
1230{
1231#if 0 /* XXX: workaround for lock debug. This is not good idea */
1232 up_write(&s->s_umount);
1233#endif
1234 return -EFAULT;
1235}
1236
1237/**
1238 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not
1239 * @fs_type: filesystem type
1240 * @bdev: block device
1241 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1242 * Returns 1 if such a mount exists, 0 if not, or a negative error code.
1243 *
1244 * This function must be called within a section protected by bd_mount_sem.
1245 */
1246static int test_exclusive_mount(struct file_system_type *fs_type,
1247 struct block_device *bdev, int flags)
1248{
1249 struct super_block *s;
1250 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1251
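	/* __false_bdev_super() makes sget() fail with -EFAULT instead of
	   setting up a new superblock, so this sget() call acts as a pure
	   lookup of an existing mount. */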
1252 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1253 if (IS_ERR(s)) {
1254 if (PTR_ERR(s) != -EFAULT)
1255 return PTR_ERR(s);
1256 return 0; /* Not found */
1257 }
1258 up_write(&s->s_umount);
1259 deactivate_super(s);
1260 return 1; /* Found */
1261}
1262
1263struct file_system_type nilfs_fs_type = {
1264 .owner = THIS_MODULE,
1265 .name = "nilfs2",
1266 .get_sb = nilfs_get_sb,
1267 .kill_sb = kill_block_super,
1268 .fs_flags = FS_REQUIRES_DEV,
1269};
1270
1271static int __init init_nilfs_fs(void)
1272{
1273 int err;
1274
1275 err = nilfs_init_inode_cache();
1276 if (err)
1277 goto failed;
1278
1279 err = nilfs_init_transaction_cache();
1280 if (err)
1281 goto failed_inode_cache;
1282
1283 err = nilfs_init_segbuf_cache();
1284 if (err)
1285 goto failed_transaction_cache;
1286
1287 err = nilfs_btree_path_cache_init();
1288 if (err)
1289 goto failed_segbuf_cache;
1290
1291 err = register_filesystem(&nilfs_fs_type);
1292 if (err)
1293 goto failed_btree_path_cache;
1294
1295 return 0;
1296
1297 failed_btree_path_cache:
1298 nilfs_btree_path_cache_destroy();
1299
1300 failed_segbuf_cache:
1301 nilfs_destroy_segbuf_cache();
1302
1303 failed_transaction_cache:
1304 nilfs_destroy_transaction_cache();
1305
1306 failed_inode_cache:
1307 nilfs_destroy_inode_cache();
1308
1309 failed:
1310 return err;
1311}
1312
1313static void __exit exit_nilfs_fs(void)
1314{
1315 nilfs_destroy_segbuf_cache();
1316 nilfs_destroy_transaction_cache();
1317 nilfs_destroy_inode_cache();
1318 nilfs_btree_path_cache_destroy();
1319 unregister_filesystem(&nilfs_fs_type);
1320}
1321
1322module_init(init_nilfs_fs)
1323module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..33400cf0bbe2
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,637 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus no exclusion control is required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 struct buffer_head *bh_sr;
119 struct nilfs_super_root *raw_sr;
120 struct nilfs_super_block **sbp = nilfs->ns_sbp;
121 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
122 unsigned inode_size;
123 int err;
124
125 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
126 if (unlikely(err))
127 return err;
128
129 down_read(&nilfs->ns_sem);
130 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
131 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
132 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
133 up_read(&nilfs->ns_sem);
134
135 inode_size = nilfs->ns_inode_size;
136
137 err = -ENOMEM;
138 nilfs->ns_dat = nilfs_mdt_new(
139 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
140 if (unlikely(!nilfs->ns_dat))
141 goto failed;
142
143 nilfs->ns_gc_dat = nilfs_mdt_new(
144 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
145 if (unlikely(!nilfs->ns_gc_dat))
146 goto failed_dat;
147
148 nilfs->ns_cpfile = nilfs_mdt_new(
149 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
150 if (unlikely(!nilfs->ns_cpfile))
151 goto failed_gc_dat;
152
153 nilfs->ns_sufile = nilfs_mdt_new(
154 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
155 if (unlikely(!nilfs->ns_sufile))
156 goto failed_cpfile;
157
158 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
159 if (unlikely(err))
160 goto failed_sufile;
161
162 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
163 if (unlikely(err))
164 goto failed_sufile;
165
166 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
167 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
168 sizeof(struct nilfs_cpfile_header));
169 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
170 sizeof(struct nilfs_sufile_header));
171
172 err = nilfs_mdt_read_inode_direct(
173 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
174 if (unlikely(err))
175 goto failed_sufile;
176
177 err = nilfs_mdt_read_inode_direct(
178 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
179 if (unlikely(err))
180 goto failed_sufile;
181
182 err = nilfs_mdt_read_inode_direct(
183 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
184 if (unlikely(err))
185 goto failed_sufile;
186
187 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
188 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
189
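	/* Note: the success path also exits through the label below,
	   with err == 0 from the last read above. */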
190 failed:
191 brelse(bh_sr);
192 return err;
193
194 failed_sufile:
195 nilfs_mdt_destroy(nilfs->ns_sufile);
196
197 failed_cpfile:
198 nilfs_mdt_destroy(nilfs->ns_cpfile);
199
200 failed_gc_dat:
201 nilfs_mdt_destroy(nilfs->ns_gc_dat);
202
203 failed_dat:
204 nilfs_mdt_destroy(nilfs->ns_dat);
205 goto failed;
206}
207
208static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
209{
210 memset(ri, 0, sizeof(*ri));
211 INIT_LIST_HEAD(&ri->ri_used_segments);
212}
213
214static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
215{
216 nilfs_dispose_segment_list(&ri->ri_used_segments);
217}
218
219/**
220 * load_nilfs - load and recover the nilfs
221 * @nilfs: the_nilfs structure to be loaded
222 * @sbi: nilfs_sb_info used to recover past segments
223 *
224 * load_nilfs() searches for and loads the latest super root,
225 * attaches the last segment, and does recovery if needed.
226 * The caller must serialize calls for simultaneous mounts.
227 */
228int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
229{
230 struct nilfs_recovery_info ri;
231 unsigned int s_flags = sbi->s_super->s_flags;
232 int really_read_only = bdev_read_only(nilfs->ns_bdev);
233 unsigned valid_fs;
234 int err = 0;
235
236 nilfs_init_recovery_info(&ri);
237
238 down_write(&nilfs->ns_sem);
239 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
240 up_write(&nilfs->ns_sem);
241
242 if (!valid_fs && (s_flags & MS_RDONLY)) {
243 printk(KERN_INFO "NILFS: INFO: recovery "
244 "required for readonly filesystem.\n");
245 if (really_read_only) {
246 printk(KERN_ERR "NILFS: write access "
247 "unavailable, cannot proceed.\n");
248 err = -EROFS;
249 goto failed;
250 }
251 printk(KERN_INFO "NILFS: write access will "
252 "be enabled during recovery.\n");
253 sbi->s_super->s_flags &= ~MS_RDONLY;
254 }
255
256 err = nilfs_search_super_root(nilfs, sbi, &ri);
257 if (unlikely(err)) {
258 printk(KERN_ERR "NILFS: error searching super root.\n");
259 goto failed;
260 }
261
262 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
263 if (unlikely(err)) {
264 printk(KERN_ERR "NILFS: error loading super root.\n");
265 goto failed;
266 }
267
268 if (!valid_fs) {
269 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
270 if (unlikely(err)) {
271 nilfs_mdt_destroy(nilfs->ns_cpfile);
272 nilfs_mdt_destroy(nilfs->ns_sufile);
273 nilfs_mdt_destroy(nilfs->ns_dat);
274 goto failed;
275 }
276 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
277 sbi->s_super->s_dirt = 1;
278 }
279
280 set_nilfs_loaded(nilfs);
281
282 failed:
283 nilfs_clear_recovery_info(&ri);
284 sbi->s_super->s_flags = s_flags;
285 return err;
286}
287
288static unsigned long long nilfs_max_size(unsigned int blkbits)
289{
290 unsigned int max_bits;
291 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
292
293 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
294 if (max_bits < 64)
295 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
296 return res;
297}
298
299static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
300 struct nilfs_super_block *sbp)
301{
302 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
303 printk(KERN_ERR "NILFS: revision mismatch "
304 "(superblock rev.=%d.%d, current rev.=%d.%d). "
305 "Please check the version of mkfs.nilfs.\n",
306 le32_to_cpu(sbp->s_rev_level),
307 le16_to_cpu(sbp->s_minor_rev_level),
308 NILFS_CURRENT_REV, NILFS_MINOR_REV);
309 return -EINVAL;
310 }
311 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
312 if (nilfs->ns_sbsize > BLOCK_SIZE)
313 return -EINVAL;
314
315 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
316 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
317
318 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
319 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
320 printk(KERN_ERR "NILFS: too short segment.\n");
321 return -EINVAL;
322 }
323
324 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
325 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
326 nilfs->ns_r_segments_percentage =
327 le32_to_cpu(sbp->s_r_segments_percentage);
328 nilfs->ns_nrsvsegs =
329 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
330 DIV_ROUND_UP(nilfs->ns_nsegments *
331 nilfs->ns_r_segments_percentage, 100));
332 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
333 return 0;
334}
335
336static int nilfs_valid_sb(struct nilfs_super_block *sbp)
337{
338 static unsigned char sum[4];
339 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
340 size_t bytes;
341 u32 crc;
342
343 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
344 return 0;
345 bytes = le16_to_cpu(sbp->s_bytes);
346 if (bytes > BLOCK_SIZE)
347 return 0;
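	/* Compute the CRC with the stored s_sum field treated as zero:
	   the bytes before s_sum, then four zero bytes (sum[]), then the
	   remainder; this mirrors nilfs_commit_super(), which zeroes s_sum
	   before checksumming. */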
348 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
349 sumoff);
350 crc = crc32_le(crc, sum, 4);
351 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
352 bytes - sumoff - 4);
353 return crc == le32_to_cpu(sbp->s_sum);
354}
355
356static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
357{
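	/* An offset that falls inside the area covered by segments cannot
	   hold the secondary superblock, so treat it as bad. */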
358 return offset < ((le64_to_cpu(sbp->s_nsegments) *
359 le32_to_cpu(sbp->s_blocks_per_segment)) <<
360 (le32_to_cpu(sbp->s_log_block_size) + 10));
361}
362
363static void nilfs_release_super_block(struct the_nilfs *nilfs)
364{
365 int i;
366
367 for (i = 0; i < 2; i++) {
368 if (nilfs->ns_sbp[i]) {
369 brelse(nilfs->ns_sbh[i]);
370 nilfs->ns_sbh[i] = NULL;
371 nilfs->ns_sbp[i] = NULL;
372 }
373 }
374}
375
376void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
377{
378 brelse(nilfs->ns_sbh[0]);
379 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
380 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
381 nilfs->ns_sbh[1] = NULL;
382 nilfs->ns_sbp[1] = NULL;
383}
384
385void nilfs_swap_super_block(struct the_nilfs *nilfs)
386{
387 struct buffer_head *tsbh = nilfs->ns_sbh[0];
388 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
389
390 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
391 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
392 nilfs->ns_sbh[1] = tsbh;
393 nilfs->ns_sbp[1] = tsbp;
394}
395
396static int nilfs_load_super_block(struct the_nilfs *nilfs,
397 struct super_block *sb, int blocksize,
398 struct nilfs_super_block **sbpp)
399{
400 struct nilfs_super_block **sbp = nilfs->ns_sbp;
401 struct buffer_head **sbh = nilfs->ns_sbh;
402 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
403 int valid[2], swp = 0;
404
405 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
406 &sbh[0]);
407 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
408
409 if (!sbp[0]) {
410 if (!sbp[1]) {
411 printk(KERN_ERR "NILFS: unable to read superblock\n");
412 return -EIO;
413 }
414 printk(KERN_WARNING
415 "NILFS warning: unable to read primary superblock\n");
416 } else if (!sbp[1])
417 printk(KERN_WARNING
418 "NILFS warning: unable to read secondary superblock\n");
419
420 valid[0] = nilfs_valid_sb(sbp[0]);
421 valid[1] = nilfs_valid_sb(sbp[1]);
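	/* Prefer the spare superblock when the primary one is invalid or
	   the spare one carries a newer write time. */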
422 swp = valid[1] &&
423 (!valid[0] ||
424 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
425
426 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
427 brelse(sbh[1]);
428 sbh[1] = NULL;
429 sbp[1] = NULL;
430 swp = 0;
431 }
432 if (!valid[swp]) {
433 nilfs_release_super_block(nilfs);
434 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
435 sb->s_id);
436 return -EINVAL;
437 }
438
439 if (swp) {
440 printk(KERN_WARNING "NILFS warning: broken superblock. "
441 "using spare superblock.\n");
442 nilfs_swap_super_block(nilfs);
443 }
444
445 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
446 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
447 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
448 *sbpp = sbp[0];
449 return 0;
450}
451
452/**
453 * init_nilfs - initialize a NILFS instance.
454 * @nilfs: the_nilfs structure
455 * @sbi: nilfs_sb_info
456 * @sb: super block
457 * @data: mount options
458 *
459 * init_nilfs() performs common initialization per block device (e.g.
460 * reading the super block, getting disk layout information, initializing
461 * shared fields in the_nilfs). It takes over part of the jobs
462 * typically done by a fill_super() routine. This division exists
463 * because multiple NILFS instances may be mounted on a device
464 * simultaneously.
465 * For multiple mounts on the same device, only the first mount
466 * invokes these tasks.
467 *
468 * Return Value: On success, 0 is returned. On error, a negative error
469 * code is returned.
470 */
471int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
472{
473 struct super_block *sb = sbi->s_super;
474 struct nilfs_super_block *sbp;
475 struct backing_dev_info *bdi;
476 int blocksize;
477 int err;
478
479 down_write(&nilfs->ns_sem);
480 if (nilfs_init(nilfs)) {
481 /* Load values from existing the_nilfs */
482 sbp = nilfs->ns_sbp[0];
483 err = nilfs_store_magic_and_option(sb, sbp, data);
484 if (err)
485 goto out;
486
487 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
488 if (sb->s_blocksize != blocksize &&
489 !sb_set_blocksize(sb, blocksize)) {
490 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
491 blocksize);
492 err = -EINVAL;
493 }
494 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
495 goto out;
496 }
497
498 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
499 if (!blocksize) {
500 printk(KERN_ERR "NILFS: unable to set blocksize\n");
501 err = -EINVAL;
502 goto out;
503 }
504 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
505 if (err)
506 goto out;
507
508 err = nilfs_store_magic_and_option(sb, sbp, data);
509 if (err)
510 goto failed_sbh;
511
512 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
513 if (sb->s_blocksize != blocksize) {
514 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
515
516 if (blocksize < hw_blocksize) {
517 printk(KERN_ERR
518 "NILFS: blocksize %d too small for device "
519 "(sector-size = %d).\n",
520 blocksize, hw_blocksize);
521 err = -EINVAL;
522 goto failed_sbh;
523 }
524 nilfs_release_super_block(nilfs);
525 sb_set_blocksize(sb, blocksize);
526
527 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
528 if (err)
529 goto out;
530 /* not failed_sbh; sbh is released automatically
531 when reloading fails. */
532 }
533 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
534
535 err = nilfs_store_disk_layout(nilfs, sbp);
536 if (err)
537 goto failed_sbh;
538
539 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
540
541 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
542
543 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
544 if (!bdi)
545 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
546 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
547
548 /* Finding last segment */
549 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
550 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
551 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
552
553 nilfs->ns_seg_seq = nilfs->ns_last_seq;
554 nilfs->ns_segnum =
555 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
556 nilfs->ns_cno = nilfs->ns_last_cno + 1;
557 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
558 printk(KERN_ERR "NILFS: invalid last segment number.\n");
559 err = -EINVAL;
560 goto failed_sbh;
561 }
562 /* Dummy values */
563 nilfs->ns_free_segments_count =
564 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
565
566 /* Initialize gcinode cache */
567 err = nilfs_init_gccache(nilfs);
568 if (err)
569 goto failed_sbh;
570
571 set_nilfs_init(nilfs);
572 err = 0;
573 out:
574 up_write(&nilfs->ns_sem);
575 return err;
576
577 failed_sbh:
578 nilfs_release_super_block(nilfs);
579 goto out;
580}
581
582int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
583{
584 struct inode *dat = nilfs_dat_inode(nilfs);
585 unsigned long ncleansegs;
586 int err;
587
588 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
589 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
590 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
591 if (likely(!err))
592 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
593 return err;
594}
595
596int nilfs_near_disk_full(struct the_nilfs *nilfs)
597{
598 struct inode *sufile = nilfs->ns_sufile;
599 unsigned long ncleansegs, nincsegs;
600 int ret;
601
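	/*
	 * Roughly estimate how many extra segments the dirty blocks will
	 * consume, and report "near full" when the clean segments cannot
	 * cover both that estimate and the reserved segments.
	 */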
602 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
603 if (likely(!ret)) {
604 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
605 nilfs->ns_blocks_per_segment + 1;
606 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
607 ret++;
608 }
609 return ret;
610}
611
612int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
613 int snapshot_mount)
614{
615 struct nilfs_sb_info *sbi;
616 int ret = 0;
617
618 down_read(&nilfs->ns_sem);
619 if (cno == 0 || cno > nilfs->ns_cno)
620 goto out_unlock;
621
622 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
623 if (sbi->s_snapshot_cno == cno &&
624 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
625 /* exclude read-only mounts */
626 ret++;
627 break;
628 }
629 }
630 /* for protecting recent checkpoints */
631 if (cno >= nilfs_last_cno(nilfs))
632 ret++;
633
634 out_unlock:
635 up_read(&nilfs->ns_sem);
636 return ret;
637}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and
38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
114 * The following fields are dedicated to a writable FS-instance.
115 * Except for the period of seeking a checkpoint, code outside the
116 * segment constructor must lock the segment semaphore while
117 * accessing these fields.
118 * Only one writable FS-instance exists during the lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
144 * The following fields are lock-free, except during the period
145 * before the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
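/*
 * For instance, THE_NILFS_FNS(INIT, init) above expands to:
 *
 *	static inline void set_nilfs_init(struct the_nilfs *nilfs)
 *	{
 *		set_bit(THE_NILFS_INIT, &(nilfs)->ns_flags);
 *	}
 *
 * plus the matching clear_nilfs_init() and nilfs_init() test helpers.
 */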
188
189/* Minimum interval of periodical update of superblocks (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
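/*
 * Note on the two helpers above: ns_writer_refcount starts at -1 (see
 * alloc_nilfs()), so the first nilfs_get_writer() raises it to 0, makes
 * atomic_inc_and_test() return true, and takes ns_writer_mutex; the last
 * nilfs_put_writer() drops it back to -1 via atomic_add_negative() and
 * releases the mutex. The mutex is therefore held for as long as at
 * least one referrer is using ns_writer.
 */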
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
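/*
 * Worked example for the segment-geometry helpers above (illustrative
 * values, not taken from a real layout): with ns_blocks_per_segment =
 * 2048 and ns_first_data_block = 64, block 5000 belongs to segment
 * 5000 / 2048 = 2, which spans blocks 4096..6143; segment 0 instead
 * starts at block 64.
 */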
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
 		return PTR_ERR(acl);
 	}
 	if (!acl)
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					  u64 blkno)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct inode *inode,
+					  struct ocfs2_extent_tree *et,
+					  u32 clusters)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct inode *inode,
+				      struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+	return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
+	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
+	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
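ocfs2_dx_root_et_ops plugs the new dx root block into the existing extent-tree abstraction: generic btree code calls through a small table of function pointers and never touches the concrete on-disk structure. The pattern in miniature (standalone, all names invented for illustration):

#include <stdio.h>

struct tree;

struct tree_operations {
	unsigned long long (*get_last_eb_blk)(struct tree *t);
	void (*set_last_eb_blk)(struct tree *t, unsigned long long blkno);
};

struct tree {
	const struct tree_operations *ops;
	unsigned long long last_eb_blk;
};

static unsigned long long demo_get(struct tree *t)
{
	return t->last_eb_blk;
}

static void demo_set(struct tree *t, unsigned long long blkno)
{
	t->last_eb_blk = blkno;
}

/* One ops table per backing structure; generic code only sees t->ops. */
static const struct tree_operations demo_ops = {
	.get_last_eb_blk = demo_get,
	.set_last_eb_blk = demo_set,
};

int main(void)
{
	struct tree t = { .ops = &demo_ops };

	t.ops->set_last_eb_blk(&t, 4096);
	printf("last_eb_blk = %llu\n", t.ops->get_last_eb_blk(&t));
	return 0;
}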
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 				 &ocfs2_xattr_value_et_ops);
 }
 
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+				 NULL, &ocfs2_dx_root_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
 					struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 }
 
 const struct address_space_operations ocfs2_aops = {
 	.readpage		= ocfs2_readpage,
 	.readpages		= ocfs2_readpages,
 	.writepage		= ocfs2_writepage,
 	.write_begin		= ocfs2_write_begin,
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
 	.sync_page		= block_sync_page,
 	.direct_IO		= ocfs2_direct_IO,
 	.invalidatepage		= ocfs2_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
 #include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/debugfs.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+#define O2HB_DEBUG_DIR		"o2hb"
+#define O2HB_DEBUG_LIVENODES	"livenodes"
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+
 static LIST_HEAD(o2hb_all_regions);
 
 static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
 	return 0;
 }
 
-void o2hb_init(void)
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	char *buf = NULL;
+	int i = -1;
+	int out = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	o2hb_fill_node_map(map, sizeof(map));
+
+	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+	i_size_write(inode, out);
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+static struct file_operations o2hb_debug_fops = {
+	.open =		o2hb_debug_open,
+	.release =	o2hb_debug_release,
+	.read =		o2hb_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+	if (o2hb_debug_livenodes)
+		debugfs_remove(o2hb_debug_livenodes);
+	if (o2hb_debug_dir)
+		debugfs_remove(o2hb_debug_dir);
+}
+
+int o2hb_init(void)
 {
 	int i;
 
@@ -918,6 +994,24 @@ void o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
+						   S_IFREG|S_IRUSR,
+						   o2hb_debug_dir, NULL,
+						   &o2hb_debug_fops);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(-ENOMEM);
+		debugfs_remove(o2hb_debug_dir);
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
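With debugfs mounted at its conventional location, the new livenodes attribute reads like an ordinary file. A user-space sketch (the /sys/kernel/debug path is the usual default, not guaranteed on every system):

#include <stdio.h>

int main(void)
{
	/* Conventional debugfs mount point; adjust if mounted elsewhere. */
	FILE *f = fopen("/sys/kernel/debug/o2hb/livenodes", "r");
	char line[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("live o2hb nodes: %s", line);
	fclose(f);
	return 0;
}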
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
 			      struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
 	o2cb_sys_shutdown();
 
 	o2net_exit();
+	o2hb_exit();
 }
 
 static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
 
 	cluster_print_version();
 
-	o2hb_init();
+	ret = o2hb_init();
+	if (ret)
+		goto out;
 
 	ret = o2net_init();
 	if (ret)
-		goto out;
+		goto out_o2hb;
 
 	ret = o2net_register_hb_callbacks();
 	if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
 	o2net_unregister_hb_callbacks();
 out_o2net:
 	o2net_exit();
+out_o2hb:
+	o2hb_exit();
 out:
 	return ret;
 }
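The init_o2nm() change above is a textbook instance of the kernel's goto-unwind idiom: each setup step that succeeds earns a label in the error path, and teardown runs in exactly the reverse order of setup. A standalone sketch of the shape (names invented):

#include <stdio.h>

static int setup_a(void) { puts("setup a"); return 0; }
static void teardown_a(void) { puts("teardown a"); }
static int setup_b(int fail) { puts("setup b"); return fail ? -1 : 0; }
static void teardown_b(void) { puts("teardown b"); }

static int init_all(int fail_b)
{
	int ret;

	ret = setup_a();
	if (ret)
		goto out;

	ret = setup_b(fail_b);
	if (ret)
		goto out_a;	/* unwind only what already succeeded */

	return 0;

out_a:
	teardown_a();
out:
	return ret;
}

int main(void)
{
	printf("ok path -> %d\n", init_all(0));
	teardown_b();
	teardown_a();
	printf("failing path -> %d\n", init_all(1));
	return 0;
}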
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-			    struct inode *dir,
-			    struct buffer_head *parent_fe_bh,
-			    unsigned int blocks_wanted,
-			    struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
 			       struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
 
 /*
  * These are distinct checks because future versions of the file system will
  * want to have a trailing dirent structure independent of indexing.
  */
-static int ocfs2_dir_has_trailer(struct inode *dir)
+static int ocfs2_supports_dir_trailer(struct inode *dir)
 {
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+	return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
 }
 
-static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 {
-	return ocfs2_meta_ecc(osb);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	return ocfs2_meta_ecc(osb) ||
+		ocfs2_supports_indexed_dirs(osb);
 }
 
 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 {
 	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-	if (!ocfs2_dir_has_trailer(dir))
+	if (!ocfs2_supports_dir_trailer(dir))
 		return 0;
 
 	if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 }
 
 static void ocfs2_init_dir_trailer(struct inode *inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh, u16 rec_len)
 {
 	struct ocfs2_dir_block_trailer *trailer;
 
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
 		cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
 	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+	trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+				     struct buffer_head *dx_root_bh,
+				     struct buffer_head *dirdata_bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	trailer->db_free_next = dx_root->dr_free_blk;
+	dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+	return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+	brelse(res->dl_dx_root_bh);
+	brelse(res->dl_leaf_bh);
+	brelse(res->dl_dx_leaf_bh);
+	brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+	return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32 sum = 0;
+	__u32 b0 = buf[0], b1 = buf[1];
+	__u32 a = in[0], b = in[1], c = in[2], d = in[3];
+	int n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32 pad, val;
+	int i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+				   struct ocfs2_dx_hinfo *hinfo)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	const char *p;
+	__u32 in[8], buf[4];
+
+	/*
+	 * XXX: Is this really necessary, if the index is never looked
+	 * at by readdir? Is a hash value of '0' a bad idea?
+	 */
+	if ((len == 1 && !strncmp(".", name, 1)) ||
+	    (len == 2 && !strncmp("..", name, 2))) {
+		buf[0] = buf[1] = 0;
+		goto out;
+	}
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+	/*
+	 * This makes it very easy to debug indexing problems. We
+	 * should never allow this to be selected without hand editing
+	 * this file though.
+	 */
+	buf[0] = buf[1] = len;
+	goto out;
+#endif
+
+	memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+	p = name;
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+
+out:
+	hinfo->major_hash = buf[0];
+	hinfo->minor_hash = buf[1];
 }
 
 /*
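The name hash above is self-contained enough to run in isolation. A user-space restatement for experimentation -- the volume seed (osb_dx_seed) is zeroed here and the byte cast is made explicit, so the values are illustrative rather than what a real volume would compute:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DELTA 0x9E3779B9u

static void tea_transform(uint32_t buf[4], const uint32_t in[8])
{
	uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
	uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
	int n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
{
	uint32_t pad, val;
	int i;

	pad = (uint32_t)len | ((uint32_t)len << 8);
	pad |= pad << 16;

	val = pad;
	if (len > num * 4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		/* kernel code adds plain char; the cast pins down
		 * behavior for high-bit bytes in this sketch */
		val = (unsigned char)msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

int main(void)
{
	const char *name = "lost+found";
	int len = (int)strlen(name);
	uint32_t in[8], buf[4] = { 0, 0, 0, 0 };	/* zero seed (assumed) */
	const char *p = name;

	while (len > 0) {
		str2hashbuf(p, len, in, 4);
		tea_transform(buf, in);
		len -= 16;
		p += 16;
	}
	printf("major=0x%08x minor=0x%08x\n", buf[0], buf[1]);
	return 0;
}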
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 }
 
 /*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+	int rc = 0;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+	if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Invalid dirblock #%llu: "
+			    "signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    trailer->db_signature);
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu has an invalid "
+			    "db_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_parent_dinode) !=
+	    OCFS2_I(dir)->ip_blkno) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu on dinode "
+			    "#%llu has an invalid parent_dinode "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/*
  * This function forces all errors to -EIO for consistency with its
  * predecessor, ocfs2_bread(). We haven't audited what returning the
  * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
-	struct ocfs2_dir_block_trailer *trailer;
 
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
 				    ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 		goto out;
 	}
 
-	/*
-	 * We check the trailer here rather than in
-	 * ocfs2_validate_dir_block() because that function doesn't have
-	 * the inode to test.
-	 */
 	if (!(flags & OCFS2_BH_READAHEAD) &&
-	    ocfs2_dir_has_trailer(inode)) {
-		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
-		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Invalid dirblock #%llu: "
-				    "signature = %.*s\n",
-				    (unsigned long long)tmp->b_blocknr, 7,
-				    trailer->db_signature);
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu has an invalid "
-				    "db_blkno of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_parent_dinode) !=
-		    OCFS2_I(inode)->ip_blkno) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu on dinode "
-				    "#%llu has an invalid parent_dinode "
-				    "of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+	    ocfs2_supports_dir_trailer(inode)) {
+		rc = ocfs2_check_dir_trailer(inode, tmp);
+		if (rc) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(rc);
 			goto out;
 		}
 	}
@@ -379,6 +553,141 @@ out:
 	return rc ? -EIO : 0;
 }
 
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+				       struct buffer_head **bh)
+{
+	int ret;
+	struct buffer_head *tmp = *bh;
+
+	ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_supports_dir_trailer(dir)) {
+		ret = ocfs2_check_dir_trailer(dir, tmp);
+		if (ret) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!ret && !*bh)
+		*bh = tmp;
+out:
+	return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index root block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+		ocfs2_error(sb,
+			    "Dir Index Root # %llu has bad signature %.*s",
+			    (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+			    7, dx_root->dr_signature);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+			      struct buffer_head **dx_root_bh)
+{
+	int ret;
+	u64 blkno = le64_to_cpu(di->i_dx_root);
+	struct buffer_head *tmp = *dx_root_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_root_bh)
+		*dx_root_bh = tmp;
+
+	return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index leaf block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+		ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+			    7, dx_leaf->dl_signature);
+		return -EROFS;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+			      struct buffer_head **dx_leaf_bh)
+{
+	int ret;
+	struct buffer_head *tmp = *dx_leaf_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_leaf_bh)
+		*dx_leaf_bh = tmp;
+
+	return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+				struct buffer_head **dx_leaf_bhs)
+{
+	int ret;
+
+	ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+				ocfs2_validate_dx_leaf);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
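These readers follow ocfs2's read-with-validator convention: the block reader takes a callback that checks ECC and signature before the buffer is handed to any caller, so no path yields an unvalidated "trusted" buffer. A toy standalone version of the pattern (names invented):

#include <stdio.h>
#include <string.h>

typedef int (*validate_fn)(const unsigned char *block, size_t len);

/* The reader owns the "I/O"; the caller supplies the integrity check. */
static int read_block(const unsigned char *disk, size_t blkno,
		      size_t blksz, unsigned char *out, validate_fn validate)
{
	memcpy(out, disk + blkno * blksz, blksz);
	return validate ? validate(out, blksz) : 0;
}

static int validate_signature(const unsigned char *block, size_t len)
{
	return (len >= 4 && memcmp(block, "DXR1", 4) == 0) ? 0 : -1;
}

int main(void)
{
	unsigned char disk[2][8] = { "DXR1ok", "junk!!" };
	unsigned char buf[8];

	printf("block 0: %d\n",
	       read_block(&disk[0][0], 0, 8, buf, validate_signature));
	printf("block 1: %d\n",
	       read_block(&disk[0][0], 1, 8, buf, validate_signature));
	return 0;
}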
@@ -480,39 +789,340 @@ cleanup_and_exit:
 	return ret;
 }
 
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+				   struct ocfs2_extent_list *el,
+				   u32 major_hash,
+				   u32 *ret_cpos,
+				   u64 *ret_phys_blkno,
+				   unsigned int *ret_clen)
+{
+	int ret = 0, i, found;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "btree tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in btree", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+	if (ret_cpos)
+		*ret_cpos = le32_to_cpu(rec->e_cpos);
+	if (ret_clen)
+		*ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Returns the block index, from the start of the cluster which this
+ * hash belongs too.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						   u32 minor_hash)
+{
+	return minor_hash & osb->osb_dx_mask;
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						 struct ocfs2_dx_hinfo *hinfo)
+{
+	return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
+
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+			       struct ocfs2_extent_list *el,
+			       struct ocfs2_dx_hinfo *hinfo,
+			       u32 *ret_cpos,
+			       u64 *ret_phys_blkno)
+{
+	int ret = 0;
+	unsigned int cend, uninitialized_var(clen);
+	u32 uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	u32 name_hash = hinfo->major_hash;
+
+	ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+				      &clen);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cend = cpos + clen;
+	if (name_hash >= cend) {
+		/* We want the last cluster */
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+		cpos += clen - 1;
+	} else {
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+						  name_hash - cpos);
+		cpos = name_hash;
+	}
+
+	/*
+	 * We now have the cluster which should hold our entry. To
+	 * find the exact block from the start of the cluster to
+	 * search, we take the lower bits of the hash.
+	 */
+	blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = blkno;
+	if (ret_cpos)
+		*ret_cpos = cpos;
+
+out:
+
+	return ret;
+}
+
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dx_root_block *dx_root,
+			       struct ocfs2_dir_lookup_result *res)
+{
+	int ret, i, found;
+	u64 uninitialized_var(phys);
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = NULL;
+	struct buffer_head *dir_ent_bh = NULL;
+	struct ocfs2_dir_entry *dir_ent = NULL;
+	struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+	struct ocfs2_extent_list *dr_el;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+		goto search;
+	}
+
+	dr_el = &dx_root->dr_list;
+
+	ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
+	     "returns: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     namelen, name, hinfo->major_hash, hinfo->minor_hash,
+	     (unsigned long long)phys);
+
+	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+	mlog(0, "leaf info: num_used: %d, count: %d\n",
+	     le16_to_cpu(dx_leaf->dl_list.de_num_used),
+	     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+	entry_list = &dx_leaf->dl_list;
+
+search:
+	/*
+	 * Empty leaf is legal, so no need to check for that.
+	 */
+	found = 0;
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+		    || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+			continue;
+
+		/*
+		 * Search unindexed leaf block now. We're not
+		 * guaranteed to find anything.
+		 */
+		ret = ocfs2_read_dir_block_direct(dir,
+					  le64_to_cpu(dx_entry->dx_dirent_blk),
+					  &dir_ent_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * XXX: We should check the unindexed block here,
+		 * before using it.
+		 */
+
+		found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+					      0, dir_ent_bh->b_data,
+					      dir->i_sb->s_blocksize, &dir_ent);
+		if (found == 1)
+			break;
+
+		if (found == -1) {
+			/* This means we found a bad directory entry. */
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		brelse(dir_ent_bh);
+		dir_ent_bh = NULL;
+	}
+
+	if (found <= 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	res->dl_leaf_bh = dir_ent_bh;
+	res->dl_entry = dir_ent;
+	res->dl_dx_leaf_bh = dx_leaf_bh;
+	res->dl_dx_entry = dx_entry;
+
+	ret = 0;
+out:
+	if (ret) {
+		brelse(dx_leaf_bh);
+		brelse(dir_ent_bh);
+	}
+	return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+	if (ret) {
+		if (ret != -ENOENT)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	lookup->dl_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * Try to find an entry of the provided name within 'dir'.
  *
- * If nothing was found, NULL is returned. Otherwise, a buffer_head
- * and pointer to the dir entry are passed back.
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
  *
  * Caller can NOT assume anything about the contents of the
- * buffer_head - it is passed back only so that it can be passed into
- * any one of the manipulation functions (add entry, delete entry,
- * etc). As an example, bh in the extent directory case is a data
- * block, in the inline-data case it actually points to an inode.
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
  */
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir)
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
 {
-	*res_dir = NULL;
+	struct buffer_head *bh;
+	struct ocfs2_dir_entry *res_dir = NULL;
 
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+	/*
+	 * The unindexed dir code only uses part of the lookup
+	 * structure, so there's no reason to push it down further
+	 * than this.
+	 */
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+		bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+	else
+		bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+	if (bh == NULL)
+		return -ENOENT;
 
-	return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+	lookup->dl_leaf_bh = bh;
+	lookup->dl_entry = res_dir;
+	return 0;
 }
 
 /*
  * Update inode number and type of a previously found directory entry.
  */
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode)
 {
 	int ret;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
+	struct ocfs2_dir_entry *de = res->dl_entry;
+	struct buffer_head *de_bh = res->dl_leaf_bh;
 
 	/*
 	 * The same code works fine for both inline-data and extent
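The two-step mapping in ocfs2_dx_dir_lookup() is worth pinning down with numbers: the major hash walks the extent map to a cluster, then the low bits of the minor hash pick the block inside that cluster. A standalone restatement with example geometry (the constants are assumptions, not on-disk values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example geometry: 4K blocks, 32K clusters -> 8 blocks/cluster. */
	const uint32_t blocks_per_cluster = 8;
	const uint32_t dx_mask = blocks_per_cluster - 1;

	/* Pretend extent record: clusters [cpos, cpos+clen) live at blkno. */
	uint32_t cpos = 100, clen = 4;
	uint64_t blkno = 50000;

	uint32_t major_hash = 103;		/* falls inside the record */
	uint32_t minor_hash = 0xdeadbeef;

	/* Step 1: walk forward to the cluster covering major_hash. */
	blkno += (uint64_t)(major_hash - cpos) * blocks_per_cluster;

	/* Step 2: the low bits of minor_hash select the block within it. */
	blkno += minor_hash & dx_mask;

	(void)clen;
	printf("dx leaf block = %llu\n", (unsigned long long)blkno);
	return 0;
}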
@@ -538,6 +1148,10 @@ out:
 	return ret;
 }
 
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 				struct ocfs2_dir_entry *de_del,
 				struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
 	return status;
 }
 
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+	unsigned int hole;
+
+	if (le64_to_cpu(de->inode) == 0)
+		hole = le16_to_cpu(de->rec_len);
+	else
+		hole = le16_to_cpu(de->rec_len) -
+			OCFS2_DIR_REC_LEN(de->name_len);
+
+	return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+				  struct buffer_head *dirblock_bh)
+{
+	int size, this_hole, largest_hole = 0;
+	char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+	struct ocfs2_dir_entry *de;
+
+	trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+	size = ocfs2_dir_trailer_blk_off(sb);
+	limit = start + size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		if (de_buf != trailer) {
+			this_hole = ocfs2_figure_dirent_hole(de);
+			if (this_hole > largest_hole)
+				largest_hole = this_hole;
+		}
+
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+				       int index)
+{
+	int num_used = le16_to_cpu(entry_list->de_num_used);
+
+	if (num_used == 1 || index == (num_used - 1))
+		goto clear;
+
+	memmove(&entry_list->de_entries[index],
+		&entry_list->de_entries[index + 1],
+		(num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+	num_used--;
+	memset(&entry_list->de_entries[num_used], 0,
+	       sizeof(struct ocfs2_dx_entry));
+	entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these are a disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, dir,
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	mlog(0, "Dir %llu: delete entry at index: %d\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
 					struct inode *dir,
 					struct ocfs2_dir_entry *de_del,
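ocfs2_delete_entry_dx() recovers the array index of the matched entry from raw pointer arithmetic against the base of de_entries. The same computation in isolation:

#include <stdint.h>
#include <stdio.h>

struct dx_entry {
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t dirent_blk;
};

int main(void)
{
	struct dx_entry entries[8];
	struct dx_entry *hit = &entries[5];	/* pretend lookup result */

	/* Byte offset between the element pointer and the array base,
	 * divided by the element size, recovers the index. */
	int index = (int)(((char *)hit - (char *)entries) / sizeof(*hit));

	printf("index = %d\n", index);		/* prints 5 */
	return 0;
}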
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+		       struct ocfs2_dir_lookup_result *res)
 {
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
 
-	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
 }
 
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
 	return 0;
 }
 
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+					      handle_t *handle,
+					      struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
  */
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh)
+		      struct ocfs2_dir_lookup_result *lookup)
 {
 	unsigned long offset;
 	unsigned short rec_len;
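The free list managed above is a singly linked list threaded through the unindexed blocks' trailers, with dx_root->dr_free_blk as the head: blocks are pushed at the front when they gain space and unlinked when it runs out. A miniature with plain pointers standing in for block numbers (illustrative only):

#include <stdio.h>
#include <stddef.h>

struct dir_block {
	int id;
	int free_bytes;
	struct dir_block *free_next;
};

static struct dir_block *free_head;

static void free_list_add(struct dir_block *b)
{
	b->free_next = free_head;	/* like trailer->db_free_next */
	free_head = b;			/* like dx_root->dr_free_blk */
}

static void free_list_remove(struct dir_block *b, struct dir_block *prev)
{
	if (prev == NULL)		/* block sits at the list head */
		free_head = b->free_next;
	else
		prev->free_next = b->free_next;
	b->free_next = NULL;
	b->free_bytes = 0;
}

int main(void)
{
	struct dir_block a = { 1, 64, NULL }, b = { 2, 128, NULL };

	free_list_add(&a);
	free_list_add(&b);		/* list: b -> a */
	free_list_remove(&b, NULL);	/* unlink from the head */

	printf("head is block %d\n", free_head ? free_head->id : -1);
	return 0;
}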
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct super_block *sb = dir->i_sb;
 	int retval, status;
 	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
 	if (!namelen)
 		return -EINVAL;
 
-	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		data_start = di->id2.i_data.id_data;
 		size = i_size_read(dir);
 
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
 				status = ocfs2_journal_access_di(handle, dir,
 								 insert_bh,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
-			else
+			else {
 				status = ocfs2_journal_access_db(handle, dir,
 								 insert_bh,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								handle,
+								lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
 			dir->i_version++;
 			status = ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
@@ -870,6 +1851,10 @@ out:
 	return 0;
 }
 
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
 				    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent)
+			     struct ocfs2_dir_lookup_result *lookup)
 {
 	int status = -ENOENT;
 
-	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-		   namelen, name, blkno, inode, dirent_bh, dirent);
+	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-	if (!*dirent_bh || !*dirent) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
 		goto leave;
-	}
 
-	*blkno = le64_to_cpu((*dirent)->inode);
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
 
 	status = 0;
 leave:
-	if (status < 0) {
-		*dirent = NULL;
-		brelse(*dirent_bh);
-		*dirent_bh = NULL;
-	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-	brelse(bh);
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 			      int namelen)
 {
 	int ret;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("dir %llu, name '%.*s'\n",
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
-	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-	if (dirent_bh)
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
 		goto bail;
 
 	ret = 0;
 bail:
-	brelse(dirent_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(ret);
 	return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
+	unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 				   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	/*
 	 * Check the positions of "." and ".." records to be sure
 	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed In the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
 	 */
 	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
 		p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	if (name_len == 2 && !strncmp("..", name, 2) &&
 	    pos == OCFS2_DIR_REC_LEN(1)) {
 		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
 		return 0;
 	}
 
 	p->seen_other = 1;
 	return 1;
 }
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
1188 2210
1189 memset(&priv, 0, sizeof(priv)); 2211 memset(&priv, 0, sizeof(priv));
1190 2212
2213 if (ocfs2_dir_indexed(inode)) {
2214 ret = ocfs2_empty_dir_dx(inode, &priv);
2215 if (ret)
2216 mlog_errno(ret);
2217 /*
2218 * We still run ocfs2_dir_foreach to get the checks
2219 * for "." and "..".
2220 */
2221 }
2222
1191 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2223 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1192 if (ret) 2224 if (ret)
1193 mlog_errno(ret); 2225 mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1280 struct inode *parent, 2312 struct inode *parent,
1281 struct inode *inode, 2313 struct inode *inode,
1282 struct buffer_head *fe_bh, 2314 struct buffer_head *fe_bh,
1283 struct ocfs2_alloc_context *data_ac) 2315 struct ocfs2_alloc_context *data_ac,
2316 struct buffer_head **ret_new_bh)
1284{ 2317{
1285 int status; 2318 int status;
1286 unsigned int size = osb->sb->s_blocksize; 2319 unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1289 2322
1290 mlog_entry_void(); 2323 mlog_entry_void();
1291 2324
1292 if (ocfs2_supports_dir_trailer(osb)) 2325 if (ocfs2_new_dir_wants_trailer(inode))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2326 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294 2327
1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2328 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2343 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1311 2344
1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2345 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1313 if (ocfs2_supports_dir_trailer(osb)) 2346 if (ocfs2_new_dir_wants_trailer(inode)) {
1314 ocfs2_init_dir_trailer(inode, new_bh); 2347 int size = le16_to_cpu(de->rec_len);
2348
2349 /*
2350 * Figure out the size of the hole left over after
2351 * insertion of '.' and '..'. The trailer wants this
2352 * information.
2353 */
2354 size -= OCFS2_DIR_REC_LEN(2);
2355 size -= sizeof(struct ocfs2_dir_block_trailer);
2356
2357 ocfs2_init_dir_trailer(inode, new_bh, size);
2358 }
1315 2359
1316 status = ocfs2_journal_dirty(handle, new_bh); 2360 status = ocfs2_journal_dirty(handle, new_bh);
1317 if (status < 0) { 2361 if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1329 } 2373 }
1330 2374
1331 status = 0; 2375 status = 0;
2376 if (ret_new_bh) {
2377 *ret_new_bh = new_bh;
2378 new_bh = NULL;
2379 }
1332bail: 2380bail:
1333 brelse(new_bh); 2381 brelse(new_bh);
1334 2382
@@ -1336,20 +2384,427 @@ bail:
1336 return status; 2384 return status;
1337} 2385}
1338 2386
2387static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2388 handle_t *handle, struct inode *dir,
2389 struct buffer_head *di_bh,
2390 struct buffer_head *dirdata_bh,
2391 struct ocfs2_alloc_context *meta_ac,
2392 int dx_inline, u32 num_entries,
2393 struct buffer_head **ret_dx_root_bh)
2394{
2395 int ret;
2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2397 u16 dr_suballoc_bit;
2398 u64 dr_blkno;
2399 unsigned int num_bits;
2400 struct buffer_head *dx_root_bh = NULL;
2401 struct ocfs2_dx_root_block *dx_root;
2402 struct ocfs2_dir_block_trailer *trailer =
2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2404
2405 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2406 &num_bits, &dr_blkno);
2407 if (ret) {
2408 mlog_errno(ret);
2409 goto out;
2410 }
2411
2412 mlog(0, "Dir %llu, attach new index block: %llu\n",
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno);
2415
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) {
2418 ret = -EIO;
2419 goto out;
2420 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2422
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429
2430 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2431 memset(dx_root, 0, osb->sb->s_blocksize);
2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2433 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2434 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2435 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2436 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2437 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2438 dx_root->dr_num_entries = cpu_to_le32(num_entries);
2439 if (le16_to_cpu(trailer->db_free_rec_len))
2440 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2441 else
2442 dx_root->dr_free_blk = cpu_to_le64(0);
2443
2444 if (dx_inline) {
2445 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2446 dx_root->dr_entries.de_count =
2447 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2448 } else {
2449 dx_root->dr_list.l_count =
2450 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2451 }
2452
2453 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2454 if (ret)
2455 mlog_errno(ret);
2456
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) {
2460 mlog_errno(ret);
2461 goto out;
2462 }
2463
2464 di->i_dx_root = cpu_to_le64(dr_blkno);
2465
2466 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2467 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2468
2469 ret = ocfs2_journal_dirty(handle, di_bh);
2470 if (ret)
2471 mlog_errno(ret);
2472
2473 *ret_dx_root_bh = dx_root_bh;
2474 dx_root_bh = NULL;
2475
2476out:
2477 brelse(dx_root_bh);
2478 return ret;
2479}
2480
2481static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2482 handle_t *handle, struct inode *dir,
2483 struct buffer_head **dx_leaves,
2484 int num_dx_leaves, u64 start_blk)
2485{
2486 int ret, i;
2487 struct ocfs2_dx_leaf *dx_leaf;
2488 struct buffer_head *bh;
2489
2490 for (i = 0; i < num_dx_leaves; i++) {
2491 bh = sb_getblk(osb->sb, start_blk + i);
2492 if (bh == NULL) {
2493 ret = -EIO;
2494 goto out;
2495 }
2496 dx_leaves[i] = bh;
2497
2498 ocfs2_set_new_buffer_uptodate(dir, bh);
2499
2500 ret = ocfs2_journal_access_dl(handle, dir, bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) {
2503 mlog_errno(ret);
2504 goto out;
2505 }
2506
2507 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2508
2509 memset(dx_leaf, 0, osb->sb->s_blocksize);
2510 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2511 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2512 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2513 dx_leaf->dl_list.de_count =
2514 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2515
2516 mlog(0,
2517 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2518 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2519 (unsigned long long)bh->b_blocknr,
2520 le16_to_cpu(dx_leaf->dl_list.de_count));
2521
2522 ocfs2_journal_dirty(handle, bh);
2523 }
2524
2525 ret = 0;
2526out:
2527 return ret;
2528}
2529
2530/*
2531 * Allocates and formats a new cluster for use in an indexed dir
2532 * leaf. This version will not do the extent insert, so that it can be
2533 * used by operations which need careful ordering.
2534 */
2535static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2536 u32 cpos, handle_t *handle,
2537 struct ocfs2_alloc_context *data_ac,
2538 struct buffer_head **dx_leaves,
2539 int num_dx_leaves, u64 *ret_phys_blkno)
2540{
2541 int ret;
2542 u32 phys, num;
2543 u64 phys_blkno;
2544 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2545
2546 /*
2547 * XXX: For create, this should claim cluster for the index
2548 * *before* the unindexed insert so that we have a better
2549 * chance of contiguousness as the directory grows in number
2550 * of entries.
2551 */
2552 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2553 if (ret) {
2554 mlog_errno(ret);
2555 goto out;
2556 }
2557
2558 /*
2559 * Format the new cluster first. That way, we're inserting
2560 * valid data.
2561 */
2562 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2563 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2564 num_dx_leaves, phys_blkno);
2565 if (ret) {
2566 mlog_errno(ret);
2567 goto out;
2568 }
2569
2570 *ret_phys_blkno = phys_blkno;
2571out:
2572 return ret;
2573}
2574
2575static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2576 struct ocfs2_extent_tree *et,
2577 u32 cpos, handle_t *handle,
2578 struct ocfs2_alloc_context *data_ac,
2579 struct ocfs2_alloc_context *meta_ac,
2580 struct buffer_head **dx_leaves,
2581 int num_dx_leaves)
2582{
2583 int ret;
2584 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno);
2589 if (ret) {
2590 mlog_errno(ret);
2591 goto out;
2592 }
2593
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2595 meta_ac);
2596 if (ret)
2597 mlog_errno(ret);
2598out:
2599 return ret;
2600}
2601
2602static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2603 int *ret_num_leaves)
2604{
2605 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2606 struct buffer_head **dx_leaves;
2607
2608 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2609 GFP_NOFS);
2610 if (dx_leaves && ret_num_leaves)
2611 *ret_num_leaves = num_dx_leaves;
2612
2613 return dx_leaves;
2614}
2615
2616static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2617 handle_t *handle,
2618 struct inode *parent,
2619 struct inode *inode,
2620 struct buffer_head *di_bh,
2621 struct ocfs2_alloc_context *data_ac,
2622 struct ocfs2_alloc_context *meta_ac)
2623{
2624 int ret;
2625 struct buffer_head *leaf_bh = NULL;
2626 struct buffer_head *dx_root_bh = NULL;
2627 struct ocfs2_dx_hinfo hinfo;
2628 struct ocfs2_dx_root_block *dx_root;
2629 struct ocfs2_dx_entry_list *entry_list;
2630
2631 /*
2632 * Our strategy is to create the directory as though it were
2633 * unindexed, then add the index block. This works with very
2634 * little complication since the state of a new directory is a
 2635 * very well-known quantity.
2636 *
 2637 * Essentially, we have two dirents ("." and "..") in the 1st
2638 * block which need indexing. These are easily inserted into
2639 * the index block.
2640 */
2641
2642 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2643 data_ac, &leaf_bh);
2644 if (ret) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648
2649 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2650 meta_ac, 1, 2, &dx_root_bh);
2651 if (ret) {
2652 mlog_errno(ret);
2653 goto out;
2654 }
2655 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2656 entry_list = &dx_root->dr_entries;
2657
2658 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2659 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2660 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2661
2662 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2663 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2664
2665out:
2666 brelse(dx_root_bh);
2667 brelse(leaf_bh);
2668 return ret;
2669}
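ocfs2_dx_entry_list_insert() does the actual indexing of "." and ".." above, but its body is outside this hunk. Judged from its call sites, the shape is an append of the (major hash, minor hash, block) triple into a bounded array; a hedged in-memory model (field names and the array bound are assumptions for illustration):

```c
#include <stdint.h>
#include <assert.h>

/* Simplified stand-ins for the on-disk structures. */
struct dx_entry {
	uint32_t major_hash;
	uint32_t minor_hash;
	uint64_t blkno;		/* unindexed block holding the dirent */
};

struct dx_entry_list {
	uint16_t de_num_used;
	uint16_t de_count;
	struct dx_entry de_entries[16];
};

static void dx_entry_list_insert(struct dx_entry_list *list, uint32_t major,
				 uint32_t minor, uint64_t blkno)
{
	struct dx_entry *e;

	assert(list->de_num_used < list->de_count);
	e = &list->de_entries[list->de_num_used++];
	e->major_hash = major;
	e->minor_hash = minor;
	e->blkno = blkno;
}
```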
2670
1339int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2671int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1340 handle_t *handle, 2672 handle_t *handle,
1341 struct inode *parent, 2673 struct inode *parent,
1342 struct inode *inode, 2674 struct inode *inode,
1343 struct buffer_head *fe_bh, 2675 struct buffer_head *fe_bh,
1344 struct ocfs2_alloc_context *data_ac) 2676 struct ocfs2_alloc_context *data_ac,
2677 struct ocfs2_alloc_context *meta_ac)
2678
1345{ 2679{
1346 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2680 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1347 2681
1348 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2682 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1349 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2683 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1350 2684
2685 if (ocfs2_supports_indexed_dirs(osb))
2686 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2687 data_ac, meta_ac);
2688
1351 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2689 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1352 data_ac); 2690 data_ac, NULL);
2691}
2692
2693static int ocfs2_dx_dir_index_block(struct inode *dir,
2694 handle_t *handle,
2695 struct buffer_head **dx_leaves,
2696 int num_dx_leaves,
2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh)
2699{
 2700 int ret = 0, namelen, i;
2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh;
2704 struct ocfs2_dx_hinfo hinfo;
2705 u64 dirent_blk = dirent_bh->b_blocknr;
2706
2707 de_buf = dirent_bh->b_data;
2708 limit = de_buf + dir->i_sb->s_blocksize;
2709
2710 while (de_buf < limit) {
2711 de = (struct ocfs2_dir_entry *)de_buf;
2712
2713 namelen = de->name_len;
2714 if (!namelen || !de->inode)
2715 goto inc;
2716
2717 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2718
2719 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2720 dx_leaf_bh = dx_leaves[i];
2721
2722 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2723 dirent_blk, dx_leaf_bh);
2724 if (ret) {
2725 mlog_errno(ret);
2726 goto out;
2727 }
2728
2729 *num_dx_entries = *num_dx_entries + 1;
2730
2731inc:
2732 de_buf += le16_to_cpu(de->rec_len);
2733 }
2734
2735out:
2736 return ret;
2737}
2738
2739/*
2740 * XXX: This expects dx_root_bh to already be part of the transaction.
2741 */
2742static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2743 struct buffer_head *dx_root_bh,
2744 struct buffer_head *dirent_bh)
2745{
2746 char *de_buf, *limit;
2747 struct ocfs2_dx_root_block *dx_root;
2748 struct ocfs2_dir_entry *de;
2749 struct ocfs2_dx_hinfo hinfo;
2750 u64 dirent_blk = dirent_bh->b_blocknr;
2751
2752 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2753
2754 de_buf = dirent_bh->b_data;
2755 limit = de_buf + dir->i_sb->s_blocksize;
2756
2757 while (de_buf < limit) {
2758 de = (struct ocfs2_dir_entry *)de_buf;
2759
2760 if (!de->name_len || !de->inode)
2761 goto inc;
2762
2763 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2764
2765 mlog(0,
2766 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
2767 (unsigned long long)dir->i_ino, hinfo.major_hash,
2768 hinfo.minor_hash,
2769 le16_to_cpu(dx_root->dr_entries.de_num_used),
2770 de->name_len, de->name);
2771
2772 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2773 dirent_blk);
2774
2775 le32_add_cpu(&dx_root->dr_num_entries, 1);
2776inc:
2777 de_buf += le16_to_cpu(de->rec_len);
2778 }
2779}
2780
2781/*
2782 * Count the number of inline directory entries in di_bh and compare
2783 * them against the number of entries we can hold in an inline dx root
2784 * block.
2785 */
2786static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2787 struct buffer_head *di_bh)
2788{
2789 int dirent_count = 0;
2790 char *de_buf, *limit;
2791 struct ocfs2_dir_entry *de;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793
2794 de_buf = di->id2.i_data.id_data;
2795 limit = de_buf + i_size_read(dir);
2796
2797 while (de_buf < limit) {
2798 de = (struct ocfs2_dir_entry *)de_buf;
2799
2800 if (de->name_len && de->inode)
2801 dirent_count++;
2802
2803 de_buf += le16_to_cpu(de->rec_len);
2804 }
2805
2806 /* We are careful to leave room for one extra record. */
2807 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
1353} 2808}
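The counting loop above is the classic rec_len walk used throughout this file: every record, live or deleted, stores the distance to the next one, so the chain always covers the whole region. A self-contained userspace model (simplified fields, not the on-disk layout):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_de {		/* simplified dirent header */
	uint64_t inode;
	uint16_t rec_len;	/* distance to the next record */
	uint8_t  name_len;
};

static int count_used(char *buf, unsigned int size)
{
	int count = 0;
	char *p = buf, *limit = buf + size;

	while (p < limit) {
		struct fake_de *de = (struct fake_de *)p;

		if (de->name_len && de->inode)	/* skip unused slots */
			count++;
		p += de->rec_len;		/* rec_len chains the records */
	}
	return count;
}

int main(void)
{
	_Alignas(uint64_t) char buf[64];
	struct fake_de *de = (struct fake_de *)buf;

	memset(buf, 0, sizeof(buf));
	de->inode = 5; de->rec_len = 16; de->name_len = 1;	/* "."  */
	de = (struct fake_de *)(buf + 16);
	de->inode = 2; de->rec_len = 48; de->name_len = 2;	/* ".." */

	printf("%d entries in use\n", count_used(buf, sizeof(buf)));
	return 0;
}
```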
1354 2809
1355/* 2810/*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1358 * expansion from an inline directory to one with extents. The first dir block 2813 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block. 2814 * in that case is taken from the inline data portion of the inode block.
1360 * 2815 *
2816 * This will also return the largest amount of contiguous space for a dirent
 2817 * in the block. That space is *not* necessarily in the last dirent, even after
2818 * expansion. The directory indexing code wants this value for free space
2819 * accounting. We do this here since we're already walking the entire dir
2820 * block.
2821 *
1361 * We add the dir trailer if this filesystem wants it. 2822 * We add the dir trailer if this filesystem wants it.
1362 */ 2823 */
1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2824static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1364 struct super_block *sb) 2825 struct inode *dir)
1365{ 2826{
2827 struct super_block *sb = dir->i_sb;
1366 struct ocfs2_dir_entry *de; 2828 struct ocfs2_dir_entry *de;
1367 struct ocfs2_dir_entry *prev_de; 2829 struct ocfs2_dir_entry *prev_de;
1368 char *de_buf, *limit; 2830 char *de_buf, *limit;
1369 unsigned int new_size = sb->s_blocksize; 2831 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes; 2832 unsigned int bytes, this_hole;
2833 unsigned int largest_hole = 0;
1371 2834
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2835 if (ocfs2_new_dir_wants_trailer(dir))
1373 new_size = ocfs2_dir_trailer_blk_off(sb); 2836 new_size = ocfs2_dir_trailer_blk_off(sb);
1374 2837
1375 bytes = new_size - old_size; 2838 bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1378 de_buf = start; 2841 de_buf = start;
1379 de = (struct ocfs2_dir_entry *)de_buf; 2842 de = (struct ocfs2_dir_entry *)de_buf;
1380 do { 2843 do {
2844 this_hole = ocfs2_figure_dirent_hole(de);
2845 if (this_hole > largest_hole)
2846 largest_hole = this_hole;
2847
1381 prev_de = de; 2848 prev_de = de;
1382 de_buf += le16_to_cpu(de->rec_len); 2849 de_buf += le16_to_cpu(de->rec_len);
1383 de = (struct ocfs2_dir_entry *)de_buf; 2850 de = (struct ocfs2_dir_entry *)de_buf;
1384 } while (de_buf < limit); 2851 } while (de_buf < limit);
1385 2852
1386 le16_add_cpu(&prev_de->rec_len, bytes); 2853 le16_add_cpu(&prev_de->rec_len, bytes);
2854
 2855 /* We need to double-check this after modification of the final
2856 * dirent. */
2857 this_hole = ocfs2_figure_dirent_hole(prev_de);
2858 if (this_hole > largest_hole)
2859 largest_hole = this_hole;
2860
2861 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2862 return largest_hole;
2863 return 0;
1387} 2864}
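ocfs2_figure_dirent_hole() is not part of this hunk; from the way it is used here, the idea appears to be that an unused record contributes its entire rec_len while a live record contributes only the slack left after its padded name. A sketch under that assumption:

```c
#include <stdint.h>

#define DIR_REC_LEN(n)	(((n) + 12 + 3) & ~3)	/* illustrative, as above */

/* Contiguous space reusable within one record: the whole record if
 * the slot is free, otherwise whatever the current name leaves over. */
static unsigned int dirent_hole(uint64_t inode, uint16_t rec_len,
				uint8_t name_len)
{
	if (inode == 0)
		return rec_len;
	return rec_len - DIR_REC_LEN(name_len);
}
```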
1388 2865
1389/* 2866/*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1396 */ 2873 */
1397static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2874static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1398 unsigned int blocks_wanted, 2875 unsigned int blocks_wanted,
2876 struct ocfs2_dir_lookup_result *lookup,
1399 struct buffer_head **first_block_bh) 2877 struct buffer_head **first_block_bh)
1400{ 2878{
1401 u32 alloc, bit_off, len; 2879 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
1402 struct super_block *sb = dir->i_sb; 2880 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2881 int ret, i, num_dx_leaves = 0, dx_inline = 0,
1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2882 credits = ocfs2_inline_to_extents_credits(sb);
2883 u64 dx_insert_blkno, blkno,
2884 bytes = blocks_wanted << sb->s_blocksize_bits;
1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2885 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1406 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2886 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1407 struct ocfs2_alloc_context *data_ac; 2887 struct ocfs2_alloc_context *data_ac;
2888 struct ocfs2_alloc_context *meta_ac = NULL;
1408 struct buffer_head *dirdata_bh = NULL; 2889 struct buffer_head *dirdata_bh = NULL;
2890 struct buffer_head *dx_root_bh = NULL;
2891 struct buffer_head **dx_leaves = NULL;
1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2892 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1410 handle_t *handle; 2893 handle_t *handle;
1411 struct ocfs2_extent_tree et; 2894 struct ocfs2_extent_tree et;
1412 int did_quota = 0; 2895 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0;
1413 2897
1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1415 2899
1416 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2900 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0;
2902
2903 if (ocfs2_supports_indexed_dirs(osb)) {
2904 credits += ocfs2_add_dir_index_credits(sb);
2905
2906 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2907 if (!dx_inline) {
2908 /* Add one more cluster for an index leaf */
2909 dx_alloc++;
2910 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2911 &num_dx_leaves);
2912 if (!dx_leaves) {
2913 ret = -ENOMEM;
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917 }
2918
2919 /* This gets us the dx_root */
2920 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2921 if (ret) {
2922 mlog_errno(ret);
2923 goto out;
2924 }
2925 }
1417 2926
1418 /* 2927 /*
1419 * We should never need more than 2 clusters for this - 2928 * We should never need more than 2 clusters for the unindexed
1420 * maximum dirent size is far less than one block. In fact, 2929 * tree - maximum dirent size is far less than one block. In
1421 * the only time we'd need more than one cluster is if 2930 * fact, the only time we'd need more than one cluster is if
1422 * blocksize == clustersize and the dirent won't fit in the 2931 * blocksize == clustersize and the dirent won't fit in the
1423 * extra space that the expansion to a single block gives. As 2932 * extra space that the expansion to a single block gives. As
1424 * of today, that only happens on 4k/4k file systems. 2933 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1435 2944
1436 /* 2945 /*
1437 * Prepare for worst case allocation scenario of two separate 2946 * Prepare for worst case allocation scenario of two separate
1438 * extents. 2947 * extents in the unindexed tree.
1439 */ 2948 */
1440 if (alloc == 2) 2949 if (alloc == 2)
1441 credits += OCFS2_SUBALLOC_ALLOC; 2950 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1448 } 2957 }
1449 2958
1450 if (vfs_dq_alloc_space_nodirty(dir, 2959 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2960 ocfs2_clusters_to_bytes(osb->sb,
2961 alloc + dx_alloc))) {
1452 ret = -EDQUOT; 2962 ret = -EDQUOT;
1453 goto out_commit; 2963 goto out_commit;
1454 } 2964 }
1455 did_quota = 1; 2965 did_quota = 1;
2966
2967 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2968 /*
2969 * Allocate our index cluster first, to maximize the
2970 * possibility that unindexed leaves grow
2971 * contiguously.
2972 */
2973 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2974 dx_leaves, num_dx_leaves,
2975 &dx_insert_blkno);
2976 if (ret) {
2977 mlog_errno(ret);
2978 goto out_commit;
2979 }
2980 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2981 }
2982
1456 /* 2983 /*
1457 * Try to claim as many clusters as the bitmap can give though 2984 * Try to claim as many clusters as the bitmap can give though
1458 * if we only get one now, that's enough to continue. The rest 2985 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1463 mlog_errno(ret); 2990 mlog_errno(ret);
1464 goto out_commit; 2991 goto out_commit;
1465 } 2992 }
2993 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1466 2994
1467 /* 2995 /*
1468 * Operations are carefully ordered so that we set up the new 2996 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 3017 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1490 memset(dirdata_bh->b_data + i_size_read(dir), 0, 3018 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1491 sb->s_blocksize - i_size_read(dir)); 3019 sb->s_blocksize - i_size_read(dir));
1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 3020 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
1493 if (ocfs2_supports_dir_trailer(osb)) 3021 if (ocfs2_new_dir_wants_trailer(dir)) {
1494 ocfs2_init_dir_trailer(dir, dirdata_bh); 3022 /*
3023 * Prepare the dir trailer up front. It will otherwise look
3024 * like a valid dirent. Even if inserting the index fails
 3025 * (unlikely), all we'll have done is give the first dir
3026 * block a small amount of fragmentation.
3027 */
3028 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3029 }
1495 3030
1496 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3031 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1497 if (ret) { 3032 if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1499 goto out_commit; 3034 goto out_commit;
1500 } 3035 }
1501 3036
3037 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3038 /*
3039 * Dx dirs with an external cluster need to do this up
 3040 * front. Inline dx roots get handled later, after
3041 * we've allocated our root block. We get passed back
3042 * a total number of items so that dr_num_entries can
3043 * be correctly set once the dx_root has been
3044 * allocated.
3045 */
3046 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3047 num_dx_leaves, &num_dx_entries,
3048 dirdata_bh);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out_commit;
3052 }
3053 }
3054
1502 /* 3055 /*
1503 * Set extent, i_size, etc on the directory. After this, the 3056 * Set extent, i_size, etc on the directory. After this, the
1504 * inode should contain the same exact dirents as before and 3057 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1551 goto out_commit; 3104 goto out_commit;
1552 } 3105 }
1553 3106
3107 if (ocfs2_supports_indexed_dirs(osb)) {
3108 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3109 dirdata_bh, meta_ac, dx_inline,
3110 num_dx_entries, &dx_root_bh);
3111 if (ret) {
3112 mlog_errno(ret);
3113 goto out_commit;
3114 }
3115
3116 if (dx_inline) {
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh);
3119 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL);
3123 if (ret)
3124 mlog_errno(ret);
3125 }
3126 }
3127
1554 /* 3128 /*
1555 * We asked for two clusters, but only got one in the 1st 3129 * We asked for two clusters, but only got one in the 1st
1556 * pass. Claim the 2nd cluster as a separate extent. 3130 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1570 mlog_errno(ret); 3144 mlog_errno(ret);
1571 goto out_commit; 3145 goto out_commit;
1572 } 3146 }
3147 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1573 } 3148 }
1574 3149
1575 *first_block_bh = dirdata_bh; 3150 *first_block_bh = dirdata_bh;
1576 dirdata_bh = NULL; 3151 dirdata_bh = NULL;
3152 if (ocfs2_supports_indexed_dirs(osb)) {
3153 unsigned int off;
3154
3155 if (!dx_inline) {
3156 /*
3157 * We need to return the correct block within the
3158 * cluster which should hold our entry.
3159 */
3160 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3161 &lookup->dl_hinfo);
3162 get_bh(dx_leaves[off]);
3163 lookup->dl_dx_leaf_bh = dx_leaves[off];
3164 }
3165 lookup->dl_dx_root_bh = dx_root_bh;
3166 dx_root_bh = NULL;
3167 }
1577 3168
1578out_commit: 3169out_commit:
1579 if (ret < 0 && did_quota) 3170 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir, 3171 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1581 ocfs2_clusters_to_bytes(osb->sb, 2)); 3172
1582 ocfs2_commit_trans(osb, handle); 3173 ocfs2_commit_trans(osb, handle);
1583 3174
1584out_sem: 3175out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
1587out: 3178out:
1588 if (data_ac) 3179 if (data_ac)
1589 ocfs2_free_alloc_context(data_ac); 3180 ocfs2_free_alloc_context(data_ac);
3181 if (meta_ac)
3182 ocfs2_free_alloc_context(meta_ac);
3183
3184 if (dx_leaves) {
3185 for (i = 0; i < num_dx_leaves; i++)
3186 brelse(dx_leaves[i]);
3187 kfree(dx_leaves);
3188 }
1590 3189
1591 brelse(dirdata_bh); 3190 brelse(dirdata_bh);
3191 brelse(dx_root_bh);
1592 3192
1593 return ret; 3193 return ret;
1594} 3194}
@@ -1658,11 +3258,14 @@ bail:
1658 * is to be turned into an extent based one. The size of the dirent to 3258 * is to be turned into an extent based one. The size of the dirent to
1659 * insert might be larger than the space gained by growing to just one 3259 * insert might be larger than the space gained by growing to just one
1660 * block, so we may have to grow the inode by two blocks in that case. 3260 * block, so we may have to grow the inode by two blocks in that case.
3261 *
3262 * If the directory is already indexed, dx_root_bh must be provided.
1661 */ 3263 */
1662static int ocfs2_extend_dir(struct ocfs2_super *osb, 3264static int ocfs2_extend_dir(struct ocfs2_super *osb,
1663 struct inode *dir, 3265 struct inode *dir,
1664 struct buffer_head *parent_fe_bh, 3266 struct buffer_head *parent_fe_bh,
1665 unsigned int blocks_wanted, 3267 unsigned int blocks_wanted,
3268 struct ocfs2_dir_lookup_result *lookup,
1666 struct buffer_head **new_de_bh) 3269 struct buffer_head **new_de_bh)
1667{ 3270{
1668 int status = 0; 3271 int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1677 struct ocfs2_dir_entry * de; 3280 struct ocfs2_dir_entry * de;
1678 struct super_block *sb = osb->sb; 3281 struct super_block *sb = osb->sb;
1679 struct ocfs2_extent_tree et; 3282 struct ocfs2_extent_tree et;
3283 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1680 3284
1681 mlog_entry_void(); 3285 mlog_entry_void();
1682 3286
1683 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3287 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3288 /*
3289 * This would be a code error as an inline directory should
3290 * never have an index root.
3291 */
3292 BUG_ON(dx_root_bh);
3293
1684 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3294 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1685 blocks_wanted, &new_bh); 3295 blocks_wanted, lookup,
3296 &new_bh);
1686 if (status) { 3297 if (status) {
1687 mlog_errno(status); 3298 mlog_errno(status);
1688 goto bail; 3299 goto bail;
1689 } 3300 }
1690 3301
3302 /* Expansion from inline to an indexed directory will
3303 * have given us this. */
3304 dx_root_bh = lookup->dl_dx_root_bh;
3305
1691 if (blocks_wanted == 1) { 3306 if (blocks_wanted == 1) {
1692 /* 3307 /*
1693 * If the new dirent will fit inside the space 3308 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1751 } 3366 }
1752 3367
1753do_extend: 3368do_extend:
3369 if (ocfs2_dir_indexed(dir))
3370 credits++; /* For attaching the new dirent block to the
3371 * dx_root */
3372
1754 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3373 down_write(&OCFS2_I(dir)->ip_alloc_sem);
1755 drop_alloc_sem = 1; 3374 drop_alloc_sem = 1;
1756 3375
@@ -1781,9 +3400,19 @@ do_extend:
1781 3400
1782 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3401 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1783 de->inode = 0; 3402 de->inode = 0;
1784 if (ocfs2_dir_has_trailer(dir)) { 3403 if (ocfs2_supports_dir_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3404 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh); 3405
3406 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3407
3408 if (ocfs2_dir_indexed(dir)) {
3409 status = ocfs2_dx_dir_link_trailer(dir, handle,
3410 dx_root_bh, new_bh);
3411 if (status) {
3412 mlog_errno(status);
3413 goto bail;
3414 }
3415 }
1787 } else { 3416 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize); 3417 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 } 3418 }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1839 * This calculates how many free bytes we'd have in block zero, should 3468 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree. 3469 * this function force expansion to an extent tree.
1841 */ 3470 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3471 if (ocfs2_new_dir_wants_trailer(dir))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3472 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else 3473 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3474 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
1970 return status; 3599 return status;
1971} 3600}
1972 3601
3602static int dx_leaf_sort_cmp(const void *a, const void *b)
3603{
3604 const struct ocfs2_dx_entry *entry1 = a;
3605 const struct ocfs2_dx_entry *entry2 = b;
3606 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3607 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3608 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3609 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3610
3611 if (major_hash1 > major_hash2)
3612 return 1;
3613 if (major_hash1 < major_hash2)
3614 return -1;
3615
3616 /*
3617 * It is not strictly necessary to sort by minor
3618 */
3619 if (minor_hash1 > minor_hash2)
3620 return 1;
3621 if (minor_hash1 < minor_hash2)
3622 return -1;
3623 return 0;
3624}
3625
3626static void dx_leaf_sort_swap(void *a, void *b, int size)
3627{
3628 struct ocfs2_dx_entry *entry1 = a;
3629 struct ocfs2_dx_entry *entry2 = b;
3630 struct ocfs2_dx_entry tmp;
3631
3632 BUG_ON(size != sizeof(*entry1));
3633
3634 tmp = *entry1;
3635 *entry1 = *entry2;
3636 *entry2 = tmp;
3637}
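For comparison, the same ordering expressed for userspace: libc qsort() needs only the comparator, whereas the kernel's sort() also accepts the explicit swap callback defined above. A minimal sketch on host-endian stand-ins:

```c
#include <stdint.h>
#include <stdlib.h>

struct dx_entry_host {		/* host-endian stand-in for ocfs2_dx_entry */
	uint32_t major_hash;
	uint32_t minor_hash;
};

static int dx_cmp(const void *a, const void *b)
{
	const struct dx_entry_host *e1 = a, *e2 = b;

	if (e1->major_hash != e2->major_hash)
		return e1->major_hash < e2->major_hash ? -1 : 1;
	if (e1->minor_hash != e2->minor_hash)
		return e1->minor_hash < e2->minor_hash ? -1 : 1;
	return 0;
}

/* usage: qsort(entries, num_used, sizeof(entries[0]), dx_cmp); */
```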
3638
3639static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3640{
3641 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3642 int i, num = le16_to_cpu(dl_list->de_num_used);
3643
3644 for (i = 0; i < (num - 1); i++) {
3645 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3646 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3647 return 0;
3648 }
3649
3650 return 1;
3651}
3652
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
 3672 * There are a couple of rare but nasty corner cases we have to
 3673 * check for here. All of them involve a leaf where all values
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
 3713 * val cannot be the same as insert_hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
 3730 * we simply travel forward from the median and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
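The decision above reduces to a small pure function over the sorted major hashes. A userspace model with one worked case (values hypothetical): a leaf at cpos 5 whose entries all hash to 5, with an insert hash of 9, splits at 6 so the new entry lands in the new cluster.

```c
#include <assert.h>

/* Returns -1 for the no-space case, else picks *split such that
 * entries with hash >= *split move to the new cluster. */
static int find_split(const unsigned int *h, int n, unsigned int leaf_cpos,
		      unsigned int insert_hash, unsigned int *split)
{
	int i, allsame = 1;

	for (i = 0; i < n - 1; i++)
		if (h[i] != h[i + 1])
			allsame = 0;

	if (allsame) {
		if (h[0] == insert_hash)
			return -1;		/* splitting can't make room */
		if (h[0] == leaf_cpos) {
			*split = leaf_cpos + 1;	/* insert_hash must be larger */
			return 0;
		}
		if (h[0] > insert_hash) {
			*split = h[0];
			return 0;
		}
		*split = insert_hash;
		return 0;
	}

	for (i = n / 2; i < n; i++)	/* first value past the median */
		if (h[i] > leaf_cpos)
			break;
	assert(i < n);			/* impossible for sorted, mixed input */
	*split = h[i];
	return 0;
}

int main(void)
{
	unsigned int all5[4] = { 5, 5, 5, 5 }, split;

	assert(find_split(all5, 4, 5, 9, &split) == 0 && split == 6);
	return 0;
}
```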
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
 3748 * Since the block offset inside a cluster is a constant mask
 3749 * of minor_hash, we can optimize: an item at block offset X within
 3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
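The constant-offset claim in the comment above follows from how a name is mapped to a block within its cluster. A sketch of that mapping, assuming (as the real helper appears to) a power-of-two number of leaf blocks per cluster:

```c
/* With N leaf blocks per cluster, N a power of two, the block index
 * depends only on the minor hash, never on which cluster holds the
 * leaf -- so an entry keeps its offset when it moves clusters. */
static inline unsigned int dx_dir_hash_idx(unsigned int blocks_per_cluster,
					   unsigned int minor_hash)
{
	return minor_hash & (blocks_per_cluster - 1);
}
```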
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
 3845 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance non-full leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928 * won't happen if there's an error before that. Once the
3929 * insert is done then, we can transfer from one leaf into the
3930 * other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
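Structurally, the loop above is a singly linked list walk, threaded through the dirent-block trailers on disk, that also remembers the previous block so the caller can unlink the found block cheaply later. An in-memory sketch of the same shape:

```c
#include <stddef.h>

struct free_blk {			/* stand-in for a block trailer */
	unsigned int free_rec_len;	/* largest hole in the block */
	struct free_blk *next;
};

/* Returns the first block that can host rec_len bytes, or NULL
 * (the -ENOSPC case). *prev_ret is NULL when the head matched. */
static struct free_blk *find_space(struct free_blk *head, unsigned int rec_len,
				   struct free_blk **prev_ret)
{
	struct free_blk *prev = NULL, *n;

	for (n = head; n; prev = n, n = n->next)
		if (rec_len <= n->free_rec_len)
			break;

	*prev_ret = prev;
	return n;
}
```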
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166 * failure to add the dx_root_bh to the journal won't result
 4167 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real error
 4302 * and we cannot continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. Success returns zero, and enough context in the dir
 4353 * lookup result that ocfs2_add_entry() will be able to complete the task
4354 * with minimal performance impact.
4355 */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh)
+				 struct ocfs2_dir_lookup_result *lookup)
 {
 	int ret;
 	unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
 	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	*ret_de_bh = NULL;
-
 	if (!namelen) {
 		ret = -EINVAL;
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(bh);
 
 		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-				       &bh);
+				       lookup, &bh);
 		if (ret) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}
 
-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dx_root_bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+	struct inode *dx_alloc_inode = NULL;
+	struct buffer_head *dx_alloc_bh = NULL;
+	handle_t *handle;
+	u64 blk;
+	u16 bit;
+	u64 bg_blkno;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(dx_root->dr_suballoc_slot));
+	if (!dx_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&dx_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	di->i_dx_root = cpu_to_le64(0ULL);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	blk = le64_to_cpu(dx_root->dr_blkno);
+	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&dx_alloc_inode->i_mutex);
+	brelse(dx_alloc_bh);
+out:
+	iput(dx_alloc_inode);
+	return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+	int ret;
+	unsigned int uninitialized_var(clen);
+	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!ocfs2_dir_indexed(dir))
+		return 0;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (ocfs2_dx_root_inline(dx_root))
+		goto remove_index;
+
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+	/* XXX: What if dr_clusters is too large? */
+	while (le32_to_cpu(dx_root->dr_clusters)) {
+		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+					      major_hash, &cpos, &blkno, &clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+					       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos == 0)
+			break;
+
+		major_hash = cpos - 1;
+	}
+
+remove_index:
+	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	brelse(dx_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head	*dl_leaf_bh;	/* Unindexed leaf
+						 * block */
+	struct ocfs2_dir_entry	*dl_entry;	/* Target dirent in
+						 * unindexed leaf */
+
+	struct buffer_head	*dl_dx_root_bh;	/* Root of indexed
+						 * tree */
+
+	struct buffer_head	*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry	*dl_dx_entry;	/* Target dx_entry in
+						 * indexed leaf */
+	struct ocfs2_dx_hinfo	dl_hinfo;	/* Name hash results */
+
+	struct buffer_head	*dl_prev_leaf_bh;/* Previous entry in
+						  * dir free space
+						  * list. NULL if
+						  * previous entry is
+						  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
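
A caller-side sketch of the lookup-result API above (illustrative only, not part of the patch; the journal handle, allocation reservations, and error handling are elided, and blkno stands for the new child's inode block number):

	struct ocfs2_dir_lookup_result lookup = { NULL, };

	/* Reserve space in both trees; fills lookup with the target blocks. */
	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
					      dentry->d_name.name,
					      dentry->d_name.len, &lookup);
	if (status == 0)
		/* Consumes the context gathered above to write the dirent. */
		status = ocfs2_add_entry(handle, dentry, inode, blkno,
					 parent_fe_bh, &lookup);

	/* Drops the held buffer heads (dl_leaf_bh, dl_dx_root_bh, ...). */
	ocfs2_free_dir_lookup_result(&lookup);
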
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;
 
 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
 
 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 				    DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
 
 #endif /* DLMCOMMON_H */
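
With this change the master list entries are looked up the same way as lock resources: hash the name, pick a bucket, walk it. A sketch of that resolution (it mirrors dlm_find_mle() in dlmmaster.c below; dlm_mle_equal() and dlm_get_mle() are static there, so this is illustrative rather than a usable external API):

	struct dlm_master_list_entry *mle;
	struct hlist_node *node;
	unsigned int hash = dlm_lockid_hash(name, namelen);
	struct hlist_head *bucket = dlm_master_hash(dlm, hash);

	assert_spin_locked(&dlm->master_lock);
	hlist_for_each(node, bucket) {
		mle = hlist_entry(node, struct dlm_master_list_entry,
				  master_hash_node);
		if (dlm_mle_equal(dlm, mle, name, namelen)) {
			dlm_get_mle(mle);	/* pin before master_lock drops */
			break;
		}
	}
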
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";
 
-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt = 0;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
 
 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
 
 	spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 			     db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");
 
-	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/* Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/* Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/* Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d Locally: %d "
-			"Remotely: %d Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
 	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s Purge=%s PendingASTs=%s "
-			"PendingBASTs=%s Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
 	/* Purge Count: xxx Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
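
The counters read as current (cumulative-since-mount). With the format strings above, the new block of the debugfs state file looks like this (values are illustrative, not from a real run):

	Lock Resources: 32 (1024)
	MLEs: 2 (540)
	  Blocking: 1 (500)
	  Mastery: 1 (35)
	  Migration: 0 (5)
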
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);
 
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
 
 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 				const char *name,
 				unsigned int namelen)
 {
-	struct dlm_lock_resource *res;
-
 	if (dlm != mle->dlm)
 		return 0;
 
-	if (mle->type == DLM_MLE_BLOCK ||
-	    mle->type == DLM_MLE_MIGRATION) {
-		if (namelen != mle->u.name.len ||
-		    memcmp(name, mle->u.name.name, namelen)!=0)
-			return 0;
-	} else {
-		res = mle->u.res;
-		if (namelen != res->lockname.len ||
-		    memcmp(res->lockname.name, name, namelen) != 0)
-			return 0;
-	}
+	if (namelen != mle->mnamelen ||
+	    memcmp(name, mle->mname, namelen) != 0)
+		return 0;
+
 	return 1;
 }
 
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 
 	mle->dlm = dlm;
 	mle->type = type;
-	INIT_LIST_HEAD(&mle->list);
+	INIT_HLIST_NODE(&mle->master_hash_node);
 	INIT_LIST_HEAD(&mle->hb_events);
 	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
 	spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	mle->new_master = O2NM_MAX_NODES;
 	mle->inuse = 0;
 
+	BUG_ON(mle->type != DLM_MLE_BLOCK &&
+	       mle->type != DLM_MLE_MASTER &&
+	       mle->type != DLM_MLE_MIGRATION);
+
 	if (mle->type == DLM_MLE_MASTER) {
 		BUG_ON(!res);
-		mle->u.res = res;
-	} else if (mle->type == DLM_MLE_BLOCK) {
-		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
-	} else /* DLM_MLE_MIGRATION */ {
+		mle->mleres = res;
+		memcpy(mle->mname, res->lockname.name, res->lockname.len);
+		mle->mnamelen = res->lockname.len;
+		mle->mnamehash = res->lockname.hash;
+	} else {
 		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
+		mle->mleres = NULL;
+		memcpy(mle->mname, name, namelen);
+		mle->mnamelen = namelen;
+		mle->mnamehash = dlm_lockid_hash(name, namelen);
 	}
 
+	atomic_inc(&dlm->mle_tot_count[mle->type]);
+	atomic_inc(&dlm->mle_cur_count[mle->type]);
+
 	/* copy off the node_map and register hb callbacks on our copy */
 	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
 	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	__dlm_mle_attach_hb_events(dlm, mle);
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	if (!hlist_unhashed(&mle->master_hash_node))
+		hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	struct hlist_head *bucket;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	bucket = dlm_master_hash(dlm, mle->mnamehash);
+	hlist_add_head(&mle->master_hash_node, bucket);
+}
 
 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int hash;
 
 	assert_spin_locked(&dlm->master_lock);
 
-	list_for_each_entry(tmpmle, &dlm->master_list, list) {
+	hash = dlm_lockid_hash(name, namelen);
+	bucket = dlm_master_hash(dlm, hash);
+	hlist_for_each(list, bucket) {
+		tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+				     master_hash_node);
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.name.len, mle->u.name.name, mle->type);
-	} else {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.res->lockname.len,
-		     mle->u.res->lockname.name, mle->type);
-	}
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
 
+	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+	     mle->type);
+
 	/* remove from list if not already */
-	if (!list_empty(&mle->list))
-		list_del_init(&mle->list);
+	__dlm_unlink_mle(dlm, mle);
 
 	/* detach the mle from the domain node up/down events */
 	__dlm_mle_detach_hb_events(dlm, mle);
 
+	atomic_dec(&dlm->mle_cur_count[mle->type]);
+
 	/* NOTE: kfree under spinlock here.
 	 * if this is bad, we can move this to a freelist. */
 	kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
 	kmem_cache_destroy(dlm_lockres_cache);
 }
 
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *res,
-				  u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-
-	if (owner == dlm->node_num)
-		atomic_inc(&dlm->local_resources);
-	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_inc(&dlm->unknown_resources);
-	else
-		atomic_inc(&dlm->remote_resources);
-
-	res->owner = owner;
-}
-
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res, u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	if (owner == res->owner)
-		return;
-
-	if (res->owner == dlm->node_num)
-		atomic_dec(&dlm->local_resources);
-	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_dec(&dlm->unknown_resources);
-	else
-		atomic_dec(&dlm->remote_resources);
-
-	dlm_set_lockres_owner(dlm, res, owner);
-}
-
-
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
 	}
 	spin_unlock(&dlm->track_lock);
 
+	atomic_dec(&dlm->res_cur_count);
+
 	dlm_put(dlm);
 
 	if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	kref_init(&res->refs);
 
+	atomic_inc(&dlm->res_tot_count);
+	atomic_inc(&dlm->res_cur_count);
+
 	/* just for consistency */
 	spin_lock(&res->spinlock);
 	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
 		alloc_mle = NULL;
 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 		set_bit(dlm->node_num, mle->maybe_map);
-		list_add(&mle->list, &dlm->master_list);
+		__dlm_insert_mle(dlm, mle);
 
 		/* still holding the dlm spinlock, check the recovery map
 		 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 					  res->lockname.len,
 					  res->lockname.name);
 				mle->type = DLM_MLE_MASTER;
-				mle->u.res = res;
+				mle->mleres = res;
 			}
 		}
 	}
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
 
 	BUG_ON(mle->type == DLM_MLE_MIGRATION);
 
-	if (mle->type != DLM_MLE_MASTER) {
-		request.namelen = mle->u.name.len;
-		memcpy(request.name, mle->u.name.name, request.namelen);
-	} else {
-		request.namelen = mle->u.res->lockname.len;
-		memcpy(request.name, mle->u.res->lockname.name,
-		       request.namelen);
-	}
+	request.namelen = (u8)mle->mnamelen;
+	memcpy(request.name, mle->mname, request.namelen);
 
 again:
 	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
 			// "add the block.\n");
 			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 			set_bit(request->node_idx, mle->maybe_map);
-			list_add(&mle->list, &dlm->master_list);
+			__dlm_insert_mle(dlm, mle);
 			response = DLM_MASTER_RESP_NO;
 		} else {
 			// mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
 			     assert->node_idx, rr, extra_ref, mle->inuse);
 			dlm_print_one_mle(mle);
 		}
-		list_del_init(&mle->list);
+		__dlm_unlink_mle(dlm, mle);
 		__dlm_mle_detach_hb_events(dlm, mle);
 		__dlm_put_mle(mle);
 		if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 		tmp->master = master;
 		atomic_set(&tmp->woken, 1);
 		wake_up(&tmp->wq);
-		/* remove it from the list so that only one
-		 * mle will be found */
-		list_del_init(&tmp->list);
-		/* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+		/* remove it so that only one mle will be found */
+		__dlm_unlink_mle(dlm, tmp);
 		__dlm_mle_detach_hb_events(dlm, tmp);
 		ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
 		mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 	mle->master = master;
 	/* do this for consistency with other mle types */
 	set_bit(new_master, mle->maybe_map);
-	list_add(&mle->list, &dlm->master_list);
+	__dlm_insert_mle(dlm, mle);
 
 	return ret;
 }
 
-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
-{
-	struct dlm_master_list_entry *mle, *next;
-	struct dlm_lock_resource *res;
-	unsigned int hash;
-
-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
-	assert_spin_locked(&dlm->spinlock);
-
-	/* clean the master list */
-	spin_lock(&dlm->master_lock);
-	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
-		BUG_ON(mle->type != DLM_MLE_BLOCK &&
-		       mle->type != DLM_MLE_MASTER &&
-		       mle->type != DLM_MLE_MIGRATION);
-
-		/* MASTER mles are initiated locally.  the waiting
-		 * process will notice the node map change
-		 * shortly.  let that happen as normal. */
-		if (mle->type == DLM_MLE_MASTER)
-			continue;
-
-
-		/* BLOCK mles are initiated by other nodes.
-		 * need to clean up if the dead node would have
-		 * been the master. */
-		if (mle->type == DLM_MLE_BLOCK) {
-			int bit;
-
-			spin_lock(&mle->spinlock);
-			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
-			if (bit != dead_node) {
-				mlog(0, "mle found, but dead node %u would "
-				     "not have been master\n", dead_node);
-				spin_unlock(&mle->spinlock);
-			} else {
-				/* must drop the refcount by one since the
-				 * assert_master will never arrive.  this
-				 * may result in the mle being unlinked and
-				 * freed, but there may still be a process
-				 * waiting in the dlmlock path which is fine. */
-				mlog(0, "node %u was expected master\n",
-				     dead_node);
-				atomic_set(&mle->woken, 1);
-				spin_unlock(&mle->spinlock);
-				wake_up(&mle->wq);
-				/* do not need events any longer, so detach
-				 * from heartbeat */
-				__dlm_mle_detach_hb_events(dlm, mle);
-				__dlm_put_mle(mle);
-			}
-			continue;
-		}
-
-		/* everything else is a MIGRATION mle */
-
-		/* the rule for MIGRATION mles is that the master
-		 * becomes UNKNOWN if *either* the original or
-		 * the new master dies.  all UNKNOWN lockreses
-		 * are sent to whichever node becomes the recovery
-		 * master.  the new master is responsible for
-		 * determining if there is still a master for
-		 * this lockres, or if he needs to take over
-		 * mastery.  either way, this node should expect
-		 * another message to resolve this. */
-		if (mle->master != dead_node &&
-		    mle->new_master != dead_node)
-			continue;
-
-		/* if we have reached this point, this mle needs to
-		 * be removed from the list and freed. */
-
-		/* remove from the list early.  NOTE: unlinking
-		 * list_head while in list_for_each_safe */
-		__dlm_mle_detach_hb_events(dlm, mle);
-		spin_lock(&mle->spinlock);
-		list_del_init(&mle->list);
-		atomic_set(&mle->woken, 1);
-		spin_unlock(&mle->spinlock);
-		wake_up(&mle->wq);
-
-		mlog(0, "%s: node %u died during migration from "
-		     "%u to %u!\n", dlm->name, dead_node,
-		     mle->master, mle->new_master);
-		/* if there is a lockres associated with this
-		 * mle, find it and set its owner to UNKNOWN */
-		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-					   mle->u.name.len, hash);
-		if (res) {
-			/* unfortunately if we hit this rare case, our
-			 * lock ordering is messed.  we need to drop
-			 * the master lock so that we can take the
-			 * lockres lock, meaning that we will have to
-			 * restart from the head of list. */
-			spin_unlock(&dlm->master_lock);
-
-			/* move lockres onto recovery list */
-			spin_lock(&res->spinlock);
-			dlm_set_lockres_owner(dlm, res,
-					      DLM_LOCK_RES_OWNER_UNKNOWN);
-			dlm_move_lockres_to_recovery_list(dlm, res);
-			spin_unlock(&res->spinlock);
-			dlm_lockres_put(res);
-
-			/* about to get rid of mle, detach from heartbeat */
-			__dlm_mle_detach_hb_events(dlm, mle);
-
-			/* dump the mle */
-			spin_lock(&dlm->master_lock);
-			__dlm_put_mle(mle);
-			spin_unlock(&dlm->master_lock);
-
-			/* restart */
-			goto top;
-		}
-
-		/* this may be the last reference */
-		__dlm_put_mle(mle);
-	}
-	spin_unlock(&dlm->master_lock);
-}
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+					struct dlm_master_list_entry *mle)
+{
+	struct dlm_lock_resource *res;
+
+	/* Find the lockres associated to the mle and set its owner to UNK */
+	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+				   mle->mnamehash);
+	if (res) {
+		spin_unlock(&dlm->master_lock);
+
+		/* move lockres onto recovery list */
+		spin_lock(&res->spinlock);
+		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+		dlm_move_lockres_to_recovery_list(dlm, res);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);
+
+		/* about to get rid of mle, detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+
+		/* dump the mle */
+		spin_lock(&dlm->master_lock);
+		__dlm_put_mle(mle);
+		spin_unlock(&dlm->master_lock);
+	}
+
+	return res;
+}
+
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+				    struct dlm_master_list_entry *mle)
+{
+	__dlm_mle_detach_hb_events(dlm, mle);
+
+	spin_lock(&mle->spinlock);
+	__dlm_unlink_mle(dlm, mle);
+	atomic_set(&mle->woken, 1);
+	spin_unlock(&mle->spinlock);
+
+	wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle, u8 dead_node)
+{
+	int bit;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+	spin_lock(&mle->spinlock);
+	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+	if (bit != dead_node) {
+		mlog(0, "mle found, but dead node %u would not have been "
+		     "master\n", dead_node);
+		spin_unlock(&mle->spinlock);
+	} else {
+		/* Must drop the refcount by one since the assert_master will
+		 * never arrive. This may result in the mle being unlinked and
+		 * freed, but there may still be a process waiting in the
+		 * dlmlock path which is fine. */
+		mlog(0, "node %u was expected master\n", dead_node);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		/* Do not need events any longer, so detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+	}
+}
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int i;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+
+			BUG_ON(mle->type != DLM_MLE_BLOCK &&
+			       mle->type != DLM_MLE_MASTER &&
+			       mle->type != DLM_MLE_MIGRATION);
+
+			/* MASTER mles are initiated locally. The waiting
+			 * process will notice the node map change shortly.
+			 * Let that happen as normal. */
+			if (mle->type == DLM_MLE_MASTER)
+				continue;
+
+			/* BLOCK mles are initiated by other nodes. Need to
+			 * clean up if the dead node would have been the
+			 * master. */
+			if (mle->type == DLM_MLE_BLOCK) {
+				dlm_clean_block_mle(dlm, mle, dead_node);
+				continue;
+			}
+
+			/* Everything else is a MIGRATION mle */
+
+			/* The rule for MIGRATION mles is that the master
+			 * becomes UNKNOWN if *either* the original or the new
+			 * master dies. All UNKNOWN lockres' are sent to
+			 * whichever node becomes the recovery master. The new
+			 * master is responsible for determining if there is
+			 * still a master for this lockres, or if he needs to
+			 * take over mastery. Either way, this node should
+			 * expect another message to resolve this. */
+
+			if (mle->master != dead_node &&
+			    mle->new_master != dead_node)
+				continue;
+
+			/* If we have reached this point, this mle needs to be
+			 * removed from the list and freed. */
+			dlm_clean_migration_mle(dlm, mle);
+
+			mlog(0, "%s: node %u died during migration from "
+			     "%u to %u!\n", dlm->name, dead_node, mle->master,
+			     mle->new_master);
+
+			/* If we find a lockres associated with the mle, we've
+			 * hit this rare case that messes up our lock ordering.
+			 * If so, we need to drop the master lock so that we can
+			 * take the lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			res = dlm_reset_mleres_owner(dlm, mle);
+			if (res)
+				/* restart */
+				goto top;
+
+			/* This may be the last reference */
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+}
 
-
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 u8 old_master)
 {
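
Why the goto top: dlm_reset_mleres_owner() returns non-NULL exactly when it had to drop dlm->master_lock to take the lockres spinlock, which invalidates the in-progress hash walk. The control-flow skeleton of the refactored function above, condensed for reference (a sketch; filtering and bookkeeping elided):

	top:
		spin_lock(&dlm->master_lock);
		for (i = 0; i < DLM_HASH_BUCKETS; i++) {
			hlist_for_each(list, dlm_master_hash(dlm, i)) {
				/* ... skip MASTER mles, clean BLOCK mles ... */
				res = dlm_reset_mleres_owner(dlm, mle);
				if (res)
					goto top;	/* master_lock was dropped;
							 * reacquire and rescan */
				__dlm_put_mle(mle);
			}
		}
		spin_unlock(&dlm->master_lock);
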
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (!__dlm_lockres_unused(res)) {
-		spin_unlock(&res->spinlock);
 		mlog(0, "%s:%.*s: tried to purge but not unused\n",
 		     dlm->name, res->lockname.len, res->lockname.name);
-		return -ENOTEMPTY;
+		__dlm_print_one_lock_resource(res);
+		spin_unlock(&res->spinlock);
+		BUG();
 	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+		     "being remastered\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		/* Re-add the lockres to the end of the purge list */
+		if (!list_empty(&res->purge)) {
+			list_del_init(&res->purge);
+			list_add_tail(&res->purge, &dlm->purge_list);
+		}
+		spin_unlock(&res->spinlock);
+		return 0;
+	}
+
 	master = (res->owner == dlm->node_num);
+
 	if (!master)
 		res->state |= DLM_LOCK_RES_DROPPING_REF;
 	spin_unlock(&res->spinlock);
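
The two-step requeue above (list_del_init() followed by list_add_tail()) is the open-coded form of list_move_tail() from linux/list.h; an equivalent sketch under the same res->spinlock:

	/* Rotate this lockres to the tail so other candidates purge first. */
	if (!list_empty(&res->purge))
		list_move_tail(&res->purge, &dlm->purge_list);
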
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+	.flags		= 0,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
 			    &ocfs2_rename_lops, osb);
 }
 
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+					 struct ocfs2_super *osb)
+{
+	/* nfs_sync lockres doesn't come from a slab so we call init
+	 * once on it manually. */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+				   &ocfs2_nfs_sync_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+				    0, 0);
+	if (status < 0)
+		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+	return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres,
+				     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
 
 	osb->cconn = conn;
 
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
 
 	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 {
 	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }
 
 int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
 		       int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
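
The expected pairing of the two new calls, as ocfs2_get_dentry() uses them in the export.c hunk below (a sketch; ex = 1 requests the exclusive mode, which maps to LKM_EXMODE):

	status = ocfs2_nfs_sync_lock(osb, 1);
	if (status < 0)
		return status;	/* also -EROFS on hard-readonly mounts */

	/* ... inode-bit test and ocfs2_iget() happen under the lock ... */

	ocfs2_nfs_sync_unlock(osb, 1);
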
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"
 
 #include "buffer_head_io.h"
+#include "suballoc.h"
 
 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 				       struct ocfs2_inode_handle *handle)
 {
 	struct inode *inode;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 blkno = handle->ih_blkno;
+	int status, set;
 	struct dentry *result;
 
 	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
 
-	if (handle->ih_blkno == 0) {
-		mlog_errno(-ESTALE);
-		return ERR_PTR(-ESTALE);
+	if (blkno == 0) {
+		mlog(0, "nfs wants inode with blkno: 0\n");
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	inode = ocfs2_ilookup(sb, blkno);
+	/*
+	 * If the inode exists in memory, we only need to check it's
+	 * generation number
+	 */
+	if (inode)
+		goto check_gen;
+
+	/*
+	 * This will synchronize us against ocfs2_delete_inode() on
+	 * all nodes
+	 */
+	status = ocfs2_nfs_sync_lock(osb, 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		goto check_err;
+	}
+
+	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			/*
+			 * The blkno NFS gave us doesn't even show up
+			 * as an inode, we return -ESTALE to be
+			 * nice
+			 */
+			mlog(0, "test inode bit failed %d\n", status);
+			status = -ESTALE;
+		} else {
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
+		goto unlock_nfs_sync;
+	}
+
+	/* If the inode allocator bit is clear, this inode must be stale */
+	if (!set) {
+		mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+		status = -ESTALE;
+		goto unlock_nfs_sync;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+	inode = ocfs2_iget(osb, blkno, 0, 0);
 
-	if (IS_ERR(inode))
-		return (void *)inode;
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(osb, 1);
 
+check_err:
+	if (status < 0) {
+		if (status == -ESTALE) {
+			mlog(0, "stale inode ino: %llu generation: %u\n",
+			     blkno, handle->ih_generation);
+		}
+		result = ERR_PTR(status);
+		goto bail;
+	}
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		result = (void *)inode;
+		goto bail;
+	}
+
+check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		return ERR_PTR(-ESTALE);
+		mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+		     handle->ih_generation);
+		result = ERR_PTR(-ESTALE);
+		goto bail;
 	}
 
 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
 		result->d_op = &ocfs2_dentry_ops;
+	else
+		mlog_errno(PTR_ERR(result));
 
+bail:
 	mlog_exit_ptr(result);
 	return result;
 }
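
(Aside: the reworked ocfs2_get_dentry() above reduces to the control flow below. This is a condensed, hand-written sketch for orientation only, using only the helpers introduced by this patch; error handling, logging, and the ERR_PTR plumbing are elided.)

	inode = ocfs2_ilookup(sb, blkno);	/* fast path: inode already in memory */
	if (!inode) {
		ocfs2_nfs_sync_lock(osb, 1);	/* EX: fences ocfs2_delete_inode() on all nodes */
		status = ocfs2_test_inode_bit(osb, blkno, &set);
		if (status >= 0 && set)
			inode = ocfs2_iget(osb, blkno, 0, 0);	/* allocator bit set: safe to load */
		ocfs2_nfs_sync_unlock(osb, 1);
	}
	/* whether found in memory or freshly loaded, a generation
	 * mismatch against handle->ih_generation still yields -ESTALE */
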
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..8672b9536039 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		   out->f_path.dentry->d_name.len,
 		   out->f_path.dentry->d_name.name);
 
-	inode_double_lock(inode, pipe->inode);
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
 
 	ret = ocfs2_rw_lock(inode, 1);
 	if (ret < 0) {
@@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out_unlock;
 	}
 
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
 	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
 
 out_unlock:
 	ocfs2_rw_unlock(inode, 1);
 out:
-	inode_double_unlock(inode, pipe->inode);
+	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
 	return ret;
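
(Aside: the splice hunks above replace inode_double_lock(), which ordered the two i_mutexes by address, with an explicit parent/child hierarchy. A minimal sketch of the resulting lock order, assuming the pipe has a backing inode:)

	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);	/* destination file, always first */
	ocfs2_rw_lock(inode, 1);				/* cluster lock in between */
	mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);	/* pipe, strictly second */
	/* ... write ... */
	mutex_unlock(&pipe->inode->i_mutex);
	ocfs2_rw_unlock(inode, 1);
	mutex_unlock(&inode->i_mutex);
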
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dir.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
 		oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
 
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+	struct ocfs2_find_inode_args args;
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = 0;
+
+	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 			 int sysfile_type)
 {
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		    (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		    (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	ocfs2_set_inode_flags(inode);
 
+	OCFS2_I(inode)->ip_last_used_slot = 0;
+	OCFS2_I(inode)->ip_last_used_group = 0;
 	mlog_exit_void();
 }
 
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
-				   ocfs2_quota_trans_credits(inode->i_sb));
+					ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/* Remove any dir index tree */
+	if (S_ISDIR(inode->i_mode)) {
+		status = ocfs2_dx_dir_truncate(inode, di_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail_unlock_dir;
+		}
+	}
+
 	/*Free extended attribute resources associated with this inode.*/
 	status = ocfs2_xattr_remove(inode, di_bh);
 	if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	/*
+	 * Synchronize us against ocfs2_get_dentry. We take this in
+	 * shared mode so that all nodes can still concurrently
+	 * process deletes.
+	 */
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
 	 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
 		if (status != -ENOENT)
 			mlog_errno(status);
 		ocfs2_cleanup_delete_inode(inode, 0);
-		goto bail_unblock;
+		goto bail_unlock_nfs_sync;
 	}
 
 	/* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
+
+bail_unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_uid = cpu_to_le32(inode->i_uid);
 	fe->i_gid = cpu_to_le32(inode->i_gid);
 	fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
 
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
+
+	/* Only valid if the inode is the dir. */
+	u32				ip_last_used_slot;
+	u64				ip_last_used_group;
 };
 
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
 	return __ocfs2_wait_on_mount(osb, 1);
 }
 
-
-
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
- * It is protected by the recovery_lock.
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
  */
 
-struct ocfs2_recovery_map {
-	unsigned int rm_used;
-	unsigned int *rm_entries;
+enum ocfs2_replay_state {
+	REPLAY_UNNEEDED = 0,	/* Replay is not needed, so ignore this map */
+	REPLAY_NEEDED,		/* Replay slots marked in rm_replay_slots */
+	REPLAY_DONE		/* Replay was already queued */
 };
 
+struct ocfs2_replay_map {
+	unsigned int rm_slots;
+	enum ocfs2_replay_state rm_state;
+	unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+	if (!osb->replay_map)
+		return;
+
+	/* If we've already queued the replay, we don't have any more to do */
+	if (osb->replay_map->rm_state == REPLAY_DONE)
+		return;
+
+	osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map;
+	int i, node_num;
+
+	/* If replay map is already set, we don't do it again */
+	if (osb->replay_map)
+		return 0;
+
+	replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+			     (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+	if (!replay_map) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	spin_lock(&osb->osb_lock);
+
+	replay_map->rm_slots = osb->max_slots;
+	replay_map->rm_state = REPLAY_UNNEEDED;
+
+	/* set rm_replay_slots for offline slot(s) */
+	for (i = 0; i < replay_map->rm_slots; i++) {
+		if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+			replay_map->rm_replay_slots[i] = 1;
+	}
+
+	osb->replay_map = replay_map;
+	spin_unlock(&osb->osb_lock);
+	return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+	int i;
+
+	if (!replay_map)
+		return;
+
+	if (replay_map->rm_state != REPLAY_NEEDED)
+		return;
+
+	for (i = 0; i < replay_map->rm_slots; i++)
+		if (replay_map->rm_replay_slots[i])
+			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+							NULL, NULL);
+	replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+	if (!osb->replay_map)
+		return;
+
+	kfree(replay_map);
+	osb->replay_map = NULL;
+}
+
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
 	},
 };
 
+static struct ocfs2_triggers dr_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
 static int __ocfs2_journal_access(handle_t *handle,
 				  struct inode *inode,
 				  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 				    type);
 }
 
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
+				      type);
+}
+
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type)
 {
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 
 /* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for it's own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
 	struct ocfs2_journal *journal = osb->journal;
 
-	if (osb->dirty) {
-		/* No need to queue up our truncate_log as regular
-		 * cleanup will catch that. */
-		ocfs2_queue_recovery_completion(journal,
-						osb->slot_num,
-						osb->local_alloc_copy,
-						NULL,
-						NULL);
-		ocfs2_schedule_truncate_log_flush(osb, 0);
+	/* No need to queue up our truncate_log as regular cleanup will catch
+	 * that */
+	ocfs2_queue_recovery_completion(journal, osb->slot_num,
+					osb->local_alloc_copy, NULL, NULL);
+	ocfs2_schedule_truncate_log_flush(osb, 0);
 
-		osb->local_alloc_copy = NULL;
-		osb->dirty = 0;
-	}
+	osb->local_alloc_copy = NULL;
+	osb->dirty = 0;
+
+	/* queue to recover orphan slots for all offline slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+	ocfs2_queue_replay_slots(osb);
+	ocfs2_free_replay_slots(osb);
 }
 
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
 		goto bail;
 	}
 
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* queue recovery for our own slot */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL, NULL);
+
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
 		/* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
 
 	ocfs2_super_unlock(osb, 1);
 
-	/* We always run recovery on our own orphan dir - the dead
-	 * node(s) may have disallowd a previos inode delete. Re-processing
-	 * is therefore required. */
-	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num,
-					NULL, NULL);
+	/* queue recovery for offline slots */
+	ocfs2_queue_replay_slots(osb);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
 		goto restart;
 	}
 
+	ocfs2_free_replay_slots(osb);
 	osb->recovery_thread_task = NULL;
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
+	/* we need to run complete recovery for offline orphan slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
 	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
 	     node_num, slot_num,
 	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
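
(Aside: the replay map introduced above amounts to a small per-mount state machine over a recovery pass. Roughly, the calls in this patch drive it in the order sketched below; locking and error paths are elided.)

	ocfs2_compute_replay_slots(osb);	/* mark currently-offline slots; state = REPLAY_UNNEEDED */
	/* ... a dead node's journal actually gets replayed ... */
	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
	/* ... after ocfs2_super_unlock() ... */
	ocfs2_queue_replay_slots(osb);		/* queue recovery completion once per marked slot */
	ocfs2_free_replay_slots(osb);		/* state REPLAY_DONE prevents requeueing in between */
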
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
 
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	unsigned int rm_used;
+	unsigned int *rm_entries;
+};
+
+
 struct ocfs2_journal {
 	enum ocfs2_journal_state   j_state;    /* Journals current state   */
 
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
  * Journal Control:
  * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+ * bitmap block for the new bit) dx_root update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+	/* 1 block for index, 2 allocs (data, metadata), 1 clusters
+	 * worth of blocks for initial extent. */
+	return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+		ocfs2_clusters_to_blocks(sb, 1);
+}
 
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks + quota update */
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+				      int xattr_credits)
 {
-	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+	if (is_dir)
+		dir_credits += ocfs2_add_dir_index_credits(sb);
+
+	return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
 	       ocfs2_quota_trans_credits(sb);
 }
 
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
 	/* The quota update from ocfs2_link_credits is unused here... */
-	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
}
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
 				     + OCFS2_INODE_UPDATE_CREDITS \
 				     + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
 
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +	\
+				      OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+	int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+	credits += ocfs2_clusters_to_blocks(sb, 1);
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
 /*
 * Please note that the caller must make sure that root_el is the root
 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = ocfs2_mknod_credits(sb);
+	int blocks = ocfs2_mknod_credits(sb, 0, 0);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
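
(Aside: as a worked instance of the revised credit arithmetic above — OCFS2_SUBALLOC_ALLOC and the quota term are left symbolic here, since they are defined elsewhere in the tree:)

	/* non-directory, no xattrs:
	 *   ocfs2_mknod_credits(sb, 0, 0)
	 *     = 4 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS
	 *       + 0 + ocfs2_quota_trans_credits(sb)
	 *     = 4 + OCFS2_SUBALLOC_ALLOC + (1 + 2 + 1) + ocfs2_quota_trans_credits(sb)
	 *     = 8 + OCFS2_SUBALLOC_ALLOC + ocfs2_quota_trans_credits(sb)
	 */
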
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-#ifdef CONFIG_OCFS2_FS_STATS
-
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
-#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
-#define LA_DEBUG_VER	1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-				   size_t count, loff_t *ppos)
-{
-	static DEFINE_MUTEX(la_debug_mutex);
-	struct ocfs2_super *osb = file->private_data;
-	int written, ret;
-	char *buf = osb->local_alloc_debug_buf;
-
-	mutex_lock(&la_debug_mutex);
-	memset(buf, 0, LA_DEBUG_BUF_SZ);
-
-	written = snprintf(buf, LA_DEBUG_BUF_SZ,
-			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-			   LA_DEBUG_VER,
-			   (unsigned long long)osb->la_last_gd,
-			   osb->local_alloc_default_bits,
-			   osb->local_alloc_bits, osb->local_alloc_state);
-
-	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-
-	mutex_unlock(&la_debug_mutex);
-	return ret;
-}
-
-static const struct file_operations ocfs2_la_debug_fops = {
-	.open = ocfs2_la_debug_open,
-	.read = ocfs2_la_debug_read,
-};
-
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-	if (!osb->local_alloc_debug_buf)
-		return;
-
-	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-						     S_IFREG|S_IRUSR,
-						     osb->osb_debug_root,
-						     osb,
-						     &ocfs2_la_debug_fops);
-	if (!osb->local_alloc_debug) {
-		kfree(osb->local_alloc_debug_buf);
-		osb->local_alloc_debug_buf = NULL;
-	}
-}
-
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	if (osb->local_alloc_debug)
-		debugfs_remove(osb->local_alloc_debug);
-
-	if (osb->local_alloc_debug_buf)
-		kfree(osb->local_alloc_debug_buf);
-
-	osb->local_alloc_debug_buf = NULL;
-	osb->local_alloc_debug = NULL;
-}
-#else	/* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-#endif
-
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	ocfs2_init_la_debug(osb);
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -299,9 +218,6 @@ bail:
 	if (inode)
 		iput(inode);
 
-	if (status < 0)
-		ocfs2_shutdown_la_debug(osb);
-
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
-	ocfs2_shutdown_la_debug(osb);
-
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
-				    struct buffer_head **de_bh);
+				    struct ocfs2_dir_lookup_result *lookup);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
-			    struct buffer_head *de_bh,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *xattr_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	int want_clusters = 0;
+	int want_meta = 0;
 	int xattr_credits = 0;
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
 	int did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
 		return status;
 	}
 
-	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+	if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
 		status = -EMLINK;
 		goto leave;
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
 	/* get a spot inside the dir. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
-					      dentry->d_name.len, &de_bh);
+					      dentry->d_name.len, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
 
 	/* calculate meta data/clusters for setting security and acl xattr */
 	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
 				       &si, &want_clusters,
-				       &xattr_credits, &xattr_ac);
+				       &xattr_credits, &want_meta);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		want_clusters += 1;
 
+		/* Dir indexing requires extra space as well */
+		if (ocfs2_supports_indexed_dirs(osb))
+			want_meta++;
+	}
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
 	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
-				   xattr_credits);
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+							    S_ISDIR(mode),
+							    xattr_credits));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-					    new_fe_bh, data_ac);
+					    new_fe_bh, data_ac, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		le16_add_cpu(&dirfe->i_links_count, 1);
+		ocfs2_add_links_count(dirfe, 1);
 		status = ocfs2_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				xattr_ac, data_ac);
+				meta_ac, data_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (si.enable) {
 		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-						 xattr_ac, data_ac);
+						 meta_ac, data_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 de_bh);
+				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -423,11 +437,12 @@ leave:
 		mlog(0, "Disk is full\n");
 
 	brelse(new_fe_bh);
-	brelse(de_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
 	kfree(si.value);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -439,8 +454,8 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
-	if (xattr_ac)
-		ocfs2_free_alloc_context(xattr_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
 
 	mlog_exit(status);
 
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
+	u16 feat;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
 		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
-				       &fe_blkno);
+	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_bit, &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_dtime = 0;
 
 	/*
-	 * If supported, directories start with inline data.
+	 * If supported, directories start with inline data. If inline
+	 * isn't supported, but indexing is, we start them as indexed.
 	 */
+	feat = le16_to_cpu(fe->i_dyn_features);
 	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-		u16 feat = le16_to_cpu(fe->i_dyn_features);
-
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
 		fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	int err;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					   dentry->d_name.name,
-					   dentry->d_name.len, &de_bh);
+					   dentry->d_name.len, &lookup);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	}
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+	if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
 		err = -EMLINK;
 		goto out_unlock_inode;
 	}
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
-			      parent_fe_bh, de_bh);
+			      parent_fe_bh, &lookup);
 	if (err) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	brelse(de_bh);
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	mlog_exit(err);
 
 	return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL;
 	handle_t *handle = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
-	struct buffer_head *dirent_bh = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-	struct buffer_head *orphan_entry_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
-					  dentry->d_name.len, &blkno,
-					  dir, &dirent_bh, &dirent);
+					  dentry->d_name.len, &blkno, dir,
+					  &lookup);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
 	child_locked = 1;
 
 	if (S_ISDIR(inode->i_mode)) {
-		if (!ocfs2_empty_dir(inode)) {
-			status = -ENOTEMPTY;
-			goto leave;
-		} else if (inode->i_nlink != 2) {
+		if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		}
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-						  orphan_name,
-						  &orphan_entry_bh);
+						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-					  orphan_entry_bh, orphan_dir);
+					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	/* delete the name from the parent dir */
-	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(inode);
 	drop_nlink(inode);
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
@@ -916,9 +929,10 @@ leave:
 	}
 
 	brelse(fe_bh);
-	brelse(dirent_bh);
 	brelse(parent_node_bh);
-	brelse(orphan_entry_bh);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(status);
 
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
-	int status = 0, rename_lock = 0, parents_locked = 0;
-	int old_child_locked = 0, new_child_locked = 0;
+	int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+	int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-		*new_de = NULL;
-	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
 	struct ocfs2_dinode *old_di;
+	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct ocfs2_dir_lookup_result target_insert = { NULL, };
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	if (S_ISDIR(old_inode->i_mode)) {
 		u64 old_inode_parent;
 
+		update_dot_dot = 1;
 		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-						  old_inode, &old_inode_de_bh,
-						  &old_inode_dot_dot_de);
+						  old_inode,
+						  &old_inode_dot_dot_res);
 		if (status) {
 			status = -EIO;
 			goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 
 	if (!new_inode && new_dir != old_dir &&
-	    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+	    new_dir->i_nlink >= ocfs2_link_max(osb)) {
 		status = -EMLINK;
 		goto bail;
 	}
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * to delete it */
 	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len,
-					  &newfe_blkno, new_dir, &new_de_bh,
-					  &new_de);
+					  &newfe_blkno, new_dir,
+					  &target_lookup_res);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		mlog_errno(status);
 		goto bail;
 	}
+	if (status == 0)
+		target_exists = 1;
 
-	if (!new_de && new_inode) {
+	if (!target_exists && new_inode) {
 		/*
 		 * Target was unlinked by another node while we were
 		 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
-	if (new_de) {
+	if (target_exists) {
 		/* VFS didn't think there existed an inode here, but
 		 * someone else in the cluster must have raced our
 		 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n", new_de,
+		mlog(0, "aha rename over existing... new_blkno=%llu "
+		     "newfebh=%p bhblocknr=%llu\n",
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 							  new_inode,
 							  orphan_name,
-							  &orphan_entry_bh);
+							  &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
 						      new_dentry->d_name.name,
 						      new_dentry->d_name.len,
-						      &insert_entry_bh);
+						      &target_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		goto bail;
 	}
 
-	if (new_de) {
+	if (target_exists) {
 		if (S_ISDIR(new_inode->i_mode)) {
-			if (!ocfs2_empty_dir(new_inode) ||
-			    new_inode->i_nlink != 2) {
+			if (new_inode->i_nlink != 2 ||
+			    !ocfs2_empty_dir(new_inode)) {
 				status = -ENOTEMPTY;
 				goto bail;
 			}
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		if (S_ISDIR(new_inode->i_mode) ||
-		    (newfe->i_links_count == cpu_to_le16(1))){
+		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
 						  newfe, orphan_name,
-						  orphan_entry_bh, orphan_dir);
+						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		/* change the dirent to point to the correct inode */
-		status = ocfs2_update_entry(new_dir, handle, new_de_bh,
-					    new_de, old_inode);
+		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+					    old_inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_dir->i_version++;
 
 		if (S_ISDIR(new_inode->i_mode))
-			newfe->i_links_count = 0;
+			ocfs2_set_links_count(newfe, 0);
 		else
-			le16_add_cpu(&newfe->i_links_count, -1);
+			ocfs2_add_links_count(newfe, -1);
 
 		status = ocfs2_journal_dirty(handle, newfe_bh);
 		if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
 					 OCFS2_I(old_inode)->ip_blkno,
-					 new_dir_bh, insert_entry_bh);
+					 new_dir_bh, &target_insert);
 	}
 
 	old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * because the insert might have changed the type of directory
 	 * we're dealing with.
 	 */
-	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-				     old_dentry->d_name.len,
-				     old_dir, &old_de);
-	if (!old_de_bh) {
-		status = -EIO;
+	status = ocfs2_find_entry(old_dentry->d_name.name,
				  old_dentry->d_name.len, old_dir,
+				  &old_entry_lookup);
+	if (status)
 		goto bail;
-	}
 
-	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_inode->i_ctime = CURRENT_TIME;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-	if (old_inode_de_bh) {
-		status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
-					    old_inode_dot_dot_de, new_dir);
+
+	if (update_dot_dot) {
+		status = ocfs2_update_entry(old_inode, handle,
+					    &old_inode_dot_dot_res, new_dir);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	} else {
 		struct ocfs2_dinode *fe;
 		status = ocfs2_journal_access_di(handle, old_dir,
 						 old_dir_bh,
 						 OCFS2_JOURNAL_ACCESS_WRITE);
 		fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1397 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1398 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 status = ocfs2_journal_dirty(handle, old_dir_bh);
1399 } 1415 }
1400 } 1416 }
1401
1402 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1403 status = 0; 1418 status = 0;
1404bail: 1419bail:
@@ -1429,13 +1444,17 @@ bail:
1429 1444
1430 if (new_inode) 1445 if (new_inode)
1431 iput(new_inode); 1446 iput(new_inode);
1447
1448 ocfs2_free_dir_lookup_result(&target_lookup_res);
1449 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1450 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1451 ocfs2_free_dir_lookup_result(&orphan_insert);
1452 ocfs2_free_dir_lookup_result(&target_insert);
1453
1432 brelse(newfe_bh); 1454 brelse(newfe_bh);
1433 brelse(old_inode_bh); 1455 brelse(old_inode_bh);
1434 brelse(old_dir_bh); 1456 brelse(old_dir_bh);
1435 brelse(new_dir_bh); 1457 brelse(new_dir_bh);
1436 brelse(new_de_bh);
1437 brelse(old_de_bh);
1438 brelse(old_inode_de_bh);
1439 brelse(orphan_entry_bh); 1458 brelse(orphan_entry_bh);
1440 brelse(insert_entry_bh); 1459 brelse(insert_entry_bh);
1441 1460
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
1558 struct inode *inode = NULL; 1577 struct inode *inode = NULL;
1559 struct super_block *sb; 1578 struct super_block *sb;
1560 struct buffer_head *new_fe_bh = NULL; 1579 struct buffer_head *new_fe_bh = NULL;
1561 struct buffer_head *de_bh = NULL;
1562 struct buffer_head *parent_fe_bh = NULL; 1580 struct buffer_head *parent_fe_bh = NULL;
1563 struct ocfs2_dinode *fe = NULL; 1581 struct ocfs2_dinode *fe = NULL;
1564 struct ocfs2_dinode *dirfe; 1582 struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
1572 .enable = 1, 1590 .enable = 1,
1573 }; 1591 };
1574 int did_quota = 0, did_quota_inode = 0; 1592 int did_quota = 0, did_quota_inode = 0;
1593 struct ocfs2_dir_lookup_result lookup = { NULL, };
1575 1594
1576 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1595 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1577 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1596 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
1592 } 1611 }
1593 1612
1594 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1613 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1595 if (!dirfe->i_links_count) { 1614 if (!ocfs2_read_links_count(dirfe)) {
1596 /* can't make a file in a deleted directory. */ 1615 /* can't make a file in a deleted directory. */
1597 status = -ENOENT; 1616 status = -ENOENT;
1598 goto bail; 1617 goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1624
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1625 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1626 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1627 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1628 if (status < 0) {
1610 mlog_errno(status); 1629 mlog_errno(status);
1611 goto bail; 1630 goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1763
1745 status = ocfs2_add_entry(handle, dentry, inode, 1764 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1765 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1766 &lookup);
1748 if (status < 0) { 1767 if (status < 0) {
1749 mlog_errno(status); 1768 mlog_errno(status);
1750 goto bail; 1769 goto bail;
@@ -1772,9 +1791,9 @@ bail:
1772 1791
1773 brelse(new_fe_bh); 1792 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1793 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1794 kfree(si.name);
1777 kfree(si.value); 1795 kfree(si.value);
1796 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1797 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1798 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1799 if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1845 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1846 struct inode *inode,
1828 char *name, 1847 char *name,
1829 struct buffer_head **de_bh) 1848 struct ocfs2_dir_lookup_result *lookup)
1830{ 1849{
1831 struct inode *orphan_dir_inode; 1850 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1851 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1876
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1877 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1878 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1879 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1880 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1881 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1882
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1903 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1904 struct ocfs2_dinode *fe,
1886 char *name, 1905 char *name,
1887 struct buffer_head *de_bh, 1906 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1907 struct inode *orphan_dir_inode)
1889{ 1908{
1890 struct buffer_head *orphan_dir_bh = NULL; 1909 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1929 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1930 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1931 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1932 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1933 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1934
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1935 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1936 if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1941 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1942 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1943 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1944 orphan_dir_bh, lookup);
1926 if (status < 0) { 1945 if (status < 0) {
1927 mlog_errno(status); 1946 mlog_errno(status);
1928 goto leave; 1947 goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1974 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1975 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1976 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1977 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1978
1961 mlog_entry_void(); 1979 mlog_entry_void();
1962 1980
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1989 OCFS2_ORPHAN_NAMELEN);
1972 1990
1973	/* find its spot in the orphan directory */ 1991	/* find its spot in the orphan directory */
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1992 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1993 &lookup);
1976 if (!target_de_bh) { 1994 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1995 mlog_errno(status);
1979 goto leave; 1996 goto leave;
1980 } 1997 }
1981 1998
1982 /* remove it from the orphan directory */ 1999 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2000 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 2001 if (status < 0) {
1986 mlog_errno(status); 2002 mlog_errno(status);
1987 goto leave; 2003 goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2013 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2014 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2015 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2016 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2017 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2018
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2019 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2020 if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2023 }
2008 2024
2009leave: 2025leave:
2010 brelse(target_de_bh); 2026 ocfs2_free_dir_lookup_result(&lookup);
2011 2027
2012 mlog_exit(status); 2028 mlog_exit(status);
2013 return status; 2029 return status;
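
The namei.c hunks above all follow one pattern: the old buffer_head/ocfs2_dir_entry pairs are folded into struct ocfs2_dir_lookup_result, which is zero-initialized on the stack, filled by ocfs2_find_entry() or ocfs2_prepare_dir_for_insert(), handed to ocfs2_update_entry()/ocfs2_delete_entry()/ocfs2_add_entry(), and released with ocfs2_free_dir_lookup_result(). A minimal sketch of that lifecycle, using only the ocfs2_* calls shown in the hunks (the wrapper name is hypothetical):

	static int example_remove_name(handle_t *handle, struct inode *dir,
				       const char *name, int namelen)
	{
		struct ocfs2_dir_lookup_result lookup = { NULL, };
		int status;

		/* ocfs2_find_entry() now returns a status and fills the
		 * lookup result instead of handing back a buffer_head. */
		status = ocfs2_find_entry(name, namelen, dir, &lookup);
		if (status)
			goto out;

		/* Consumers take the result in place of a bh/dirent pair. */
		status = ocfs2_delete_entry(handle, dir, &lookup);
	out:
		/* Safe even when the lookup never succeeded. */
		ocfs2_free_dir_lookup_result(&lookup);
		return status;
	}
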
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
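
The new helpers in ocfs2.h split a 32-bit link count across the existing 16-bit i_links_count (low word) and the reclaimed i_links_count_hi (high word, OCFS2_LINKS_HI_SHIFT being 16). An illustrative round trip, assuming OCFS2_INDEXED_DIR_FL is set on the inode (not real kernel code; a dinode would never sit on the stack like this):

	struct ocfs2_dinode di = {
		.i_dyn_features = cpu_to_le16(OCFS2_INDEXED_DIR_FL),
	};

	ocfs2_set_links_count(&di, 70000);
	/* i_links_count    = cpu_to_le16(70000 & 0xffff) -> 4464
	 * i_links_count_hi = cpu_to_le16(70000 >> 16)    -> 1    */
	WARN_ON(ocfs2_read_links_count(&di) != 70000);
	/* Without OCFS2_INDEXED_DIR_FL, only the low word (4464) is read
	 * back, preserving the old 16-bit on-disk semantics. */
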
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
 157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
800 * We also store name_len here so as to reduce the number of leaf blocks we
801 * need to search in case of collisions.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
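
For a sense of the index's fan-out, ocfs2_dx_entries_per_leaf() works out as follows on a 4K block; the field sizes below are back-of-the-envelope assumptions (including an 8-byte struct ocfs2_block_check), not values taken from this diff:

	/*
	 * offsetof(struct ocfs2_dx_leaf, dl_list.de_entries)
	 *   = 8 (dl_signature) + 8 (dl_check) + 8 (dl_blkno)
	 *   + 4 (dl_fs_generation) + 4 (dl_reserved0) + 8 (dl_reserved1)
	 *   + 8 (dl_list header: de_reserved + de_count + de_num_used)
	 *   = 48 bytes
	 *
	 * sizeof(struct ocfs2_dx_entry) = 4 + 4 + 8 = 16 bytes
	 *
	 * entries per leaf = (4096 - 48) / 16 = 253
	 */
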
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
1662 * If the parent dir has recorded the last group used in allocation,
 1663	 * cool, use it. Otherwise, if we are allocating the new inode from
 1664	 * the same slot the parent dir belongs to, use the same chunk.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", blkno);
2201
2202 /* dirty read disk */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
2206 goto bail;
2207 }
2208
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
2212 status = -EINVAL;
2213 goto bail;
2214 }
2215
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL;
2221 goto bail;
2222 }
2223
2224 if (suballoc_slot)
2225 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2226 if (suballoc_bit)
2227 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2228
2229bail:
2230 brelse(inode_bh);
2231
2232 mlog_exit(status);
2233 return status;
2234}
2235
2236/*
 2237	 * Test whether the bit is SET in the allocator bitmap. On success, 0
 2238	 * is returned and *res is 1 for SET, 0 otherwise. On failure, a negative
 2239	 * errno is returned and *res is meaningless. Call this after you have
 2240	 * taken the cluster lock on the suballocator, or you may get a result
 2241	 * based on non-up2date contents.
2242 */
2243static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2244 struct inode *suballoc,
2245 struct buffer_head *alloc_bh, u64 blkno,
2246 u16 bit, int *res)
2247{
2248 struct ocfs2_dinode *alloc_fe;
2249 struct ocfs2_group_desc *group;
2250 struct buffer_head *group_bh = NULL;
2251 u64 bg_blkno;
2252 int status;
2253
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
2255
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2258 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2259 (unsigned int)bit,
2260 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2261 status = -EINVAL;
2262 goto bail;
2263 }
2264
2265 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh);
2268 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
2270 goto bail;
2271 }
2272
2273 group = (struct ocfs2_group_desc *) group_bh->b_data;
2274 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2275
2276bail:
2277 brelse(group_bh);
2278
2279 mlog_exit(status);
2280 return status;
2281}
2282
2283/*
2284 * Test if the bit representing this inode (blkno) is set in the
2285 * suballocator.
2286 *
2287 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2288 *
2289 * In the event of failure, a negative value is returned and *res is
2290 * meaningless.
2291 *
2292 * Callers must make sure to hold nfs_sync_lock to prevent
2293 * ocfs2_delete_inode() on another node from accessing the same
2294 * suballocator concurrently.
2295 */
2296int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2297{
2298 int status;
2299 u16 suballoc_bit = 0, suballoc_slot = 0;
2300 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL;
2302
2303 mlog_entry("blkno: %llu", blkno);
2304
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit);
2307 if (status < 0) {
2308 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2309 goto bail;
2310 }
2311
2312 inode_alloc_inode =
2313 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2314 suballoc_slot);
2315 if (!inode_alloc_inode) {
2316 /* the error code could be inaccurate, but we are not able to
2317 * get the correct one. */
2318 status = -EINVAL;
2319 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2320 (u32)suballoc_slot);
2321 goto bail;
2322 }
2323
2324 mutex_lock(&inode_alloc_inode->i_mutex);
2325 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2326 if (status < 0) {
2327 mutex_unlock(&inode_alloc_inode->i_mutex);
2328 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2329 (u32)suballoc_slot, status);
2330 goto bail;
2331 }
2332
2333 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2334 blkno, suballoc_bit, res);
2335 if (status < 0)
2336 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2337
2338 ocfs2_inode_unlock(inode_alloc_inode, 0);
2339 mutex_unlock(&inode_alloc_inode->i_mutex);
2340
2341 iput(inode_alloc_inode);
2342 brelse(alloc_bh);
2343bail:
2344 mlog_exit(status);
2345 return status;
2346}
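
ocfs2_test_inode_bit() is exported above so code outside suballoc.c can check whether a block number still refers to a live inode; per its comment, callers hold the new NFS sync lock around the call. A hypothetical caller sketch (the wrapper and the -ESTALE policy are illustrative, not part of this diff):

	static int example_validate_inode(struct ocfs2_super *osb, u64 blkno)
	{
		int set = 0;
		int status;

		/* Caller is assumed to hold the NFS sync lock here. */
		status = ocfs2_test_inode_bit(osb, blkno, &set);
		if (status < 0)
			return status;

		/* status == 0 with set == 0 means the suballocator bit is
		 * clear, i.e. the inode behind blkno has been freed. */
		return set ? 0 : -ESTALE;
	}
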
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
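
With this change ocfs2_calc_xattr_init() no longer reserves metadata itself; it bumps a caller-supplied want_meta counter so the caller can make one ocfs2_reserve_new_metadata_blocks() call covering all of its needs. A sketch of the new calling convention, using only the two ocfs2_* calls shown in this patch (the wrapper is hypothetical):

	static int example_reserve_xattr_meta(struct ocfs2_super *osb,
					      struct inode *dir,
					      struct buffer_head *dir_bh,
					      int mode,
					      struct ocfs2_security_xattr_info *si,
					      struct ocfs2_alloc_context **meta_ac)
	{
		int want_clusters = 0, want_meta = 0, xattr_credits = 0;
		int status;

		/* The helper now only *counts* the metadata blocks needed. */
		status = ocfs2_calc_xattr_init(dir, dir_bh, mode, si,
					       &want_clusters, &xattr_credits,
					       &want_meta);
		if (status < 0)
			return status;

		/* One reservation then covers everything counted above. */
		if (want_meta)
			status = ocfs2_reserve_new_metadata_blocks(osb,
								   want_meta,
								   meta_ac);
		return status;
	}
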
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
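
The f_fsid lines added here (and in the qnx4 hunk further down) follow one pattern: encode the backing device number as a 64-bit value and split it across the two 32-bit halves of f_fsid. A standalone sketch of the split, with a made-up value standing in for what huge_encode_dev() would return:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint64_t id = 0x123456789abcdef0ULL; /* pretend encoded dev number */
                uint32_t val0 = (uint32_t)id;        /* low half  -> f_fsid.val[0] */
                uint32_t val1 = (uint32_t)(id >> 32);/* high half -> f_fsid.val[1] */

                printf("f_fsid = { %#x, %#x }\n", val0, val1);
                return 0;
        }
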
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 38e337d51ced..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
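
The hunk relies on a common sysfs idiom: a NULL-terminated array of group pointers where optional entries are compiled in only under the matching config symbol, so the code that walks the array needs no #ifdefs of its own. A reduced standalone sketch of that shape, with strings standing in for the attribute groups:

        #include <stdio.h>

        #define CONFIG_BLK_DEV_IO_TRACE 1       /* pretend the option is enabled */

        static const char *part_attr_groups[] = {
                "part",
        #ifdef CONFIG_BLK_DEV_IO_TRACE
                "blktrace",
        #endif
                NULL,                           /* terminator the walker stops at */
        };

        int main(void)
        {
                for (const char **g = part_attr_groups; *g; g++)
                        printf("registering group: %s\n", *g);
                return 0;
        }
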
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..39e4ad4f59f4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
204 struct file *file = vma->vm_file; 204 struct file *file = vma->vm_file;
205 int flags = vma->vm_flags; 205 int flags = vma->vm_flags;
206 unsigned long ino = 0; 206 unsigned long ino = 0;
207 unsigned long long pgoff = 0;
207 dev_t dev = 0; 208 dev_t dev = 0;
208 int len; 209 int len;
209 210
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 212 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
212 dev = inode->i_sb->s_dev; 213 dev = inode->i_sb->s_dev;
213 ino = inode->i_ino; 214 ino = inode->i_ino;
215 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
214 } 216 }
215 217
216 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 218 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 flags & VM_WRITE ? 'w' : '-', 222 flags & VM_WRITE ? 'w' : '-',
221 flags & VM_EXEC ? 'x' : '-', 223 flags & VM_EXEC ? 'x' : '-',
222 flags & VM_MAYSHARE ? 's' : 'p', 224 flags & VM_MAYSHARE ? 's' : 'p',
223 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 225 pgoff,
224 MAJOR(dev), MINOR(dev), ino, &len); 226 MAJOR(dev), MINOR(dev), ino, &len);
225 227
226 /* 228 /*
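
The point of the new pgoff variable: vm_pgoff is a page count held in an unsigned long, so it must be widened before the shift, and it should read as 0 for anonymous mappings rather than whatever the field happens to hold. A standalone sketch of the widening:

        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long vm_pgoff = 0x100000;  /* page offset into the file */
                unsigned long long pgoff = 0;       /* default for anonymous vmas */
                int file_backed = 1;

                if (file_backed)            /* only meaningful when there is a file */
                        pgoff = (unsigned long long)vm_pgoff << PAGE_SHIFT;

                /* on 32-bit, vm_pgoff << 12 without the cast would truncate */
                printf("%08llx\n", pgoff);
                return 0;
        }
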
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..12c20377772d 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -125,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
125 struct file *file; 126 struct file *file;
126 dev_t dev = 0; 127 dev_t dev = 0;
127 int flags, len; 128 int flags, len;
129 unsigned long long pgoff = 0;
128 130
129 flags = vma->vm_flags; 131 flags = vma->vm_flags;
130 file = vma->vm_file; 132 file = vma->vm_file;
@@ -133,17 +135,18 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev; 136 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino; 137 ino = inode->i_ino;
 138                pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
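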
136 } 139 }
137 140
138 seq_printf(m, 141 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 142 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 143 vma->vm_start,
141 vma->vm_end, 144 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 145 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 146 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 147 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 148 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 149 pgoff,
147 MAJOR(dev), MINOR(dev), ino, &len); 150 MAJOR(dev), MINOR(dev), ino, &len);
148 151
149 if (file) { 152 if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef77..607c579e5eca 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
823 823
824 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 827 continue;
828 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
829 continue; 829 continue;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a404fb88e456..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
221 save_mount_options(sb, data); 221 save_mount_options(sb, data);
222 222
223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); 223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
224 sb->s_fs_info = fsi;
224 if (!fsi) { 225 if (!fsi) {
225 err = -ENOMEM; 226 err = -ENOMEM;
226 goto fail; 227 goto fail;
227 } 228 }
228 sb->s_fs_info = fsi;
229 229
230 err = ramfs_parse_options(data, &fsi->mount_opts); 230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err) 231 if (err)
232 goto fail; 232 goto fail;
233 233
234 sb->s_maxbytes = MAX_LFS_FILESIZE; 234 sb->s_maxbytes = MAX_LFS_FILESIZE;
235 sb->s_blocksize = PAGE_CACHE_SIZE; 235 sb->s_blocksize = PAGE_CACHE_SIZE;
236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
237 sb->s_magic = RAMFS_MAGIC; 237 sb->s_magic = RAMFS_MAGIC;
238 sb->s_op = &ramfs_ops; 238 sb->s_op = &ramfs_ops;
239 sb->s_time_gran = 1; 239 sb->s_time_gran = 1;
240
240 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 241 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
241 if (!inode) { 242 if (!inode) {
242 err = -ENOMEM; 243 err = -ENOMEM;
@@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
244 } 245 }
245 246
246 root = d_alloc_root(inode); 247 root = d_alloc_root(inode);
248 sb->s_root = root;
247 if (!root) { 249 if (!root) {
248 err = -ENOMEM; 250 err = -ENOMEM;
249 goto fail; 251 goto fail;
250 } 252 }
251 sb->s_root = root; 253
252 return 0; 254 return 0;
253fail: 255fail:
254 kfree(fsi); 256 kfree(fsi);
257 sb->s_fs_info = NULL;
255 iput(inode); 258 iput(inode);
256 return err; 259 return err;
257} 260}
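
The reordering above is about teardown safety: s_fs_info is now published (or re-set to NULL) at every point where fsi's lifetime changes, so a later put_super/kill path can never see a pointer to freed memory. A minimal sketch of the pattern, with plain malloc/free standing in for the kernel allocators:

        #include <stdlib.h>

        struct sb { void *s_fs_info; };

        static int fill_super(struct sb *sb, int force_fail)
        {
                void *fsi = malloc(64);

                sb->s_fs_info = fsi;    /* publish even before the NULL check */
                if (!fsi)
                        return -1;      /* -ENOMEM in the kernel */
                if (force_fail)
                        goto fail;
                return 0;

        fail:
                free(fsi);
                sb->s_fs_info = NULL;   /* teardown must not see freed fsi */
                return -1;
        }

        int main(void)
        {
                struct sb sb;
                return fill_super(&sb, 0) ? 1 : 0;
        }
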
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
731 return ret; 731 return ret;
732} 732}
733 733
734static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
735{
736#define HALF_LONG_BITS (BITS_PER_LONG / 2)
737 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
738}
739
740SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
741 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
742{
743 loff_t pos = pos_from_hilo(pos_h, pos_l);
744 struct file *file;
745 ssize_t ret = -EBADF;
746 int fput_needed;
747
748 if (pos < 0)
749 return -EINVAL;
750
751 file = fget_light(fd, &fput_needed);
752 if (file) {
753 ret = -ESPIPE;
754 if (file->f_mode & FMODE_PREAD)
755 ret = vfs_readv(file, vec, vlen, &pos);
756 fput_light(file, fput_needed);
757 }
758
759 if (ret > 0)
760 add_rchar(current, ret);
761 inc_syscr(current);
762 return ret;
763}
764
765SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
766 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
767{
768 loff_t pos = pos_from_hilo(pos_h, pos_l);
769 struct file *file;
770 ssize_t ret = -EBADF;
771 int fput_needed;
772
773 if (pos < 0)
774 return -EINVAL;
775
776 file = fget_light(fd, &fput_needed);
777 if (file) {
778 ret = -ESPIPE;
779 if (file->f_mode & FMODE_PWRITE)
780 ret = vfs_writev(file, vec, vlen, &pos);
781 fput_light(file, fput_needed);
782 }
783
784 if (ret > 0)
785 add_wchar(current, ret);
786 inc_syscw(current);
787 return ret;
788}
789
734static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 790static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
735 size_t count, loff_t max) 791 size_t count, loff_t max)
736{ 792{
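
The odd-looking double shift in pos_from_hilo() is deliberate: shifting an unsigned long by BITS_PER_LONG would be undefined, and on 64-bit pos_l already carries the whole offset with pos_h being 0. A userspace-side sketch of the split and the reassembly (the syscall itself would normally be reached through the preadv() wrapper):

        #include <stdint.h>
        #include <stdio.h>

        #define BITS_PER_LONG  (8 * (int)sizeof(unsigned long))
        #define HALF_LONG_BITS (BITS_PER_LONG / 2)

        static int64_t pos_from_hilo(unsigned long high, unsigned long low)
        {
                /* two half-width shifts: defined even when high is full width */
                return (((int64_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
        }

        int main(void)
        {
                int64_t pos = 0x123456789aLL;   /* an offset above 4 GiB */
                unsigned long pos_l = (unsigned long)pos;
                unsigned long pos_h =
                        (unsigned long)((pos >> HALF_LONG_BITS) >> HALF_LONG_BITS);

                printf("rebuilt: %llx\n",
                       (unsigned long long)pos_from_hilo(pos_h, pos_l));
                return 0;
        }
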
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
1config REISERFS_FS 1config REISERFS_FS
2 tristate "Reiserfs support" 2 tristate "Reiserfs support"
3 select CRC32
3 help 4 help
4 Stores not just filenames but the files themselves in a balanced 5 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling. 6 tree. Uses journalling.
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c62896..0ae6486d9046 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1904 buf->f_bsize = dentry->d_sb->s_blocksize; 1905 buf->f_bsize = dentry->d_sb->s_blocksize;
1905 /* changed to accommodate gcc folks. */ 1906 /* changed to accommodate gcc folks. */
1906 buf->f_type = REISERFS_SUPER_MAGIC; 1907 buf->f_type = REISERFS_SUPER_MAGIC;
1908 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1909 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1910 sizeof(rs->s_uuid)/2);
1911
1907 return 0; 1912 return 0;
1908} 1913}
1909 1914
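
The new fsid is derived by running crc32_le() over each half of the 16-byte on-disk UUID, giving two stable 32-bit words. A standalone sketch with a minimal bitwise CRC; the kernel's lib/crc32 implementation is table-driven but computes the same little-endian polynomial:

        #include <stddef.h>
        #include <stdint.h>
        #include <stdio.h>

        /* bit-at-a-time equivalent of the kernel's crc32_le() */
        static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
        {
                while (len--) {
                        crc ^= *p++;
                        for (int i = 0; i < 8; i++)
                                crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
                }
                return crc;
        }

        int main(void)
        {
                uint8_t s_uuid[16] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4,
                                       5, 6, 7, 8, 9, 10, 11, 12 };
                uint32_t val0 = crc32_le(0, s_uuid, sizeof(s_uuid) / 2);
                uint32_t val1 = crc32_le(0, s_uuid + sizeof(s_uuid) / 2,
                                         sizeof(s_uuid) / 2);

                printf("f_fsid = { %#x, %#x }\n", val0, val1);
                return 0;
        }
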
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d1..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
428 } else { 428 } else {
429 apply_umask: 429 apply_umask:
430 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
431 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
432 } 432 }
433 433
434 return err; 434 return err;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
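
One invariant both the deleted code above and its replacement below rely on: the first 512 bytes of the image, read as big-endian 32-bit words, must sum to zero, because mkfs stores the negated sum in the superblock's checksum field. A standalone sketch of checking and fixing up a toy header:

        #include <stdint.h>
        #include <stdio.h>

        static uint32_t romfs_checksum(const void *data, int size)
        {
                const uint8_t *p = data;
                uint32_t sum = 0;

                for (size >>= 2; size > 0; size--, p += 4)
                        sum += (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
                return sum;
        }

        int main(void)
        {
                uint8_t img[16] = "-rom1fs-";   /* word0/word1 magic */
                img[11] = 16;                   /* big-endian size field = 16 */

                /* store the negated sum in the checksum field (bytes 12..15) */
                uint32_t fix = -romfs_checksum(img, 16);
                img[12] = fix >> 24;
                img[13] = fix >> 16;
                img[14] = fix >> 8;
                img[15] = fix;

                printf("checksum ok: %d\n", romfs_checksum(img, 16) == 0);
                return 0;
        }
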
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
 18 * - only supported for NOMMU at the moment (an MMU kernel doesn't copy
 19 *   private mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
 60 * permit an R/O mapping to be made directly onto an MTD device if
 61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
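
The size check in romfs_get_unmapped_area() is written to be overflow-proof: it never forms offset + len, which could wrap and slip past a naive comparison. A standalone demonstration of the difference:

        #include <stdio.h>

        int main(void)
        {
                unsigned long isize  = 4096;
                unsigned long offset = 0x1000;
                unsigned long len    = (unsigned long)-0x800; /* absurdly large */

                /* naive: offset + len wraps to 0x800 and the check passes */
                printf("naive check rejects: %d\n", offset + len > isize);

                /* wrap-free: bound each operand first, then subtract */
                int bad = offset > isize || len > isize || offset > isize - len;
                printf("safe check rejects:  %d\n", bad);
                return 0;
        }
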
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
 25 * read data from a romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[16];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time */
78 while (size > 0) {
79 segment = min_t(size_t, size, 16);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0)
82 return ret;
83 if (memcmp(buf, str, len) != 0)
84 return 0;
85 size -= len;
86 pos += len;
87 str += len;
88 }
89
90 return 1;
91}
92#endif /* CONFIG_ROMFS_ON_MTD */
93
94#ifdef CONFIG_ROMFS_ON_BLOCK
95/*
 96 * read data from a romfs image on a block device
97 */
98static int romfs_blk_read(struct super_block *sb, unsigned long pos,
99 void *buf, size_t buflen)
100{
101 struct buffer_head *bh;
102 unsigned long offset;
103 size_t segment;
104
105 /* copy the string up to blocksize bytes at a time */
106 while (buflen > 0) {
107 offset = pos & (ROMBSIZE - 1);
108 segment = min_t(size_t, buflen, ROMBSIZE - offset);
109 bh = sb_bread(sb, pos >> ROMBSBITS);
110 if (!bh)
111 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh);
114 buflen -= segment;
115 pos += segment;
116 }
117
118 return 0;
119}
120
121/*
122 * determine the length of a string in romfs on a block device
123 */
124static ssize_t romfs_blk_strnlen(struct super_block *sb,
125 unsigned long pos, size_t limit)
126{
127 struct buffer_head *bh;
128 unsigned long offset;
129 ssize_t n = 0;
130 size_t segment;
131 u_char *buf, *p;
132
133 /* scan the string up to blocksize bytes at a time */
134 while (limit > 0) {
135 offset = pos & (ROMBSIZE - 1);
136 segment = min_t(size_t, limit, ROMBSIZE - offset);
137 bh = sb_bread(sb, pos >> ROMBSBITS);
138 if (!bh)
139 return -EIO;
140 buf = bh->b_data + offset;
141 p = memchr(buf, 0, segment);
142 brelse(bh);
143 if (p)
144 return n + (p - buf);
145 limit -= segment;
146 pos += segment;
147 n += segment;
148 }
149
150 return n;
151}
152
153/*
154 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error
156 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size)
159{
160 struct buffer_head *bh;
161 unsigned long offset;
162 size_t segment;
163 bool x;
164
165 /* scan the string up to 16 bytes at a time */
166 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh)
171 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0);
173 brelse(bh);
174 if (x)
175 return 0;
176 size -= segment;
177 pos += segment;
178 str += segment;
179 }
180
181 return 1;
182}
183#endif /* CONFIG_ROMFS_ON_BLOCK */
184
185/*
186 * read data from the romfs image
187 */
188int romfs_dev_read(struct super_block *sb, unsigned long pos,
189 void *buf, size_t buflen)
190{
191 size_t limit;
192
193 limit = romfs_maxsize(sb);
194 if (pos >= limit)
195 return -EIO;
196 if (buflen > limit - pos)
197 buflen = limit - pos;
198
199#ifdef CONFIG_ROMFS_ON_MTD
200 if (sb->s_mtd)
201 return romfs_mtd_read(sb, pos, buf, buflen);
202#endif
203#ifdef CONFIG_ROMFS_ON_BLOCK
204 if (sb->s_bdev)
205 return romfs_blk_read(sb, pos, buf, buflen);
206#endif
207 return -EIO;
208}
209
210/*
211 * determine the length of a string in romfs
212 */
213ssize_t romfs_dev_strnlen(struct super_block *sb,
214 unsigned long pos, size_t maxlen)
215{
216 size_t limit;
217
218 limit = romfs_maxsize(sb);
219 if (pos >= limit)
220 return -EIO;
221 if (maxlen > limit - pos)
222 maxlen = limit - pos;
223
224#ifdef CONFIG_ROMFS_ON_MTD
225 if (sb->s_mtd)
226                return romfs_mtd_strnlen(sb, pos, maxlen);
227#endif
228#ifdef CONFIG_ROMFS_ON_BLOCK
229 if (sb->s_bdev)
230                return romfs_blk_strnlen(sb, pos, maxlen);
231#endif
232 return -EIO;
233}
234
235/*
236 * compare a string to one in romfs
237 * - return 1 if matched, 0 if differ, -ve if error
238 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size)
241{
242 size_t limit;
243
244 limit = romfs_maxsize(sb);
245 if (pos >= limit)
246 return -EIO;
247 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG;
249 if (size > limit - pos)
250 return -EIO;
251
252#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size);
255#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size);
259#endif
260 return -EIO;
261}
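
The block-device helpers above all share one piece of arithmetic: an arbitrary (pos, length) request is cut into segments that never cross a ROMBSIZE block boundary, so each iteration touches exactly one buffer_head. A standalone sketch that prints the segments instead of calling sb_bread():

        #include <stdio.h>

        #define ROMBSIZE 1024UL

        int main(void)
        {
                unsigned long pos = 1000, buflen = 2000;

                while (buflen > 0) {
                        unsigned long offset  = pos & (ROMBSIZE - 1);
                        unsigned long segment = buflen < ROMBSIZE - offset ?
                                                buflen : ROMBSIZE - offset;

                        printf("block %lu: offset %lu, %lu bytes\n",
                               pos / ROMBSIZE, offset, segment);
                        buflen -= segment;
                        pos += segment;
                }
                return 0;
        }
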
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..10ca7d984a8b
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,653 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
270 * (Although as I see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301        unsigned nextfh;
302        int ret;
302 umode_t mode;
303
304 /* we might have to traverse a chain of "hard link" file entries to get
305 * to the actual file */
306 for (;;) {
307 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
308 if (ret < 0)
309 goto error;
310
311 /* XXX: do romfs_checksum here too (with name) */
312
313 nextfh = be32_to_cpu(ri.next);
314 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
315 break;
316
317 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
318 }
319
320 /* determine the length of the filename */
321 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
322 if (IS_ERR_VALUE(nlen))
323 goto eio;
324
325 /* get an inode for this image position */
326 i = iget_locked(sb, pos);
327 if (!i)
328 return ERR_PTR(-ENOMEM);
329
330 if (!(i->i_state & I_NEW))
331 return i;
332
333 /* precalculate the data offset */
334 inode = ROMFS_I(i);
335 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
336 inode->i_dataoffset = pos + inode->i_metasize;
337
338 i->i_nlink = 1; /* Hard to decide.. */
339 i->i_size = be32_to_cpu(ri.size);
340 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
341 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
342
343 /* set up mode and ops */
344 mode = romfs_modemap[nextfh & ROMFH_TYPE];
345
346 switch (nextfh & ROMFH_TYPE) {
347 case ROMFH_DIR:
348 i->i_size = ROMFS_I(i)->i_metasize;
349 i->i_op = &romfs_dir_inode_operations;
350 i->i_fop = &romfs_dir_operations;
351 if (nextfh & ROMFH_EXEC)
352 mode |= S_IXUGO;
353 break;
354 case ROMFH_REG:
355 i->i_fop = &romfs_ro_fops;
356 i->i_data.a_ops = &romfs_aops;
357 if (i->i_sb->s_mtd)
358 i->i_data.backing_dev_info =
359 i->i_sb->s_mtd->backing_dev_info;
360 if (nextfh & ROMFH_EXEC)
361 mode |= S_IXUGO;
362 break;
363 case ROMFH_SYM:
364 i->i_op = &page_symlink_inode_operations;
365 i->i_data.a_ops = &romfs_aops;
366 mode |= S_IRWXUGO;
367 break;
368 default:
369 /* depending on MBZ for sock/fifos */
370 nextfh = be32_to_cpu(ri.spec);
371 init_special_inode(i, mode, MKDEV(nextfh >> 16,
372 nextfh & 0xffff));
373 break;
374 }
375
376 i->i_mode = mode;
377
378 unlock_new_inode(i);
379 return i;
380
381eio:
382 ret = -EIO;
383error:
384 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
385 return ERR_PTR(ret);
386}
387
388/*
389 * allocate a new inode
390 */
391static struct inode *romfs_alloc_inode(struct super_block *sb)
392{
393 struct romfs_inode_info *inode;
394 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
395 return inode ? &inode->vfs_inode : NULL;
396}
397
398/*
399 * return a spent inode to the slab cache
400 */
401static void romfs_destroy_inode(struct inode *inode)
402{
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404}
405
406/*
407 * get filesystem statistics
408 */
409static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
410{
411 struct super_block *sb = dentry->d_sb;
412 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
413
414 buf->f_type = ROMFS_MAGIC;
415 buf->f_namelen = ROMFS_MAXFN;
416 buf->f_bsize = ROMBSIZE;
417	buf->f_bfree = buf->f_bavail = buf->f_ffree = 0;
418 buf->f_blocks =
419 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
420 buf->f_fsid.val[0] = (u32)id;
421 buf->f_fsid.val[1] = (u32)(id >> 32);
422 return 0;
423}
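The two f_fsid words are simply the 64-bit encoded device number split in half; the same packing appears in the squashfs and sysv hunks later in this diff. A quick user-space sketch of the round trip (the id value is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t id = 0x123456789abcdef0ULL;   /* stand-in for huge_encode_dev() */
            uint32_t val0 = (uint32_t)id;          /* f_fsid.val[0]: low 32 bits  */
            uint32_t val1 = (uint32_t)(id >> 32);  /* f_fsid.val[1]: high 32 bits */
            uint64_t back = ((uint64_t)val1 << 32) | val0;

            printf("%s\n", back == id ? "round-trips" : "broken");
            return 0;
    }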
424
425/*
426 * remounting must involve read-only
427 */
428static int romfs_remount(struct super_block *sb, int *flags, char *data)
429{
430 *flags |= MS_RDONLY;
431 return 0;
432}
433
434static const struct super_operations romfs_super_ops = {
435 .alloc_inode = romfs_alloc_inode,
436 .destroy_inode = romfs_destroy_inode,
437 .statfs = romfs_statfs,
438 .remount_fs = romfs_remount,
439};
440
441/*
442 * checksum check on part of a romfs filesystem
443 */
444static __u32 romfs_checksum(const void *data, int size)
445{
446 const __be32 *ptr = data;
447 __u32 sum;
448
449 sum = 0;
450 size >>= 2;
451 while (size > 0) {
452 sum += be32_to_cpu(*ptr++);
453 size--;
454 }
455 return sum;
456}
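The checksum is a plain sum of big-endian 32-bit words, so an image generator can balance a volume by storing the negated sum of everything else in the superblock's checksum field; a valid image then sums to zero over its first 512 bytes, which is exactly what romfs_fill_super() below verifies. A user-space sketch of that convention, assuming the usual generator layout (the balance() helper is illustrative):

    #include <stdint.h>
    #include <arpa/inet.h>  /* ntohl()/htonl(): romfs words are big-endian */

    /* the same algorithm as romfs_checksum() above */
    static uint32_t checksum(const void *data, int size)
    {
            const uint32_t *ptr = data;
            uint32_t sum = 0;

            for (size >>= 2; size > 0; size--)
                    sum += ntohl(*ptr++);
            return sum;
    }

    /* balance the image: word 3 is the checksum field, so zero it, then
     * store -sum there so that checksum(image, 512) becomes 0 */
    static void balance(uint32_t *image)
    {
            image[3] = 0;
            image[3] = htonl(-checksum(image, 512));
    }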
457
458/*
459 * fill in the superblock
460 */
461static int romfs_fill_super(struct super_block *sb, void *data, int silent)
462{
463 struct romfs_super_block *rsb;
464 struct inode *root;
465 unsigned long pos, img_size;
466 const char *storage;
467 size_t len;
468 int ret;
469
470#ifdef CONFIG_BLOCK
471 if (!sb->s_mtd) {
472 sb_set_blocksize(sb, ROMBSIZE);
473 } else {
474 sb->s_blocksize = ROMBSIZE;
475 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
476 }
477#endif
478
479 sb->s_maxbytes = 0xFFFFFFFF;
480 sb->s_magic = ROMFS_MAGIC;
481 sb->s_flags |= MS_RDONLY | MS_NOATIME;
482 sb->s_op = &romfs_super_ops;
483
484 /* read the image superblock and check it */
485 rsb = kmalloc(512, GFP_KERNEL);
486 if (!rsb)
487 return -ENOMEM;
488
489	sb->s_fs_info = (void *) 512; /* provisional image size for the first read */
490 ret = romfs_dev_read(sb, 0, rsb, 512);
491 if (ret < 0)
492 goto error_rsb;
493
494 img_size = be32_to_cpu(rsb->size);
495
496 if (sb->s_mtd && img_size > sb->s_mtd->size)
497 goto error_rsb_inval;
498
499 sb->s_fs_info = (void *) img_size;
500
501 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
502 img_size < ROMFH_SIZE) {
503 if (!silent)
504 printk(KERN_WARNING "VFS:"
505 " Can't find a romfs filesystem on dev %s.\n",
506 sb->s_id);
507 goto error_rsb_inval;
508 }
509
510 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
511 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
512 sb->s_id);
513 goto error_rsb_inval;
514 }
515
516 storage = sb->s_mtd ? "MTD" : "the block layer";
517
518 len = strnlen(rsb->name, ROMFS_MAXFN);
519 if (!silent)
520 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
521 (unsigned) len, (unsigned) len, rsb->name, storage);
522
523 kfree(rsb);
524 rsb = NULL;
525
526 /* find the root directory */
527 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
528
529 root = romfs_iget(sb, pos);
530	if (IS_ERR(root))
531 goto error;
532
533 sb->s_root = d_alloc_root(root);
534 if (!sb->s_root)
535 goto error_i;
536
537 return 0;
538
539error_i:
540 iput(root);
541error:
542 return -EINVAL;
543error_rsb_inval:
544 ret = -EINVAL;
545error_rsb:
546 return ret;
547}
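For reference, the 512 bytes read and verified above begin with a fixed header; a sketch of that on-disk layout (field names mirror struct romfs_super_block in the romfs headers):

    struct romfs_super_block_layout {
            __be32 word0;      /* "-rom" (ROMSB_WORD0) */
            __be32 word1;      /* "1fs-" (ROMSB_WORD1) */
            __be32 size;       /* image size in bytes */
            __be32 checksum;   /* balances the first 512 bytes to zero */
            char name[0];      /* NUL-terminated volume name, 16-byte padded */
    };

The root directory's first file header then starts at the next 16-byte boundary after the name, which is the pos computed above.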
548
549/*
550 * get a superblock for mounting
551 */
552static int romfs_get_sb(struct file_system_type *fs_type,
553 int flags, const char *dev_name,
554 void *data, struct vfsmount *mnt)
555{
556 int ret = -EINVAL;
557
558#ifdef CONFIG_ROMFS_ON_MTD
559 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
560 mnt);
561#endif
562#ifdef CONFIG_ROMFS_ON_BLOCK
563 if (ret == -EINVAL)
564 ret = get_sb_bdev(fs_type, flags, dev_name, data,
565 romfs_fill_super, mnt);
566#endif
567 return ret;
568}
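Because romfs_fs_type below sets FS_REQUIRES_DEV, user space always names a backing device, and the probe order above tries an MTD attachment before falling back to the block layer. A hedged user-space sketch of mounting an image (device and mount-point paths are examples):

    #include <sys/mount.h>
    #include <stdio.h>

    int main(void)
    {
            /* works for an MTD device or a block device alike; the kernel
             * picks the backing store in romfs_get_sb() */
            if (mount("/dev/mtdblock0", "/mnt/rom", "romfs", MS_RDONLY, NULL))
                    perror("mount");
            return 0;
    }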
569
570/*
571 * destroy a romfs superblock in the appropriate manner
572 */
573static void romfs_kill_sb(struct super_block *sb)
574{
575#ifdef CONFIG_ROMFS_ON_MTD
576 if (sb->s_mtd) {
577 kill_mtd_super(sb);
578 return;
579 }
580#endif
581#ifdef CONFIG_ROMFS_ON_BLOCK
582 if (sb->s_bdev) {
583 kill_block_super(sb);
584 return;
585 }
586#endif
587}
588
589static struct file_system_type romfs_fs_type = {
590 .owner = THIS_MODULE,
591 .name = "romfs",
592 .get_sb = romfs_get_sb,
593 .kill_sb = romfs_kill_sb,
594 .fs_flags = FS_REQUIRES_DEV,
595};
596
597/*
598 * inode storage initialiser
599 */
600static void romfs_i_init_once(void *_inode)
601{
602 struct romfs_inode_info *inode = _inode;
603
604 inode_init_once(&inode->vfs_inode);
605}
606
607/*
608 * romfs module initialisation
609 */
610static int __init init_romfs_fs(void)
611{
612 int ret;
613
614 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
615
616 romfs_inode_cachep =
617 kmem_cache_create("romfs_i",
618 sizeof(struct romfs_inode_info), 0,
619 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
620 romfs_i_init_once);
621
622 if (!romfs_inode_cachep) {
623 printk(KERN_ERR
624 "ROMFS error: Failed to initialise inode cache\n");
625 return -ENOMEM;
626 }
627 ret = register_filesystem(&romfs_fs_type);
628 if (ret) {
629 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
630 goto error_register;
631 }
632 return 0;
633
634error_register:
635 kmem_cache_destroy(romfs_inode_cachep);
636 return ret;
637}
638
639/*
640 * romfs module removal
641 */
642static void __exit exit_romfs_fs(void)
643{
644 unregister_filesystem(&romfs_fs_type);
645 kmem_cache_destroy(romfs_inode_cachep);
646}
647
648module_init(init_romfs_fs);
649module_exit(exit_romfs_fs);
650
651MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
652MODULE_AUTHOR("Red Hat, Inc.");
653MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..c18aa7e03e2b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
@@ -736,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
736 * ->write_end. Most of the time, these expect i_mutex to 737 * ->write_end. Most of the time, these expect i_mutex to
737 * be held. Since this may result in an ABBA deadlock with 738 * be held. Since this may result in an ABBA deadlock with
738 * pipe->inode, we have to order lock acquiry here. 739 * pipe->inode, we have to order lock acquiry here.
740 *
741 * Outer lock must be inode->i_mutex, as pipe_wait() will
742 * release and reacquire pipe->inode->i_mutex, AND inode must
743 * never be a pipe.
739 */ 744 */
740 inode_double_lock(inode, pipe->inode); 745 WARN_ON(S_ISFIFO(inode->i_mode));
746 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
747 if (pipe->inode)
748 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
741 ret = __splice_from_pipe(pipe, &sd, actor); 749 ret = __splice_from_pipe(pipe, &sd, actor);
742 inode_double_unlock(inode, pipe->inode); 750 if (pipe->inode)
751 mutex_unlock(&pipe->inode->i_mutex);
752 mutex_unlock(&inode->i_mutex);
743 753
744 return ret; 754 return ret;
745} 755}
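The replacement for inode_double_lock() pins down a strict parent-before-child order: the file's i_mutex is always taken before the pipe's, with lockdep subclasses documenting the nesting, so pipe_wait()'s drop-and-retake of the pipe mutex can never invert the order. The same discipline as a runnable user-space analogue (names are illustrative):

    #include <pthread.h>

    static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;  /* "parent" */
    static pthread_mutex_t pipe_lock = PTHREAD_MUTEX_INITIALIZER;  /* "child"  */

    static void splice_to_file(void)
    {
            /* every path takes the outer (file) lock first... */
            pthread_mutex_lock(&file_lock);
            /* ...and the inner (pipe) lock second; one global order
             * makes the ABBA deadlock impossible */
            pthread_mutex_lock(&pipe_lock);

            /* move the data here */

            pthread_mutex_unlock(&pipe_lock);
            pthread_mutex_unlock(&file_lock);
    }

    int main(void)
    {
            splice_to_file();
            return 0;
    }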
@@ -830,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
830 }; 840 };
831 ssize_t ret; 841 ssize_t ret;
832 842
833 inode_double_lock(inode, pipe->inode); 843 WARN_ON(S_ISFIFO(inode->i_mode));
844 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
834 ret = file_remove_suid(out); 845 ret = file_remove_suid(out);
835 if (likely(!ret)) 846 if (likely(!ret)) {
847 if (pipe->inode)
848 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 849 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 inode_double_unlock(inode, pipe->inode); 850 if (pipe->inode)
851 mutex_unlock(&pipe->inode->i_mutex);
852 }
853 mutex_unlock(&inode->i_mutex);
838 if (ret > 0) { 854 if (ret > 0) {
839 unsigned long nr_pages; 855 unsigned long nr_pages;
840 856
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d83799..ffa6edcd2d0c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 302{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
304 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 305
305 TRACE("Entered squashfs_statfs\n"); 306 TRACE("Entered squashfs_statfs\n");
306 307
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 312 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 313 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 314 buf->f_namelen = SQUASHFS_NAME_LEN;
315 buf->f_fsid.val[0] = (u32)id;
316 buf->f_fsid.val[1] = (u32)(id >> 32);
314 317
315 return 0; 318 return 0;
316} 319}
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba7..786fe7d72790 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 287 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 288 return sync_blockdev(sb->s_bdev);
289} 289}
290EXPORT_SYMBOL_GPL(fsync_super);
290 291
291/** 292/**
292 * generic_shutdown_super - common helper for ->kill_sb() 293 * generic_shutdown_super - common helper for ->kill_sb()
@@ -770,6 +771,46 @@ void kill_litter_super(struct super_block *sb)
770 771
771EXPORT_SYMBOL(kill_litter_super); 772EXPORT_SYMBOL(kill_litter_super);
772 773
774static int ns_test_super(struct super_block *sb, void *data)
775{
776 return sb->s_fs_info == data;
777}
778
779static int ns_set_super(struct super_block *sb, void *data)
780{
781 sb->s_fs_info = data;
782 return set_anon_super(sb, NULL);
783}
784
785int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
786 int (*fill_super)(struct super_block *, void *, int),
787 struct vfsmount *mnt)
788{
789 struct super_block *sb;
790
791 sb = sget(fs_type, ns_test_super, ns_set_super, data);
792 if (IS_ERR(sb))
793 return PTR_ERR(sb);
794
795 if (!sb->s_root) {
796 int err;
797 sb->s_flags = flags;
798 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
799 if (err) {
800 up_write(&sb->s_umount);
801 deactivate_super(sb);
802 return err;
803 }
804
805 sb->s_flags |= MS_ACTIVE;
806 }
807
808 simple_set_mnt(mnt, sb);
809 return 0;
810}
811
812EXPORT_SYMBOL(get_sb_ns);
813
773#ifdef CONFIG_BLOCK 814#ifdef CONFIG_BLOCK
774static int set_bdev_super(struct super_block *s, void *data) 815static int set_bdev_super(struct super_block *s, void *data)
775{ 816{
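get_sb_ns() keys superblocks by an opaque namespace pointer stashed in s_fs_info, yielding one superblock per namespace. A hedged sketch of how a filesystem might call it — myfs_fill_super and the choice of the network namespace as the key are assumptions for illustration, not part of this patch:

    static int myfs_get_sb(struct file_system_type *fs_type, int flags,
                           const char *dev_name, void *data,
                           struct vfsmount *mnt)
    {
            /* sget() reuses the superblock whose s_fs_info already equals
             * this namespace pointer, or sets up a fresh one */
            return get_sb_ns(fs_type, flags, current->nsproxy->net_ns,
                             myfs_fill_super, mnt);
    }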
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
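The new form is a plain ceiling division: triple the budgeted index size, divide by the per-LEB index capacity rounding up, and add one LEB for the in-the-gaps head. A worked user-space sketch (both byte counts are invented example values):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t idx_size = 100000;      /* budgeted index bytes (example) */
            int64_t idx_leb_size = 126976;  /* usable bytes per index LEB (example) */
            int64_t lebs;

            idx_size += idx_size << 1;                            /* thrice the index size */
            lebs = (idx_size + idx_leb_size - 1) / idx_leb_size;  /* ceiling division */
            lebs += 1;                      /* the in-the-gaps head is not available */

            printf("min_idx_lebs = %lld\n", (long long)lebs);     /* prints 4 */
            return 0;
    }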
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would break user expectations about what free space is. Users seem 698 * break user expectations about what free space is. Users seem accustomed
702 * accustomed to assume that if the file-system reports N bytes of free space, 699 * to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
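Bare printk() continuations are fragile because the logging code treats an un-annotated call as the start of a new message; KERN_CONT marks the call as a continuation of the current line. The corrected pattern in isolation (i, nlen and name are assumed to be in scope):

    /* emit a name character by character on a single log line */
    printk(KERN_DEBUG "dent name: ");
    for (i = 0; i < nlen && name[i]; i++)
            printk(KERN_CONT "%c", name[i]);
    printk(KERN_CONT "\n");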
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent than the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long and unacceptable. 959 * whole index and correct all inode sizes, which is long and unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other negative 490 * failed to find a LEB with @min_space bytes of free space and other negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
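The find.c interface change swaps one quantity for its complement: callers now receive the offset where free space starts instead of the number of free bytes. The conversions, for reference (a fragment in the function's own terms):

    /* the free byte count and the start offset are complements in a LEB */
    offs = c->leb_size - lprops->free;   /* what the function returns now */
    free = c->leb_size - offs;           /* recovering the old return value */
    /* hence "*offs == 0" replaces "*free == c->leb_size" as the
     * completely-empty-LEB test */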
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
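list_sort() accepts any comparator that returns a value <= 0 when its first argument should sort earlier; the two comparators defined below use that contract to encode a multi-key order (inode number, then block number or hash). A minimal hedged usage sketch (struct item is illustrative, not from this patch):

    struct item {
            int len;
            struct list_head list;
    };

    /* order items by ascending length; returning <= 0 keeps @a first */
    static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
    {
            struct item *ia = list_entry(a, struct item, list);
            struct item *ib = list_entry(b, struct item, list);

            return ia->len - ib->len;
    }

    /* then simply: list_sort(NULL, &my_list, item_cmp); */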
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/**
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and are sorted by length in descending order. Directory entry nodes go
249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of inodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number;
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
361 * @snod: the mode to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
139 * Write buffer wasn't seek'ed or there is no enough space - look for an 139 * Write buffer wasn't seek'ed or there is no enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
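The remove_buds() change is a pure simplification; list_move() is the canonical unlink-and-relink helper and performs the same two operations:

    /* before: two steps */
    list_del(&bud->list);
    list_add(&bud->list, &c->old_buds);

    /* after: the same pointer surgery in one helper */
    list_move(&bud->list, &c->old_buds);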
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes.
1766 * o %4 - wasted @len bytes;
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if more data is not found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
483/** 459/**
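The rewritten no_more_nodes() above rests on two facts: flash is programmed in
min_io_size units, so anything beyond the last write is still erased (0xFF),
and a corrupt node whose common header survived still records its own length.
A condensed sketch of the same decision, assuming an is_empty() helper that
returns true for all-0xFF space:

	/* Returns 1 if the corruption at @offs is plausibly the last
	 * thing written to the LEB, 0 if valid-looking data follows. */
	static int corruption_is_last_write(const struct ubifs_info *c,
					    void *buf, int len, int offs)
	{
		struct ubifs_ch *ch = buf;
		int skip;

		/* Erased space right after the aligned common header? */
		skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
		if (is_empty(buf + skip, len - skip))
			return 1;

		/* Otherwise trust the header's length field, skip the
		 * whole corrupt node, and expect erased space there. */
		skip = ALIGN(offs + le32_to_cpu(ch->len),
			     c->min_io_size) - offs;
		return is_empty(buf + skip, len - skip);
	}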
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
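The version handling added above amounts to a small decision table: an image
written by a newer format is still mountable read-only when its R/O
compatibility version is one this code understands. A hedged sketch of just
that decision (0 = mount R/W, 1 = mount R/O only, negative = refuse):

	/* Assumes UBIFS_FORMAT_VERSION / UBIFS_RO_COMPAT_VERSION as
	 * defined in ubifs-media.h; the errno choice mirrors the hunk. */
	static int check_format(int fmt, int ro_compat, int mounting_ro)
	{
		if (fmt <= UBIFS_FORMAT_VERSION)
			return 0;		/* fully supported */
		if (mounting_ro && ro_compat <= UBIFS_RO_COMPAT_VERSION)
			return 1;		/* newer, but R/O-compatible */
		/* -EROFS hints that a R/O mount would have succeeded */
		return ro_compat <= UBIFS_RO_COMPAT_VERSION ? -EROFS
							    : -EINVAL;
	}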
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
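list_move_tail() is an exact drop-in for the deleted
list_del()/list_add_tail() pair; a small sketch of the equivalence against
<linux/list.h>:

	#include <linux/list.h>

	/* Open-coded: unlink, then append at the tail. */
	static void demote_open_coded(struct list_head *e, struct list_head *h)
	{
		list_del(e);
		list_add_tail(e, h);
	}

	/* Idiomatic: one call with the same effect. */
	static void demote(struct list_head *e, struct list_head *h)
	{
		list_move_tail(e, h);
	}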
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
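This hunk, like the kthread_create() hunks later in this file, closes the same
hole: text that is not a compile-time constant must never be passed as a
printf-style format string. A schematic sketch (show_compr() is a made-up
wrapper):

	static void show_compr(struct seq_file *s, int type)
	{
		/* BAD: if the name ever contained a '%' conversion,
		 * vsnprintf would consume varargs that were never passed. */
		seq_printf(s, ubifs_compr_name(type));

		/* GOOD: constant format, data passed as an argument. */
		seq_printf(s, "%s", ubifs_compr_name(type));
	}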
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
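The new idx_leb_size is budgeting arithmetic: an index node that does not fit
in a LEB's tail is written to the next LEB and the tail is wasted, so only
leb_size - max_idx_node_sz bytes per index LEB can be relied on. A worked
example with assumed sizes:

	/* Assumed: 128 KiB LEBs and a 512-byte worst-case index node. */
	int leb_size        = 128 * 1024;                  /* 131072 */
	int max_idx_node_sz = 512;
	int idx_leb_size    = leb_size - max_idx_node_sz;  /* 130560 */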
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
 41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
 59 * this field it is possible to do UBIFS format changes without needing to
 60 * update boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
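Note that the superblock change above is size-neutral: the new __le32
ro_compat_version is carved out of padding2 (3972 -> 3968 bytes), and since
padding is zero-filled, images written by older code read back with
ro_compat_version == 0, which matches UBIFS_RO_COMPAT_VERSION. A hypothetical
compile-time check of that invariant:

	/* 4 new bytes + 3968 remaining padding == old 3972-byte padding,
	 * so sizeof(struct ubifs_sb_node) is unchanged on flash. */
	static inline void sb_layout_check(void)
	{
		BUILD_BUG_ON(sizeof(__le32) + 3968 != 3972);
	}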
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb1..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
87{ 87{
88 struct buffer_head *bh = NULL; 88 struct buffer_head *bh = NULL;
89 int retval = 0; 89 int retval = 0;
90 kernel_lb_addr loc; 90 struct kernel_lb_addr loc;
91 91
92 loc.logicalBlockNum = bitmap->s_extPosition; 92 loc.logicalBlockNum = bitmap->s_extPosition;
93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
94 94
95 bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); 95 bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
96 if (!bh) 96 if (!bh)
97 retval = -EIO; 97 retval = -EIO;
98 98
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
140 return slot; 140 return slot;
141} 141}
142 142
143static bool udf_add_free_space(struct udf_sb_info *sbi, 143static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
144 u16 partition, u32 cnt)
145{ 144{
145 struct udf_sb_info *sbi = UDF_SB(sb);
146 struct logicalVolIntegrityDesc *lvid; 146 struct logicalVolIntegrityDesc *lvid;
147 147
148 if (sbi->s_lvid_bh == NULL) 148 if (!sbi->s_lvid_bh)
149 return false; 149 return;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 return true; 153 udf_updated_lvid(sb);
154} 154}
155 155
156static void udf_bitmap_free_blocks(struct super_block *sb, 156static void udf_bitmap_free_blocks(struct super_block *sb,
157 struct inode *inode, 157 struct inode *inode,
158 struct udf_bitmap *bitmap, 158 struct udf_bitmap *bitmap,
159 kernel_lb_addr bloc, uint32_t offset, 159 struct kernel_lb_addr *bloc,
160 uint32_t offset,
160 uint32_t count) 161 uint32_t count)
161{ 162{
162 struct udf_sb_info *sbi = UDF_SB(sb); 163 struct udf_sb_info *sbi = UDF_SB(sb);
163 struct buffer_head *bh = NULL; 164 struct buffer_head *bh = NULL;
165 struct udf_part_map *partmap;
164 unsigned long block; 166 unsigned long block;
165 unsigned long block_group; 167 unsigned long block_group;
166 unsigned long bit; 168 unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
169 unsigned long overflow; 171 unsigned long overflow;
170 172
171 mutex_lock(&sbi->s_alloc_mutex); 173 mutex_lock(&sbi->s_alloc_mutex);
172 if (bloc.logicalBlockNum < 0 || 174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
173 (bloc.logicalBlockNum + count) > 175 if (bloc->logicalBlockNum < 0 ||
174 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 176 (bloc->logicalBlockNum + count) >
177 partmap->s_partition_len) {
175 udf_debug("%d < %d || %d + %d > %d\n", 178 udf_debug("%d < %d || %d + %d > %d\n",
176 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
177 sbi->s_partmaps[bloc.partitionReferenceNum]. 180 count, partmap->s_partition_len);
178 s_partition_len);
179 goto error_return; 181 goto error_return;
180 } 182 }
181 183
182 block = bloc.logicalBlockNum + offset + 184 block = bloc->logicalBlockNum + offset +
183 (sizeof(struct spaceBitmapDesc) << 3); 185 (sizeof(struct spaceBitmapDesc) << 3);
184 186
185 do { 187 do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
207 } else { 209 } else {
208 if (inode) 210 if (inode)
209 vfs_dq_free_block(inode, 1); 211 vfs_dq_free_block(inode, 1);
210 udf_add_free_space(sbi, sbi->s_partition, 1); 212 udf_add_free_space(sb, sbi->s_partition, 1);
211 } 213 }
212 } 214 }
213 mark_buffer_dirty(bh); 215 mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
218 } while (overflow); 220 } while (overflow);
219 221
220error_return: 222error_return:
221 sb->s_dirt = 1;
222 if (sbi->s_lvid_bh)
223 mark_buffer_dirty(sbi->s_lvid_bh);
224 mutex_unlock(&sbi->s_alloc_mutex); 223 mutex_unlock(&sbi->s_alloc_mutex);
225} 224}
226 225
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
277 } while (block_count > 0); 276 } while (block_count > 0);
278 277
279out: 278out:
280 if (udf_add_free_space(sbi, partition, -alloc_count)) 279 udf_add_free_space(sb, partition, -alloc_count);
281 mark_buffer_dirty(sbi->s_lvid_bh);
282 sb->s_dirt = 1;
283 mutex_unlock(&sbi->s_alloc_mutex); 280 mutex_unlock(&sbi->s_alloc_mutex);
284 return alloc_count; 281 return alloc_count;
285} 282}
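The refactor above moves the LVID bookkeeping into udf_add_free_space() itself
(via udf_updated_lvid()), so every caller shrinks from a test-and-dirty dance
to a single call. Schematically:

	/* Before: callers had to remember the dirtying side effects. */
	if (udf_add_free_space(sbi, partition, -alloc_count))
		mark_buffer_dirty(sbi->s_lvid_bh);
	sb->s_dirt = 1;

	/* After: the helper updates the LVID and dirties state itself. */
	udf_add_free_space(sb, partition, -alloc_count);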
@@ -409,9 +406,7 @@ got_block:
409 406
410 mark_buffer_dirty(bh); 407 mark_buffer_dirty(bh);
411 408
412 if (udf_add_free_space(sbi, partition, -1)) 409 udf_add_free_space(sb, partition, -1);
413 mark_buffer_dirty(sbi->s_lvid_bh);
414 sb->s_dirt = 1;
415 mutex_unlock(&sbi->s_alloc_mutex); 410 mutex_unlock(&sbi->s_alloc_mutex);
416 *err = 0; 411 *err = 0;
417 return newblock; 412 return newblock;
@@ -425,26 +420,28 @@ error_return:
425static void udf_table_free_blocks(struct super_block *sb, 420static void udf_table_free_blocks(struct super_block *sb,
426 struct inode *inode, 421 struct inode *inode,
427 struct inode *table, 422 struct inode *table,
428 kernel_lb_addr bloc, uint32_t offset, 423 struct kernel_lb_addr *bloc,
424 uint32_t offset,
429 uint32_t count) 425 uint32_t count)
430{ 426{
431 struct udf_sb_info *sbi = UDF_SB(sb); 427 struct udf_sb_info *sbi = UDF_SB(sb);
428 struct udf_part_map *partmap;
432 uint32_t start, end; 429 uint32_t start, end;
433 uint32_t elen; 430 uint32_t elen;
434 kernel_lb_addr eloc; 431 struct kernel_lb_addr eloc;
435 struct extent_position oepos, epos; 432 struct extent_position oepos, epos;
436 int8_t etype; 433 int8_t etype;
437 int i; 434 int i;
438 struct udf_inode_info *iinfo; 435 struct udf_inode_info *iinfo;
439 436
440 mutex_lock(&sbi->s_alloc_mutex); 437 mutex_lock(&sbi->s_alloc_mutex);
441 if (bloc.logicalBlockNum < 0 || 438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
442 (bloc.logicalBlockNum + count) > 439 if (bloc->logicalBlockNum < 0 ||
443 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 440 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) {
444 udf_debug("%d < %d || %d + %d > %d\n", 442 udf_debug("%d < %d || %d + %d > %d\n",
 445 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 443 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
446 sbi->s_partmaps[bloc.partitionReferenceNum]. 444 partmap->s_partition_len);
447 s_partition_len);
448 goto error_return; 445 goto error_return;
449 } 446 }
450 447
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
 453 could occur, but.. oh well */ 450 could occur, but.. oh well */
454 if (inode) 451 if (inode)
455 vfs_dq_free_block(inode, count); 452 vfs_dq_free_block(inode, count);
456 if (udf_add_free_space(sbi, sbi->s_partition, count)) 453 udf_add_free_space(sb, sbi->s_partition, count);
457 mark_buffer_dirty(sbi->s_lvid_bh);
458 454
459 start = bloc.logicalBlockNum + offset; 455 start = bloc->logicalBlockNum + offset;
460 end = bloc.logicalBlockNum + offset + count - 1; 456 end = bloc->logicalBlockNum + offset + count - 1;
461 457
462 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); 458 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
463 elen = 0; 459 elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
483 start += count; 479 start += count;
484 count = 0; 480 count = 0;
485 } 481 }
486 udf_write_aext(table, &oepos, eloc, elen, 1); 482 udf_write_aext(table, &oepos, &eloc, elen, 1);
487 } else if (eloc.logicalBlockNum == (end + 1)) { 483 } else if (eloc.logicalBlockNum == (end + 1)) {
488 if ((0x3FFFFFFF - elen) < 484 if ((0x3FFFFFFF - elen) <
489 (count << sb->s_blocksize_bits)) { 485 (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
502 end -= count; 498 end -= count;
503 count = 0; 499 count = 0;
504 } 500 }
505 udf_write_aext(table, &oepos, eloc, elen, 1); 501 udf_write_aext(table, &oepos, &eloc, elen, 1);
506 } 502 }
507 503
508 if (epos.bh != oepos.bh) { 504 if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
532 */ 528 */
533 529
534 int adsize; 530 int adsize;
535 short_ad *sad = NULL; 531 struct short_ad *sad = NULL;
536 long_ad *lad = NULL; 532 struct long_ad *lad = NULL;
537 struct allocExtDesc *aed; 533 struct allocExtDesc *aed;
538 534
539 eloc.logicalBlockNum = start; 535 eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
541 (count << sb->s_blocksize_bits); 537 (count << sb->s_blocksize_bits);
542 538
543 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 539 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
544 adsize = sizeof(short_ad); 540 adsize = sizeof(struct short_ad);
545 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 541 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
546 adsize = sizeof(long_ad); 542 adsize = sizeof(struct long_ad);
547 else { 543 else {
548 brelse(oepos.bh); 544 brelse(oepos.bh);
549 brelse(epos.bh); 545 brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
563 elen -= sb->s_blocksize; 559 elen -= sb->s_blocksize;
564 560
565 epos.bh = udf_tread(sb, 561 epos.bh = udf_tread(sb,
566 udf_get_lb_pblock(sb, epos.block, 0)); 562 udf_get_lb_pblock(sb, &epos.block, 0));
567 if (!epos.bh) { 563 if (!epos.bh) {
568 brelse(oepos.bh); 564 brelse(oepos.bh);
569 goto error_return; 565 goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
601 if (sbi->s_udfrev >= 0x0200) 597 if (sbi->s_udfrev >= 0x0200)
602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 598 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
603 3, 1, epos.block.logicalBlockNum, 599 3, 1, epos.block.logicalBlockNum,
604 sizeof(tag)); 600 sizeof(struct tag));
605 else 601 else
606 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
607 2, 1, epos.block.logicalBlockNum, 603 2, 1, epos.block.logicalBlockNum,
608 sizeof(tag)); 604 sizeof(struct tag));
609 605
610 switch (iinfo->i_alloc_type) { 606 switch (iinfo->i_alloc_type) {
611 case ICBTAG_FLAG_AD_SHORT: 607 case ICBTAG_FLAG_AD_SHORT:
612 sad = (short_ad *)sptr; 608 sad = (struct short_ad *)sptr;
613 sad->extLength = cpu_to_le32( 609 sad->extLength = cpu_to_le32(
614 EXT_NEXT_EXTENT_ALLOCDECS | 610 EXT_NEXT_EXTENT_ALLOCDECS |
615 sb->s_blocksize); 611 sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
617 cpu_to_le32(epos.block.logicalBlockNum); 613 cpu_to_le32(epos.block.logicalBlockNum);
618 break; 614 break;
619 case ICBTAG_FLAG_AD_LONG: 615 case ICBTAG_FLAG_AD_LONG:
620 lad = (long_ad *)sptr; 616 lad = (struct long_ad *)sptr;
621 lad->extLength = cpu_to_le32( 617 lad->extLength = cpu_to_le32(
622 EXT_NEXT_EXTENT_ALLOCDECS | 618 EXT_NEXT_EXTENT_ALLOCDECS |
623 sb->s_blocksize); 619 sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
635 631
636 /* It's possible that stealing the block emptied the extent */ 632 /* It's possible that stealing the block emptied the extent */
637 if (elen) { 633 if (elen) {
638 udf_write_aext(table, &epos, eloc, elen, 1); 634 udf_write_aext(table, &epos, &eloc, elen, 1);
639 635
640 if (!epos.bh) { 636 if (!epos.bh) {
641 iinfo->i_lenAlloc += adsize; 637 iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
653 brelse(oepos.bh); 649 brelse(oepos.bh);
654 650
655error_return: 651error_return:
656 sb->s_dirt = 1;
657 mutex_unlock(&sbi->s_alloc_mutex); 652 mutex_unlock(&sbi->s_alloc_mutex);
658 return; 653 return;
659} 654}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 struct udf_sb_info *sbi = UDF_SB(sb); 661 struct udf_sb_info *sbi = UDF_SB(sb);
667 int alloc_count = 0; 662 int alloc_count = 0;
668 uint32_t elen, adsize; 663 uint32_t elen, adsize;
669 kernel_lb_addr eloc; 664 struct kernel_lb_addr eloc;
670 struct extent_position epos; 665 struct extent_position epos;
671 int8_t etype = -1; 666 int8_t etype = -1;
672 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
677 672
678 iinfo = UDF_I(table); 673 iinfo = UDF_I(table);
679 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 674 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
680 adsize = sizeof(short_ad); 675 adsize = sizeof(struct short_ad);
681 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 676 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
682 adsize = sizeof(long_ad); 677 adsize = sizeof(struct long_ad);
683 else 678 else
684 return 0; 679 return 0;
685 680
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
707 alloc_count = block_count; 702 alloc_count = block_count;
708 eloc.logicalBlockNum += alloc_count; 703 eloc.logicalBlockNum += alloc_count;
709 elen -= (alloc_count << sb->s_blocksize_bits); 704 elen -= (alloc_count << sb->s_blocksize_bits);
710 udf_write_aext(table, &epos, eloc, 705 udf_write_aext(table, &epos, &eloc,
711 (etype << 30) | elen, 1); 706 (etype << 30) | elen, 1);
712 } else 707 } else
713 udf_delete_aext(table, epos, eloc, 708 udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
718 713
719 brelse(epos.bh); 714 brelse(epos.bh);
720 715
721 if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { 716 if (alloc_count)
722 mark_buffer_dirty(sbi->s_lvid_bh); 717 udf_add_free_space(sb, partition, -alloc_count);
723 sb->s_dirt = 1;
724 }
725 mutex_unlock(&sbi->s_alloc_mutex); 718 mutex_unlock(&sbi->s_alloc_mutex);
726 return alloc_count; 719 return alloc_count;
727} 720}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
735 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; 728 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
736 uint32_t newblock = 0, adsize; 729 uint32_t newblock = 0, adsize;
737 uint32_t elen, goal_elen = 0; 730 uint32_t elen, goal_elen = 0;
738 kernel_lb_addr eloc, uninitialized_var(goal_eloc); 731 struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
739 struct extent_position epos, goal_epos; 732 struct extent_position epos, goal_epos;
740 int8_t etype; 733 int8_t etype;
741 struct udf_inode_info *iinfo = UDF_I(table); 734 struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
743 *err = -ENOSPC; 736 *err = -ENOSPC;
744 737
745 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 738 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
746 adsize = sizeof(short_ad); 739 adsize = sizeof(struct short_ad);
747 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 740 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
748 adsize = sizeof(long_ad); 741 adsize = sizeof(struct long_ad);
749 else 742 else
750 return newblock; 743 return newblock;
751 744
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
814 } 807 }
815 808
816 if (goal_elen) 809 if (goal_elen)
817 udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); 810 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
818 else 811 else
819 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); 812 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
820 brelse(goal_epos.bh); 813 brelse(goal_epos.bh);
821 814
822 if (udf_add_free_space(sbi, partition, -1)) 815 udf_add_free_space(sb, partition, -1);
823 mark_buffer_dirty(sbi->s_lvid_bh);
824 816
825 sb->s_dirt = 1;
826 mutex_unlock(&sbi->s_alloc_mutex); 817 mutex_unlock(&sbi->s_alloc_mutex);
827 *err = 0; 818 *err = 0;
828 return newblock; 819 return newblock;
829} 820}
830 821
831inline void udf_free_blocks(struct super_block *sb, 822void udf_free_blocks(struct super_block *sb, struct inode *inode,
832 struct inode *inode, 823 struct kernel_lb_addr *bloc, uint32_t offset,
833 kernel_lb_addr bloc, uint32_t offset, 824 uint32_t count)
834 uint32_t count)
835{ 825{
836 uint16_t partition = bloc.partitionReferenceNum; 826 uint16_t partition = bloc->partitionReferenceNum;
837 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 827 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
838 828
839 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 829 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
840 return udf_bitmap_free_blocks(sb, inode, 830 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
841 map->s_uspace.s_bitmap, 831 bloc, offset, count);
842 bloc, offset, count);
843 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 832 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
844 return udf_table_free_blocks(sb, inode, 833 udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
845 map->s_uspace.s_table, 834 bloc, offset, count);
846 bloc, offset, count);
847 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 835 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
848 return udf_bitmap_free_blocks(sb, inode, 836 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
849 map->s_fspace.s_bitmap, 837 bloc, offset, count);
850 bloc, offset, count);
851 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 838 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
852 return udf_table_free_blocks(sb, inode, 839 udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
853 map->s_fspace.s_table, 840 bloc, offset, count);
854 bloc, offset, count);
855 } else {
856 return;
857 } 841 }
858} 842}
859 843
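The recurring conversion in this UDF series is from passing kernel_lb_addr by
value to passing it by pointer; the struct is small, but by-pointer avoids a
copy per call and lets one address flow unchanged through
udf_get_lb_pblock() and friends. In miniature, with hypothetical names:

	struct kernel_lb_addr {
		uint32_t logicalBlockNum;
		uint16_t partitionReferenceNum;
	};

	/* Before: the whole struct is copied at every call site. */
	int lookup_by_value(struct kernel_lb_addr loc);

	/* After: callers pass &loc and the callee dereferences it. */
	int lookup_by_pointer(const struct kernel_lb_addr *loc);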
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
53 struct buffer_head *tmp, *bha[16]; 53 struct buffer_head *tmp, *bha[16];
54 kernel_lb_addr eloc; 54 struct kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num, ret = 0; 57 int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
80 ret = -ENOENT; 80 ret = -ENOENT;
81 goto out; 81 goto out;
82 } 82 }
83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 epos.offset -= sizeof(short_ad); 86 epos.offset -= sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == 87 else if (iinfo->i_alloc_type ==
88 ICBTAG_FLAG_AD_LONG) 88 ICBTAG_FLAG_AD_LONG)
89 epos.offset -= sizeof(long_ad); 89 epos.offset -= sizeof(struct long_ad);
90 } else { 90 } else {
91 offset = 0; 91 offset = 0;
92 } 92 }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
103 for (num = 0; i > 0; i--) { 103 for (num = 0; i > 0; i--) {
104 block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); 104 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
105 tmp = udf_tgetblk(dir->i_sb, block); 105 tmp = udf_tgetblk(dir->i_sb, block);
106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
107 bha[num++] = tmp; 107 bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
161 memcpy(fname, "..", flen); 161 memcpy(fname, "..", flen);
162 dt_type = DT_DIR; 162 dt_type = DT_DIR;
163 } else { 163 } else {
164 kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); 164 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
165 165
166 iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); 166 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
168 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
169 } 169 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
20 20
21#if 0 21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, 22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, kernel_lb_addr fe_loc, 23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh, 24 int *pos, int *offset, struct buffer_head **bh,
25 int *error) 25 int *error)
26{ 26{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 75 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 76 struct fileIdentDesc *cfi,
77 struct extent_position *epos, 77 struct extent_position *epos,
78 kernel_lb_addr *eloc, uint32_t *elen, 78 struct kernel_lb_addr *eloc, uint32_t *elen,
79 sector_t *offset) 79 sector_t *offset)
80{ 80{
81 struct fileIdentDesc *fi; 81 struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
111 (EXT_RECORDED_ALLOCATED >> 30)) 111 (EXT_RECORDED_ALLOCATED >> 30))
112 return NULL; 112 return NULL;
113 113
114 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 114 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
115 115
116 (*offset)++; 116 (*offset)++;
117 117
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
131 if (i + *offset > (*elen >> blocksize_bits)) 131 if (i + *offset > (*elen >> blocksize_bits))
132 i = (*elen >> blocksize_bits)-*offset; 132 i = (*elen >> blocksize_bits)-*offset;
133 for (num = 0; i > 0; i--) { 133 for (num = 0; i > 0; i--) {
134 block = udf_get_lb_pblock(dir->i_sb, *eloc, 134 block = udf_get_lb_pblock(dir->i_sb, eloc,
135 *offset + i); 135 *offset + i);
136 tmp = udf_tgetblk(dir->i_sb, block); 136 tmp = udf_tgetblk(dir->i_sb, block);
137 if (tmp && !buffer_uptodate(tmp) && 137 if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
169 (EXT_RECORDED_ALLOCATED >> 30)) 169 (EXT_RECORDED_ALLOCATED >> 30))
170 return NULL; 170 return NULL;
171 171
172 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 172 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
173 173
174 (*offset)++; 174 (*offset)++;
175 175
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
249} 249}
250 250
251#if 0 251#if 0
252static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) 252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{ 253{
254 extent_ad *ext; 254 struct extent_ad *ext;
255 struct fileEntry *fe; 255 struct fileEntry *fe;
256 uint8_t *ptr; 256 uint8_t *ptr;
257 257
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) 274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset; 275 ptr += *offset;
276 276
277 ext = (extent_ad *)ptr; 277 ext = (struct extent_ad *)ptr;
278 278
279 *offset = *offset + sizeof(extent_ad); 279 *offset = *offset + sizeof(struct extent_ad);
280 return ext; 280 return ext;
281} 281}
282#endif 282#endif
283 283
284short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 285 int inc)
286{ 286{
287 short_ad *sa; 287 struct short_ad *sa;
288 288
289 if ((!ptr) || (!offset)) { 289 if ((!ptr) || (!offset)) {
 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalid parms\n"); 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalid parms\n");
291 return NULL; 291 return NULL;
292 } 292 }
293 293
294 if ((*offset + sizeof(short_ad)) > maxoffset) 294 if ((*offset + sizeof(struct short_ad)) > maxoffset)
295 return NULL; 295 return NULL;
296 else { 296 else {
297 sa = (short_ad *)ptr; 297 sa = (struct short_ad *)ptr;
298 if (sa->extLength == 0) 298 if (sa->extLength == 0)
299 return NULL; 299 return NULL;
300 } 300 }
301 301
302 if (inc) 302 if (inc)
303 *offset += sizeof(short_ad); 303 *offset += sizeof(struct short_ad);
304 return sa; 304 return sa;
305} 305}
306 306
307long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) 307struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
308{ 308{
309 long_ad *la; 309 struct long_ad *la;
310 310
311 if ((!ptr) || (!offset)) { 311 if ((!ptr) || (!offset)) {
 312 printk(KERN_ERR "udf: udf_get_filelongad() invalid parms\n"); 312 printk(KERN_ERR "udf: udf_get_filelongad() invalid parms\n");
313 return NULL; 313 return NULL;
314 } 314 }
315 315
316 if ((*offset + sizeof(long_ad)) > maxoffset) 316 if ((*offset + sizeof(struct long_ad)) > maxoffset)
317 return NULL; 317 return NULL;
318 else { 318 else {
319 la = (long_ad *)ptr; 319 la = (struct long_ad *)ptr;
320 if (la->extLength == 0) 320 if (la->extLength == 0)
321 return NULL; 321 return NULL;
322 } 322 }
323 323
324 if (inc) 324 if (inc)
325 *offset += sizeof(long_ad); 325 *offset += sizeof(struct long_ad);
326 return la; 326 return la;
327} 327}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
38#define _ECMA_167_H 1 38#define _ECMA_167_H 1
39 39
40/* Character set specification (ECMA 167r3 1/7.2.1) */ 40/* Character set specification (ECMA 167r3 1/7.2.1) */
41typedef struct { 41struct charspec {
42 uint8_t charSetType; 42 uint8_t charSetType;
43 uint8_t charSetInfo[63]; 43 uint8_t charSetInfo[63];
44} __attribute__ ((packed)) charspec; 44} __attribute__ ((packed));
45 45
46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ 46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ 47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
57typedef uint8_t dstring; 57typedef uint8_t dstring;
58 58
59/* Timestamp (ECMA 167r3 1/7.3) */ 59/* Timestamp (ECMA 167r3 1/7.3) */
60typedef struct { 60struct timestamp {
61 __le16 typeAndTimezone; 61 __le16 typeAndTimezone;
62 __le16 year; 62 __le16 year;
63 uint8_t month; 63 uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
68 uint8_t centiseconds; 68 uint8_t centiseconds;
69 uint8_t hundredsOfMicroseconds; 69 uint8_t hundredsOfMicroseconds;
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed));
72 72
73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
74#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF 78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
79 79
80/* Entity identifier (ECMA 167r3 1/7.4) */ 80/* Entity identifier (ECMA 167r3 1/7.4) */
81typedef struct { 81struct regid {
82 uint8_t flags; 82 uint8_t flags;
83 uint8_t ident[23]; 83 uint8_t ident[23];
84 uint8_t identSuffix[8]; 84 uint8_t identSuffix[8];
85} __attribute__ ((packed)) regid; 85} __attribute__ ((packed));
86 86
87/* Flags (ECMA 167r3 1/7.4.1) */ 87/* Flags (ECMA 167r3 1/7.4.1) */
88#define ENTITYID_FLAGS_DIRTY 0x00 88#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
126 126
127/* Boot Descriptor (ECMA 167r3 2/9.4) */ 127/* Boot Descriptor (ECMA 167r3 2/9.4) */
128struct bootDesc { 128struct bootDesc {
129 uint8_t structType; 129 uint8_t structType;
130 uint8_t stdIdent[VSD_STD_ID_LEN]; 130 uint8_t stdIdent[VSD_STD_ID_LEN];
131 uint8_t structVersion; 131 uint8_t structVersion;
132 uint8_t reserved1; 132 uint8_t reserved1;
133 regid archType; 133 struct regid archType;
134 regid bootIdent; 134 struct regid bootIdent;
135 __le32 bootExtLocation; 135 __le32 bootExtLocation;
136 __le32 bootExtLength; 136 __le32 bootExtLength;
137 __le64 loadAddress; 137 __le64 loadAddress;
138 __le64 startAddress; 138 __le64 startAddress;
139 timestamp descCreationDateAndTime; 139 struct timestamp descCreationDateAndTime;
140 __le16 flags; 140 __le16 flags;
141 uint8_t reserved2[32]; 141 uint8_t reserved2[32];
142 uint8_t bootUse[1906]; 142 uint8_t bootUse[1906];
143} __attribute__ ((packed)); 143} __attribute__ ((packed));
144 144
145/* Flags (ECMA 167r3 2/9.4.12) */ 145/* Flags (ECMA 167r3 2/9.4.12) */
146#define BOOT_FLAGS_ERASE 0x01 146#define BOOT_FLAGS_ERASE 0x01
147 147
148/* Extent Descriptor (ECMA 167r3 3/7.1) */ 148/* Extent Descriptor (ECMA 167r3 3/7.1) */
149typedef struct { 149struct extent_ad {
150 __le32 extLength; 150 __le32 extLength;
151 __le32 extLocation; 151 __le32 extLocation;
152} __attribute__ ((packed)) extent_ad; 152} __attribute__ ((packed));
153 153
154typedef struct { 154struct kernel_extent_ad {
155 uint32_t extLength; 155 uint32_t extLength;
156 uint32_t extLocation; 156 uint32_t extLocation;
157} kernel_extent_ad; 157};
158 158
159/* Descriptor Tag (ECMA 167r3 3/7.2) */ 159/* Descriptor Tag (ECMA 167r3 3/7.2) */
160typedef struct { 160struct tag {
161 __le16 tagIdent; 161 __le16 tagIdent;
162 __le16 descVersion; 162 __le16 descVersion;
163 uint8_t tagChecksum; 163 uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
166 __le16 descCRC; 166 __le16 descCRC;
167 __le16 descCRCLength; 167 __le16 descCRCLength;
168 __le32 tagLocation; 168 __le32 tagLocation;
169} __attribute__ ((packed)) tag; 169} __attribute__ ((packed));
170 170
171/* Tag Identifier (ECMA 167r3 3/7.2.1) */ 171/* Tag Identifier (ECMA 167r3 3/7.2.1) */
172#define TAG_IDENT_PVD 0x0001 172#define TAG_IDENT_PVD 0x0001
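The ecma_167.h hunks mechanically convert 'typedef struct { ... } name;' to a
plain 'struct name { ... };', per kernel style (new struct typedefs are
discouraged; checkpatch warns about them). The pattern in miniature, using
extent_ad from above:

	/* Before: an anonymous struct hidden behind a typedef. */
	typedef struct {
		__le32 extLength;
		__le32 extLocation;
	} __attribute__ ((packed)) extent_ad;

	/* After: a named tag; users now spell it 'struct extent_ad'. */
	struct extent_ad {
		__le32 extLength;
		__le32 extLocation;
	} __attribute__ ((packed));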
@@ -190,28 +190,28 @@ struct NSRDesc {
190 190
191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ 191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
192struct primaryVolDesc { 192struct primaryVolDesc {
193 tag descTag; 193 struct tag descTag;
194 __le32 volDescSeqNum; 194 __le32 volDescSeqNum;
195 __le32 primaryVolDescNum; 195 __le32 primaryVolDescNum;
196 dstring volIdent[32]; 196 dstring volIdent[32];
197 __le16 volSeqNum; 197 __le16 volSeqNum;
198 __le16 maxVolSeqNum; 198 __le16 maxVolSeqNum;
199 __le16 interchangeLvl; 199 __le16 interchangeLvl;
200 __le16 maxInterchangeLvl; 200 __le16 maxInterchangeLvl;
201 __le32 charSetList; 201 __le32 charSetList;
202 __le32 maxCharSetList; 202 __le32 maxCharSetList;
203 dstring volSetIdent[128]; 203 dstring volSetIdent[128];
204 charspec descCharSet; 204 struct charspec descCharSet;
205 charspec explanatoryCharSet; 205 struct charspec explanatoryCharSet;
206 extent_ad volAbstract; 206 struct extent_ad volAbstract;
207 extent_ad volCopyright; 207 struct extent_ad volCopyright;
208 regid appIdent; 208 struct regid appIdent;
209 timestamp recordingDateAndTime; 209 struct timestamp recordingDateAndTime;
210 regid impIdent; 210 struct regid impIdent;
211 uint8_t impUse[64]; 211 uint8_t impUse[64];
212 __le32 predecessorVolDescSeqLocation; 212 __le32 predecessorVolDescSeqLocation;
213 __le16 flags; 213 __le16 flags;
214 uint8_t reserved[22]; 214 uint8_t reserved[22];
215} __attribute__ ((packed)); 215} __attribute__ ((packed));
216 216
217/* Flags (ECMA 167r3 3/10.1.21) */ 217/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
219 219
220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ 220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
221struct anchorVolDescPtr { 221struct anchorVolDescPtr {
222 tag descTag; 222 struct tag descTag;
223 extent_ad mainVolDescSeqExt; 223 struct extent_ad mainVolDescSeqExt;
224 extent_ad reserveVolDescSeqExt; 224 struct extent_ad reserveVolDescSeqExt;
225 uint8_t reserved[480]; 225 uint8_t reserved[480];
226} __attribute__ ((packed)); 226} __attribute__ ((packed));
227 227
228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ 228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
229struct volDescPtr { 229struct volDescPtr {
230 tag descTag; 230 struct tag descTag;
231 __le32 volDescSeqNum; 231 __le32 volDescSeqNum;
232 extent_ad nextVolDescSeqExt; 232 struct extent_ad nextVolDescSeqExt;
233 uint8_t reserved[484]; 233 uint8_t reserved[484];
234} __attribute__ ((packed)); 234} __attribute__ ((packed));
235 235
236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ 236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
237struct impUseVolDesc { 237struct impUseVolDesc {
238 tag descTag; 238 struct tag descTag;
239 __le32 volDescSeqNum; 239 __le32 volDescSeqNum;
240 regid impIdent; 240 struct regid impIdent;
241 uint8_t impUse[460]; 241 uint8_t impUse[460];
242} __attribute__ ((packed)); 242} __attribute__ ((packed));
243 243
244/* Partition Descriptor (ECMA 167r3 3/10.5) */ 244/* Partition Descriptor (ECMA 167r3 3/10.5) */
245struct partitionDesc { 245struct partitionDesc {
246 tag descTag; 246 struct tag descTag;
247 __le32 volDescSeqNum; 247 __le32 volDescSeqNum;
248 __le16 partitionFlags; 248 __le16 partitionFlags;
249 __le16 partitionNumber; 249 __le16 partitionNumber;
250 regid partitionContents; 250 struct regid partitionContents;
251 uint8_t partitionContentsUse[128]; 251 uint8_t partitionContentsUse[128];
252 __le32 accessType; 252 __le32 accessType;
253 __le32 partitionStartingLocation; 253 __le32 partitionStartingLocation;
254 __le32 partitionLength; 254 __le32 partitionLength;
255 regid impIdent; 255 struct regid impIdent;
256 uint8_t impUse[128]; 256 uint8_t impUse[128];
257 uint8_t reserved[156]; 257 uint8_t reserved[156];
258} __attribute__ ((packed)); 258} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
278 278
279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ 279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
280struct logicalVolDesc { 280struct logicalVolDesc {
281 tag descTag; 281 struct tag descTag;
282 __le32 volDescSeqNum; 282 __le32 volDescSeqNum;
283 charspec descCharSet; 283 struct charspec descCharSet;
284 dstring logicalVolIdent[128]; 284 dstring logicalVolIdent[128];
285 __le32 logicalBlockSize; 285 __le32 logicalBlockSize;
286 regid domainIdent; 286 struct regid domainIdent;
287 uint8_t logicalVolContentsUse[16]; 287 uint8_t logicalVolContentsUse[16];
288 __le32 mapTableLength; 288 __le32 mapTableLength;
289 __le32 numPartitionMaps; 289 __le32 numPartitionMaps;
290 regid impIdent; 290 struct regid impIdent;
291 uint8_t impUse[128]; 291 uint8_t impUse[128];
292 extent_ad integritySeqExt; 292 struct extent_ad integritySeqExt;
293 uint8_t partitionMaps[0]; 293 uint8_t partitionMaps[0];
294} __attribute__ ((packed)); 294} __attribute__ ((packed));
295 295
296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ 296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le32 numAllocDescs;
-	extent_ad allocDescs[0];
+	struct extent_ad allocDescs[0];
 } __attribute__ ((packed));
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
-	tag descTag;
+	struct tag descTag;
 	uint8_t reserved[496];
 } __attribute__ ((packed));
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le32 integrityType;
-	extent_ad nextIntegrityExt;
+	struct extent_ad nextIntegrityExt;
 	uint8_t logicalVolContentsUse[32];
 	__le32 numOfPartitions;
 	__le32 lengthOfImpUse;
 	__le32 freeSpaceTable[0];
 	__le32 sizeTable[0];
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
 #define LVID_INTEGRITY_TYPE_CLOSE	0x00000001
 
 /* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
 	__le32 logicalBlockNum;
 	__le16 partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
 
 /* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
 	uint32_t logicalBlockNum;
 	uint16_t partitionReferenceNum;
-} kernel_lb_addr;
+};
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
 	__le32 extLength;
 	__le32 extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
 	__le32 extLength;
-	lb_addr extLocation;
+	struct lb_addr extLocation;
 	uint8_t impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_long_ad {
 	uint32_t extLength;
-	kernel_lb_addr extLocation;
+	struct kernel_lb_addr extLocation;
 	uint8_t impUse[6];
-} kernel_long_ad;
+};
 
 /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
 	__le32 extLength;
 	__le32 recordedLength;
 	__le32 informationLength;
-	lb_addr extLocation;
-} __attribute__ ((packed)) ext_ad;
+	struct lb_addr extLocation;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_ext_ad {
 	uint32_t extLength;
 	uint32_t recordedLength;
 	uint32_t informationLength;
-	kernel_lb_addr extLocation;
-} kernel_ext_ad;
+	struct kernel_lb_addr extLocation;
+};
 
 /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
 
@@ -415,44 +415,44 @@ typedef struct {
 
 /* File Set Descriptor (ECMA 167r3 4/14.1) */
 struct fileSetDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le16 interchangeLvl;
 	__le16 maxInterchangeLvl;
 	__le32 charSetList;
 	__le32 maxCharSetList;
 	__le32 fileSetNum;
 	__le32 fileSetDescNum;
-	charspec logicalVolIdentCharSet;
+	struct charspec logicalVolIdentCharSet;
 	dstring logicalVolIdent[128];
-	charspec fileSetCharSet;
+	struct charspec fileSetCharSet;
 	dstring fileSetIdent[32];
 	dstring copyrightFileIdent[32];
 	dstring abstractFileIdent[32];
-	long_ad rootDirectoryICB;
-	regid domainIdent;
-	long_ad nextExt;
-	long_ad streamDirectoryICB;
+	struct long_ad rootDirectoryICB;
+	struct regid domainIdent;
+	struct long_ad nextExt;
+	struct long_ad streamDirectoryICB;
 	uint8_t reserved[32];
 } __attribute__ ((packed));
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
-	short_ad unallocSpaceTable;
-	short_ad unallocSpaceBitmap;
-	short_ad partitionIntegrityTable;
-	short_ad freedSpaceTable;
-	short_ad freedSpaceBitmap;
+	struct short_ad unallocSpaceTable;
+	struct short_ad unallocSpaceBitmap;
+	struct short_ad partitionIntegrityTable;
+	struct short_ad freedSpaceTable;
+	struct short_ad freedSpaceBitmap;
 	uint8_t reserved[88];
 } __attribute__ ((packed));
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
-	tag descTag;
+	struct tag descTag;
 	__le16 fileVersionNum;
 	uint8_t fileCharacteristics;
 	uint8_t lengthFileIdent;
-	long_ad icb;
+	struct long_ad icb;
 	__le16 lengthOfImpUse;
 	uint8_t impUse[0];
 	uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
 
 /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
 struct allocExtDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 previousAllocExtLocation;
 	__le32 lengthAllocDescs;
 } __attribute__ ((packed));
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
 	__le32 priorRecordedNumDirectEntries;
 	__le16 strategyType;
 	__le16 strategyParameter;
 	__le16 numEntries;
 	uint8_t reserved;
 	uint8_t fileType;
-	lb_addr parentICBLocation;
+	struct lb_addr parentICBLocation;
 	__le16 flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF	0x0000
@@ -528,41 +528,41 @@ typedef struct {
 
 /* Indirect Entry (ECMA 167r3 4/14.7) */
 struct indirectEntry {
-	tag descTag;
-	icbtag icbTag;
-	long_ad indirectICB;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct long_ad indirectICB;
 } __attribute__ ((packed));
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 } __attribute__ ((packed));
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
-	long_ad extendedAttrICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
 
 /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 impAttrLocation;
 	__le32 appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 impUseLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 appUseLength;
-	regid appIdent;
+	struct regid appIdent;
 	uint8_t appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 lengthAllocDescs;
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 numOfBits;
 	__le32 numOfBytes;
 	uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag descTag;
-	icbtag icbTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct timestamp recordingDateAndTime;
 	uint8_t integrityType;
 	uint8_t reserved[175];
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 objectSize;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp createTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp createTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
 	__le32 reserved;
-	long_ad extendedAttrICB;
-	long_ad streamDirectoryICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct long_ad streamDirectoryICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
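
The ecma_167.h hunks above all apply one pattern: the on-disk UDF descriptor types lose their typedef aliases and keep only plain struct tags, presumably to follow kernel style, which discourages typedefs for plain structs. Every user must now spell out the struct keyword, including in sizeof expressions. A minimal before/after sketch of the pattern, reusing short_ad from the header above:

	/* before: the typedef exposed a bare type name */
	typedef struct {
		__le32 extLength;
		__le32 extPosition;
	} __attribute__ ((packed)) short_ad;

	short_ad ad;			/* old spelling */

	/* after: only the struct tag remains */
	struct short_ad {
		__le32 extLength;
		__le32 extPosition;
	} __attribute__ ((packed));

	struct short_ad ad;		/* new spelling; likewise sizeof(struct short_ad) */

This is why the .c hunks below mechanically rewrite sizeof(tag), sizeof(long_ad), and friends as sizeof(struct tag), sizeof(struct long_ad), and so on.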
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f90..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
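
Both ialloc.c hunks track API changes made elsewhere in the series: the logical volume integrity descriptor is now marked dirty through the udf_updated_lvid() helper instead of an open-coded mark_buffer_dirty(sbi->s_lvid_bh), and helpers that receive a struct kernel_lb_addr now take it by pointer rather than by value, so the 6-byte address is no longer copied at every call. The diff only shows the call sites; a hedged sketch of what the convention means for a callee (illustrative only, not necessarily the kernel's actual udf_get_lb_pblock() body):

	/* the callee dereferences the caller's address
	 * instead of receiving a struct copy on the stack */
	static inline uint32_t get_lb_pblock(struct super_block *sb,
					     struct kernel_lb_addr *loc,
					     uint32_t offset)
	{
		return udf_get_pblock(sb, loc->logicalBlockNum,
				      loc->partitionReferenceNum, offset);
	}

The same pointer convention recurs below in udf_free_blocks(), udf_add_aext(), udf_write_aext(), and udf_read_ptagged().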
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 			       last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, prealloc_loc,
+		if (udf_add_aext(inode, last_pos, &prealloc_loc,
 				 prealloc_len, 1) == -1)
 			return -1;
 		last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
 	/* last_pos should point to the last written extent... */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		last_pos->offset -= sizeof(short_ad);
+		last_pos->offset -= sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		last_pos->offset -= sizeof(long_ad);
+		last_pos->offset -= sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
 	static sector_t last_block;
 	struct buffer_head *result = NULL;
-	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
 	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
 	uint32_t elen = 0, tmpelen;
-	kernel_lb_addr eloc, tmpeloc;
+	struct kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
 	loff_t lbcount = 0, b_off = 0;
 	uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			elen = EXT_RECORDED_ALLOCATED |
 				((elen + inode->i_sb->s_blocksize - 1) &
 				 ~(inode->i_sb->s_blocksize - 1));
-			etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+			etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 		}
 		brelse(prev_epos.bh);
 		brelse(cur_epos.bh);
 		brelse(next_epos.bh);
-		newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 		*phys = newblock;
 		return NULL;
 	}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	} else {
 		/* Create a fake extent when there's not one */
 		memset(&laarr[0].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 		/* Will udf_extend_file() create real extent from
 		   a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			inode->i_sb->s_blocksize;
 		memset(&laarr[c].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		count++;
 		endnum++;
 	}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
 			      int newblocknum,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 	if (offset) {
 		if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					laarr[curr].extLocation,
+					&laarr[curr].extLocation,
 					0, offset);
 			laarr[curr].extLength =
 				EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 				inode->i_sb->s_blocksize_bits);
 	else {
 		memmove(&laarr[c + 2], &laarr[c + 1],
-			sizeof(long_ad) * (*endnum - (c + 1)));
+			sizeof(struct long_ad) * (*endnum - (c + 1)));
 		(*endnum)++;
 		laarr[c + 1].extLocation.logicalBlockNum = next;
 		laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 			if (*endnum > (i + 1))
 				memmove(&laarr[i],
 					&laarr[i + 1],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 1)));
 			i--;
 			(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
 	for (i = 0; i < (*endnum - 1); i++) {
-		kernel_long_ad *li /*l[i]*/ = &laarr[i];
-		kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
 		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
 		    (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
 			(EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
 			((lip1->extLength >> 30) ==
 			(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-			udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
 					((li->extLength &
 					UDF_EXTENT_LENGTH_MASK) +
 					blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
 		} else if ((li->extLength >> 30) ==
 			   (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					li->extLocation, 0,
+					&li->extLocation, 0,
 					((li->extLength &
 					UDF_EXTENT_LENGTH_MASK) +
 					blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-			       kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
 	int start = 0, i;
-	kernel_lb_addr tmploc;
+	struct kernel_lb_addr tmploc;
 	uint32_t tmplen;
 
 	if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
 	for (i = start; i < endnum; i++) {
 		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-		udf_write_aext(inode, epos, laarr[i].extLocation,
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
 			       laarr[i].extLength, 1);
 	}
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
 		       inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 				       &ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct buffer_head *nbh = NULL;
-			kernel_lb_addr loc;
+			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
 						    &ident))) {
 				if (ident == TAG_IDENT_FE ||
 				    ident == TAG_IDENT_EFE) {
 					memcpy(&iinfo->i_location,
 					       &loc,
-					       sizeof(kernel_lb_addr));
+					       sizeof(struct kernel_lb_addr));
 					brelse(bh);
 					brelse(ibh);
 					brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	inode->i_size = le64_to_cpu(fe->informationLength);
 	iinfo->i_lenExtents = inode->i_size;
 
-	inode->i_mode = udf_convert_permissions(fe);
-	inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+	    sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+		 sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
 
 	if (iinfo->i_efe == 0) {
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	bh = udf_tread(inode->i_sb,
 		       udf_get_lb_pblock(inode->i_sb,
-					 iinfo->i_location, 0));
+					 &iinfo->i_location, 0));
 	if (!bh) {
 		udf_debug("bread failure\n");
 		return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(tag);
+				iinfo->i_lenAlloc - sizeof(struct tag);
 		use->descTag.tagLocation = cpu_to_le32(
 						iinfo->i_location.
 							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							     sizeof(tag),
+							     sizeof(struct tag),
 							     crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->informationLength = cpu_to_le64(inode->i_size);
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		regid *eid;
+		struct regid *eid;
 		struct deviceSpec *dsea =
 			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
 		if (!dsea) {
 			dsea = (struct deviceSpec *)
 				udf_add_extendedattr(inode,
 						     sizeof(struct deviceSpec) +
-						     sizeof(regid), 12, 0x3);
+						     sizeof(struct regid), 12, 0x3);
 			dsea->attrType = cpu_to_le32(12);
 			dsea->attrSubtype = 1;
 			dsea->attrLength = cpu_to_le32(
 						sizeof(struct deviceSpec) +
-						sizeof(regid));
-			dsea->impUseLength = cpu_to_le32(sizeof(regid));
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
 		}
-		eid = (regid *)dsea->impUse;
-		memset(eid, 0, sizeof(regid));
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
 		strcpy(eid->ident, UDF_ID_DEVELOPER);
 		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
 		eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 	udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
 	udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-	memset(&(fe->impIdent), 0, sizeof(regid));
+	memset(&(fe->impIdent), 0, sizeof(struct regid));
 	strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 	fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 	udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-	memset(&(efe->impIdent), 0, sizeof(regid));
+	memset(&(efe->impIdent), 0, sizeof(struct regid));
 	strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
 	efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagLocation = cpu_to_le32(
 					iinfo->i_location.logicalBlockNum);
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-		  sizeof(tag);
+		  sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						    crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 		return NULL;
 
 	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
 		__udf_read_inode(inode);
 		unlock_new_inode(inode);
 	}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	if (is_bad_inode(inode))
 		goto out_iput;
 
-	if (ino.logicalBlockNum >= UDF_SB(sb)->
-	    s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+	if (ino->logicalBlockNum >= UDF_SB(sb)->
+	    s_partmaps[ino->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
-			  ino.logicalBlockNum, ino.partitionReferenceNum);
+			  ino->logicalBlockNum, ino->partitionReferenceNum);
 		make_bad_inode(inode);
 		goto out_iput;
 	}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    kernel_lb_addr eloc, uint32_t elen, int inc)
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
-	short_ad *sad = NULL;
-	long_ad *lad = NULL;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
 	int8_t etype;
 	uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		ptr = epos->bh->b_data + epos->offset;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = epos->block;
+		struct kernel_lb_addr obloc = epos->block;
 
 		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 						obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		if (!epos->block.logicalBlockNum)
 			return -1;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 epos->block,
+								 &epos->block,
 								 0));
 		if (!nbh)
 			return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		}
 		if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		else
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		switch (iinfo->i_alloc_type) {
 		case ICBTAG_FLAG_AD_SHORT:
-			sad = (short_ad *)sptr;
+			sad = (struct short_ad *)sptr;
 			sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			sad->extPosition =
 				cpu_to_le32(epos->block.logicalBlockNum);
 			break;
 		case ICBTAG_FLAG_AD_LONG:
-			lad = (long_ad *)sptr;
+			lad = (struct long_ad *)sptr;
 			lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      kernel_lb_addr eloc, uint32_t elen, int inc)
+		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)ptr;
+		sad = (struct short_ad *)ptr;
 		sad->extLength = cpu_to_le32(elen);
-		sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-		adsize = sizeof(short_ad);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)ptr;
+		lad = (struct long_ad *)ptr;
 		lad->extLength = cpu_to_le32(elen);
-		lad->extLocation = cpu_to_lelb(eloc);
+		lad->extLocation = cpu_to_lelb(*eloc);
 		memset(lad->impUse, 0x00, sizeof(lad->impUse));
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 		break;
 	default:
 		return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-		     kernel_lb_addr *eloc, uint32_t *elen, int inc)
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 		epos->block = *eloc;
 		epos->offset = sizeof(struct allocExtDesc);
 		brelse(epos->bh);
-		block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
 		epos->bh = udf_tread(inode->i_sb, block);
 		if (!epos->bh) {
 			udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-			kernel_lb_addr *eloc, uint32_t *elen, int inc)
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-			      kernel_lb_addr neloc, uint32_t nelen)
+			      struct kernel_lb_addr neloc, uint32_t nelen)
 {
-	kernel_lb_addr oeloc;
+	struct kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
 		get_bh(epos.bh);
 
 	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-		udf_write_aext(inode, &epos, neloc, nelen, 1);
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
 	brelse(epos.bh);
 
 	return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-		       kernel_lb_addr eloc, uint32_t elen)
+		       struct kernel_lb_addr eloc, uint32_t elen)
 {
 	struct extent_position oepos;
 	int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
 	iinfo = UDF_I(inode);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 		return -1;
 
 	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
 		if (oepos.bh != epos.bh) {
 			oepos.block = epos.block;
 			brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			oepos.offset = epos.offset - adsize;
 		}
 	}
-	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
 	elen = 0;
 
 	if (epos.bh != oepos.bh) {
-		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= (adsize * 2);
 			mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	} else {
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= adsize;
 			mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-		  struct extent_position *pos, kernel_lb_addr *eloc,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
 	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
 	    (EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	else
 		ret = 0;
 
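
Besides the mechanical struct/pointer conversions, inode.c picks up one behavioral change in udf_fill_inode(): the on-disk permissions can now be overridden per mount through sbi->s_fmode and sbi->s_dmode. The added ladder, reproduced with explanatory comments (the comments are annotation here, not part of the patch):

	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
	    sbi->s_fmode != UDF_INVALID_MODE)
		inode->i_mode = sbi->s_fmode;	/* forced mode for files */
	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
		 sbi->s_dmode != UDF_INVALID_MODE)
		inode->i_mode = sbi->s_dmode;	/* forced mode for directories */
	else
		inode->i_mode = udf_convert_permissions(fe);
	inode->i_mode &= ~sbi->s_umask;		/* umask applies in every case */

UDF_INVALID_MODE serves as the "option not given" sentinel, so mounts that set neither override keep the permissions recorded in the file entry, exactly as before.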
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		}
 	}
 	/* rewrite CRC + checksum of eahd */
-	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 	eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 	eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-						sizeof(tag), crclen));
+						sizeof(struct tag), crclen));
 	eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 	iinfo->i_lenEAttr += size;
 	return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, 202struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
203 uint32_t location, uint16_t *ident) 203 uint32_t location, uint16_t *ident)
204{ 204{
205 tag *tag_p; 205 struct tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 207
208 /* Read the block */ 208 /* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
216 return NULL; 216 return NULL;
217 } 217 }
218 218
219 tag_p = (tag *)(bh->b_data); 219 tag_p = (struct tag *)(bh->b_data);
220 220
221 *ident = le16_to_cpu(tag_p->tagIdent); 221 *ident = le16_to_cpu(tag_p->tagIdent);
222 222
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
241 } 241 }
242 242
243 /* Verify the descriptor CRC */ 243 /* Verify the descriptor CRC */
244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || 244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, 245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
246 bh->b_data + sizeof(tag), 246 bh->b_data + sizeof(struct tag),
247 le16_to_cpu(tag_p->descCRCLength))) 247 le16_to_cpu(tag_p->descCRCLength)))
248 return bh; 248 return bh;
249 249
@@ -255,27 +255,28 @@ error_out:
255 return NULL; 255 return NULL;
256} 256}
257 257
258struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, 258struct buffer_head *udf_read_ptagged(struct super_block *sb,
259 struct kernel_lb_addr *loc,
259 uint32_t offset, uint16_t *ident) 260 uint32_t offset, uint16_t *ident)
260{ 261{
261 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), 262 return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
262 loc.logicalBlockNum + offset, ident); 263 loc->logicalBlockNum + offset, ident);
263} 264}
264 265
265void udf_update_tag(char *data, int length) 266void udf_update_tag(char *data, int length)
266{ 267{
267 tag *tptr = (tag *)data; 268 struct tag *tptr = (struct tag *)data;
268 length -= sizeof(tag); 269 length -= sizeof(struct tag);
269 270
270 tptr->descCRCLength = cpu_to_le16(length); 271 tptr->descCRCLength = cpu_to_le16(length);
271 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); 272 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
272 tptr->tagChecksum = udf_tag_checksum(tptr); 273 tptr->tagChecksum = udf_tag_checksum(tptr);
273} 274}
274 275
275void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, 276void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
276 uint32_t loc, int length) 277 uint32_t loc, int length)
277{ 278{
278 tag *tptr = (tag *)data; 279 struct tag *tptr = (struct tag *)data;
279 tptr->tagIdent = cpu_to_le16(ident); 280 tptr->tagIdent = cpu_to_le16(ident);
280 tptr->descVersion = cpu_to_le16(version); 281 tptr->descVersion = cpu_to_le16(version);
281 tptr->tagSerialNum = cpu_to_le16(snum); 282 tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
283 udf_update_tag(data, length); 284 udf_update_tag(data, length);
284} 285}
285 286
286u8 udf_tag_checksum(const tag *t) 287u8 udf_tag_checksum(const struct tag *t)
287{ 288{
288 u8 *data = (u8 *)t; 289 u8 *data = (u8 *)t;
289 u8 checksum = 0; 290 u8 checksum = 0;
290 int i; 291 int i;
291 for (i = 0; i < sizeof(tag); ++i) 292 for (i = 0; i < sizeof(struct tag); ++i)
292 if (i != 4) /* position of checksum */ 293 if (i != 4) /* position of checksum */
293 checksum += data[i]; 294 checksum += data[i];
294 return checksum; 295 return checksum;
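udf_tag_checksum() above implements the ECMA-167 rule: the descriptor tag's checksum is the byte sum of the 16-byte tag with the checksum byte itself (offset 4) skipped, so the stored value never feeds into its own computation. A standalone sketch of the same rule — the field layout follows the spec, the names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16-byte descriptor tag; layout per ECMA-167, with tagChecksum at
 * byte offset 4 exactly as the loop in udf_tag_checksum() assumes. */
struct tag {
	uint16_t tagIdent;
	uint16_t descVersion;
	uint8_t  tagChecksum;
	uint8_t  reserved;
	uint16_t tagSerialNum;
	uint16_t descCRC;
	uint16_t descCRCLength;
	uint32_t tagLocation;
} __attribute__((packed));

/* Same rule as the kernel routine: byte-sum the whole tag while
 * skipping the checksum byte itself. */
static uint8_t tag_checksum(const struct tag *t)
{
	const uint8_t *data = (const uint8_t *)t;
	uint8_t checksum = 0;
	int i;

	for (i = 0; i < (int)sizeof(struct tag); i++)
		if (i != 4)	/* position of tagChecksum */
			checksum += data[i];
	return checksum;
}

int main(void)
{
	struct tag t;

	memset(&t, 0xAB, sizeof(t));
	t.tagChecksum = tag_checksum(&t);
	printf("checksum = 0x%02x\n", (unsigned)t.tagChecksum);
	return 0;
}

The CRC is separate from the checksum: descCRC covers the descriptor body after the tag, which is why every crc_itu_t() call above starts at data + sizeof(struct tag).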
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, 47 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
48 uint8_t *impuse, uint8_t *fileident) 48 uint8_t *impuse, uint8_t *fileident)
49{ 49{
50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag); 50 uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
51 uint16_t crc; 51 uint16_t crc;
52 int offset; 52 int offset;
53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); 53 uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
99 memset(fibh->ebh->b_data, 0x00, padlen + offset); 99 memset(fibh->ebh->b_data, 0x00, padlen + offset);
100 } 100 }
101 101
102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), 102 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
103 sizeof(struct fileIdentDesc) - sizeof(tag)); 103 sizeof(struct fileIdentDesc) - sizeof(struct tag));
104 104
105 if (fibh->sbh == fibh->ebh) { 105 if (fibh->sbh == fibh->ebh) {
106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 106 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
107 crclen + sizeof(tag) - 107 crclen + sizeof(struct tag) -
108 sizeof(struct fileIdentDesc)); 108 sizeof(struct fileIdentDesc));
109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { 109 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
110 crc = crc_itu_t(crc, fibh->ebh->b_data + 110 crc = crc_itu_t(crc, fibh->ebh->b_data +
111 sizeof(struct fileIdentDesc) + 111 sizeof(struct fileIdentDesc) +
112 fibh->soffset, 112 fibh->soffset,
113 crclen + sizeof(tag) - 113 crclen + sizeof(struct tag) -
114 sizeof(struct fileIdentDesc)); 114 sizeof(struct fileIdentDesc));
115 } else { 115 } else {
116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, 116 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
157 kernel_lb_addr eloc; 157 struct kernel_lb_addr eloc;
158 uint32_t elen; 158 uint32_t elen;
159 sector_t offset; 159 sector_t offset;
160 struct extent_position epos = {}; 160 struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 171 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) 172 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
173 goto out_err; 173 goto out_err;
174 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 174 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 175 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 176 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
177 epos.offset -= sizeof(short_ad); 177 epos.offset -= sizeof(struct short_ad);
178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 178 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
179 epos.offset -= sizeof(long_ad); 179 epos.offset -= sizeof(struct long_ad);
180 } else 180 } else
181 offset = 0; 181 offset = 0;
182 182
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
268#ifdef UDF_RECOVERY 268#ifdef UDF_RECOVERY
269 /* temporary shorthand for specifying files by inode number */ 269 /* temporary shorthand for specifying files by inode number */
270 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 270 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
271 kernel_lb_addr lb = { 271 struct kernel_lb_addr lb = {
272 .logicalBlockNum = 0, 272 .logicalBlockNum = 0,
273 .partitionReferenceNum = 273 .partitionReferenceNum =
274 simple_strtoul(dentry->d_name.name + 3, 274 simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
283#endif /* UDF_RECOVERY */ 283#endif /* UDF_RECOVERY */
284 284
285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { 285 if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
286 struct kernel_lb_addr loc;
287
286 if (fibh.sbh != fibh.ebh) 288 if (fibh.sbh != fibh.ebh)
287 brelse(fibh.ebh); 289 brelse(fibh.ebh);
288 brelse(fibh.sbh); 290 brelse(fibh.sbh);
289 291
290 inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation)); 292 loc = lelb_to_cpu(cfi.icb.extLocation);
293 inode = udf_iget(dir->i_sb, &loc);
291 if (!inode) { 294 if (!inode) {
292 unlock_kernel(); 295 unlock_kernel();
293 return ERR_PTR(-EACCES); 296 return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
313 uint8_t lfi; 316 uint8_t lfi;
314 uint16_t liu; 317 uint16_t liu;
315 int block; 318 int block;
316 kernel_lb_addr eloc; 319 struct kernel_lb_addr eloc;
317 uint32_t elen = 0; 320 uint32_t elen = 0;
318 sector_t offset; 321 sector_t offset;
319 struct extent_position epos = {}; 322 struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
351 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 354 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
352 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { 355 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
353 block = udf_get_lb_pblock(dir->i_sb, 356 block = udf_get_lb_pblock(dir->i_sb,
354 dinfo->i_location, 0); 357 &dinfo->i_location, 0);
355 fibh->soffset = fibh->eoffset = sb->s_blocksize; 358 fibh->soffset = fibh->eoffset = sb->s_blocksize;
356 goto add; 359 goto add;
357 } 360 }
358 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 361 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
359 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 362 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
360 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 363 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
361 epos.offset -= sizeof(short_ad); 364 epos.offset -= sizeof(struct short_ad);
362 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 365 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
363 epos.offset -= sizeof(long_ad); 366 epos.offset -= sizeof(struct long_ad);
364 } else 367 } else
365 offset = 0; 368 offset = 0;
366 369
@@ -409,10 +412,10 @@ add:
409 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { 412 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
410 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); 413 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
411 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 414 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
412 epos.offset -= sizeof(short_ad); 415 epos.offset -= sizeof(struct short_ad);
413 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 416 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
414 epos.offset -= sizeof(long_ad); 417 epos.offset -= sizeof(struct long_ad);
415 udf_write_aext(dir, &epos, eloc, elen, 1); 418 udf_write_aext(dir, &epos, &eloc, elen, 1);
416 } 419 }
417 f_pos += nfidlen; 420 f_pos += nfidlen;
418 421
@@ -494,10 +497,10 @@ add:
494 memset(cfi, 0, sizeof(struct fileIdentDesc)); 497 memset(cfi, 0, sizeof(struct fileIdentDesc));
495 if (UDF_SB(sb)->s_udfrev >= 0x0200) 498 if (UDF_SB(sb)->s_udfrev >= 0x0200)
496 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, 499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
497 sizeof(tag)); 500 sizeof(struct tag));
498 else 501 else
499 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, 502 udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
500 sizeof(tag)); 503 sizeof(struct tag));
501 cfi->fileVersionNum = cpu_to_le16(1); 504 cfi->fileVersionNum = cpu_to_le16(1);
502 cfi->lengthFileIdent = namelen; 505 cfi->lengthFileIdent = namelen;
503 cfi->lengthOfImpUse = cpu_to_le16(0); 506 cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
530 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; 533 cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
531 534
532 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) 535 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
533 memset(&(cfi->icb), 0x00, sizeof(long_ad)); 536 memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
534 537
535 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); 538 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
536} 539}
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
710 loff_t f_pos; 713 loff_t f_pos;
711 loff_t size = udf_ext0_offset(dir) + dir->i_size; 714 loff_t size = udf_ext0_offset(dir) + dir->i_size;
712 int block; 715 int block;
713 kernel_lb_addr eloc; 716 struct kernel_lb_addr eloc;
714 uint32_t elen; 717 uint32_t elen;
715 sector_t offset; 718 sector_t offset;
716 struct extent_position epos = {}; 719 struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
724 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, 727 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
725 &epos, &eloc, &elen, &offset) == 728 &epos, &eloc, &elen, &offset) ==
726 (EXT_RECORDED_ALLOCATED >> 30)) { 729 (EXT_RECORDED_ALLOCATED >> 30)) {
727 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 730 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
728 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 731 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
729 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 732 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
730 epos.offset -= sizeof(short_ad); 733 epos.offset -= sizeof(struct short_ad);
731 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 734 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
732 epos.offset -= sizeof(long_ad); 735 epos.offset -= sizeof(struct long_ad);
733 } else 736 } else
734 offset = 0; 737 offset = 0;
735 738
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
778 struct inode *inode = dentry->d_inode; 781 struct inode *inode = dentry->d_inode;
779 struct udf_fileident_bh fibh; 782 struct udf_fileident_bh fibh;
780 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
781 kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
782 785
783 retval = -ENOENT; 786 retval = -ENOENT;
784 lock_kernel(); 787 lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
788 791
789 retval = -EIO; 792 retval = -EIO;
790 tloc = lelb_to_cpu(cfi.icb.extLocation); 793 tloc = lelb_to_cpu(cfi.icb.extLocation);
791 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 794 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
792 goto end_rmdir; 795 goto end_rmdir;
793 retval = -ENOTEMPTY; 796 retval = -ENOTEMPTY;
794 if (!empty_dir(inode)) 797 if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
824 struct udf_fileident_bh fibh; 827 struct udf_fileident_bh fibh;
825 struct fileIdentDesc *fi; 828 struct fileIdentDesc *fi;
826 struct fileIdentDesc cfi; 829 struct fileIdentDesc cfi;
827 kernel_lb_addr tloc; 830 struct kernel_lb_addr tloc;
828 831
829 retval = -ENOENT; 832 retval = -ENOENT;
830 lock_kernel(); 833 lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
834 837
835 retval = -EIO; 838 retval = -EIO;
836 tloc = lelb_to_cpu(cfi.icb.extLocation); 839 tloc = lelb_to_cpu(cfi.icb.extLocation);
837 if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino) 840 if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
838 goto end_unlink; 841 goto end_unlink;
839 842
840 if (!inode->i_nlink) { 843 if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
897 inode->i_op = &page_symlink_inode_operations; 900 inode->i_op = &page_symlink_inode_operations;
898 901
899 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 902 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
900 kernel_lb_addr eloc; 903 struct kernel_lb_addr eloc;
901 uint32_t bsize; 904 uint32_t bsize;
902 905
903 block = udf_new_block(inode->i_sb, inode, 906 block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
913 iinfo->i_location.partitionReferenceNum; 916 iinfo->i_location.partitionReferenceNum;
914 bsize = inode->i_sb->s_blocksize; 917 bsize = inode->i_sb->s_blocksize;
915 iinfo->i_lenExtents = bsize; 918 iinfo->i_lenExtents = bsize;
916 udf_add_aext(inode, &epos, eloc, bsize, 0); 919 udf_add_aext(inode, &epos, &eloc, bsize, 0);
917 brelse(epos.bh); 920 brelse(epos.bh);
918 921
919 block = udf_get_pblock(inode->i_sb, block, 922 block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1108 struct fileIdentDesc ocfi, ncfi; 1111 struct fileIdentDesc ocfi, ncfi;
1109 struct buffer_head *dir_bh = NULL; 1112 struct buffer_head *dir_bh = NULL;
1110 int retval = -ENOENT; 1113 int retval = -ENOENT;
1111 kernel_lb_addr tloc; 1114 struct kernel_lb_addr tloc;
1112 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1115 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1113 1116
1114 lock_kernel(); 1117 lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1119 brelse(ofibh.sbh); 1122 brelse(ofibh.sbh);
1120 } 1123 }
1121 tloc = lelb_to_cpu(ocfi.icb.extLocation); 1124 tloc = lelb_to_cpu(ocfi.icb.extLocation);
1122 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) 1125 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
1123 != old_inode->i_ino) 1126 != old_inode->i_ino)
1124 goto end_rename; 1127 goto end_rename;
1125 1128
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1158 if (!dir_fi) 1161 if (!dir_fi)
1159 goto end_rename; 1162 goto end_rename;
1160 tloc = lelb_to_cpu(dir_fi->icb.extLocation); 1163 tloc = lelb_to_cpu(dir_fi->icb.extLocation);
1161 if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != 1164 if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
1162 old_dir->i_ino) 1165 old_dir->i_ino)
1163 goto end_rename; 1166 goto end_rename;
1164 1167
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1187 */ 1190 */
1188 ncfi.fileVersionNum = ocfi.fileVersionNum; 1191 ncfi.fileVersionNum = ocfi.fileVersionNum;
1189 ncfi.fileCharacteristics = ocfi.fileCharacteristics; 1192 ncfi.fileCharacteristics = ocfi.fileCharacteristics;
1190 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); 1193 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
1191 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); 1194 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
1192 1195
1193 /* The old fid may have moved - find it again */ 1196 /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
1242 1245
1243static struct dentry *udf_get_parent(struct dentry *child) 1246static struct dentry *udf_get_parent(struct dentry *child)
1244{ 1247{
1248 struct kernel_lb_addr tloc;
1245 struct inode *inode = NULL; 1249 struct inode *inode = NULL;
1246 struct qstr dotdot = {.name = "..", .len = 2}; 1250 struct qstr dotdot = {.name = "..", .len = 2};
1247 struct fileIdentDesc cfi; 1251 struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
1255 brelse(fibh.ebh); 1259 brelse(fibh.ebh);
1256 brelse(fibh.sbh); 1260 brelse(fibh.sbh);
1257 1261
1258 inode = udf_iget(child->d_inode->i_sb, 1262 tloc = lelb_to_cpu(cfi.icb.extLocation);
1259 lelb_to_cpu(cfi.icb.extLocation)); 1263 inode = udf_iget(child->d_inode->i_sb, &tloc);
1260 if (!inode) 1264 if (!inode)
1261 goto out_unlock; 1265 goto out_unlock;
1262 unlock_kernel(); 1266 unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
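Several namei.c hunks (udf_lookup(), udf_get_parent()) add a local struct kernel_lb_addr just to hold lelb_to_cpu()'s result before calling udf_iget(). That is forced by C itself, as this sketch with stand-in helpers shows:

#include <stdint.h>

struct kernel_lb_addr {
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
};

/* Stand-ins for the kernel helpers used in udf_lookup(). */
static struct kernel_lb_addr lelb_to_cpu(struct kernel_lb_addr le)
{
	return le;	/* byte swapping elided */
}

static void udf_iget(const struct kernel_lb_addr *loc)
{
	(void)loc;	/* inode lookup elided */
}

static void lookup_example(struct kernel_lb_addr on_disk)
{
	struct kernel_lb_addr loc;

	/* udf_iget(sb, &lelb_to_cpu(...)) would not compile: a function's
	 * return value is an rvalue and has no address in C.  Hence the
	 * named local the diff introduces. */
	loc = lelb_to_cpu(on_disk);
	udf_iget(&loc);
}

int main(void)
{
	struct kernel_lb_addr raw = { 42, 0 };

	lookup_example(raw);
	return 0;
}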
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
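All of these osta_udf.h records are __attribute__((packed)) structs of fixed-width little-endian fields, so switching the member types from typedefs to struct tags cannot change the on-disk layout. A cut-down userspace sketch of the pattern — the field names are loosely modeled on struct sparingEntry, not the real definition:

#include <stdint.h>
#include <stdio.h>

struct sparing_entry {
	uint32_t orig_location;		/* __le32 on disk */
	uint8_t  mapped_location[4];	/* raw little-endian bytes */
} __attribute__((packed));

/* Assemble a little-endian 32-bit field from its raw bytes; in the
 * kernel le32_to_cpu() does this job. */
static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	struct sparing_entry e = {
		.orig_location = 0,
		.mapped_location = { 0x78, 0x56, 0x34, 0x12 },
	};

	/* packed => no padding, so sizeof matches the on-disk record */
	printf("sizeof = %zu, mapped = 0x%08x\n",
	       sizeof(e), (unsigned)get_le32(e.mapped_location));
	return 0;
}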
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
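The udf_show_options() changes above follow one pattern: print an option only when it differs from its unset sentinel, so /proc/mounts stays minimal. A userspace mimic of that flow — UDF_INVALID_MODE mirrors the sentinel this commit introduces, and fprintf() stands in for seq_printf():

#include <stdio.h>

#define UDF_INVALID_MODE ((unsigned int)-1)

static void show_options(FILE *seq, unsigned int fmode, unsigned int dmode,
			 unsigned int anchor)
{
	if (fmode != UDF_INVALID_MODE)
		fprintf(seq, ",mode=%o", fmode);
	if (dmode != UDF_INVALID_MODE)
		fprintf(seq, ",dmode=%o", dmode);
	if (anchor != 0)	/* a single block now, not the old s_anchor[] */
		fprintf(seq, ",anchor=%u", anchor);
}

int main(void)
{
	/* only mode= was set, so only mode= is shown */
	show_options(stdout, 0644, UDF_INVALID_MODE, 0);
	putchar('\n');
	return 0;
}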
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
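udf_load_pvoldesc() above trades two large on-stack struct ustr buffers for kmalloc(..., GFP_NOFS) allocations with goto-label unwinding; GFP_NOFS keeps the allocator from recursing back into filesystem code while the mount is in flight. A userspace mimic of the unwinding shape — malloc() stands in for kmalloc(), and struct ustr is trimmed to a plausible shape:

#include <stdio.h>
#include <stdlib.h>

struct ustr {
	char u_name[128];
	int  u_len;
};

static int load_pvoldesc_sketch(void)
{
	struct ustr *instr, *outstr;
	int ret = 1;	/* pessimistic default, as in the diff */

	instr = malloc(sizeof(struct ustr));
	if (!instr)
		return 1;

	outstr = malloc(sizeof(struct ustr));
	if (!outstr)
		goto out1;	/* only instr exists yet */

	/* ... read the descriptor and convert identifiers here ... */
	ret = 0;

	free(outstr);
out1:
	free(instr);
	return ret;
}

int main(void)
{
	printf("ret = %d\n", load_pvoldesc_sketch());
	return 0;
}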
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
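The new udf_load_sequence() converts each extent from the anchor's (location, byte length) pair into a block range before walking it: shift the length down by s_blocksize_bits and add the start block. A small sketch of that arithmetic with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ext_location = 256;	/* first block of the sequence */
	uint32_t ext_length = 32768;	/* bytes, from the anchor descriptor */
	unsigned int blocksize_bits = 11;	/* 2048-byte blocks */

	uint32_t main_s = ext_location;
	uint32_t main_e = (ext_length >> blocksize_bits) + main_s;

	/* blocks main_s..main_e bound the main volume descriptor sequence */
	printf("sequence spans blocks %u..%u\n",
	       (unsigned)main_s, (unsigned)main_e);
	return 0;
}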
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_lastblock to be the last
1708 main_e = main_e >> sb->s_blocksize_bits; 1660 * block on the media.
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try to not convert number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
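
For readers tracing the new anchor discovery above, the sketch below restates the candidate order of udf_scan_anchors() as a standalone C function. It is an illustration only: probe() stands in for udf_check_anchor_block(), and nr_blocks for the device-size check against bd_inode->i_size.

#include <stdint.h>

typedef uint64_t sector_t;

/* Stand-in for udf_check_anchor_block(): nonzero means a valid AVDP
 * was found at the given block and the VDS was loaded. */
typedef int (*probe_fn)(sector_t block);

/* Candidate order mirrored from the hunk above: session+256 first,
 * then the reported last block and nearby guesses (drives often
 * misreport it), each retried 256 blocks earlier, and finally
 * session+512 for media that were never closed. */
static sector_t scan_anchors(sector_t session, sector_t lastblock,
                             sector_t nr_blocks, probe_fn probe)
{
    sector_t last[6];
    int last_count = 0, i;

    if (probe(session + 256))
        return lastblock;

    last[last_count++] = lastblock;
    if (lastblock >= 1)
        last[last_count++] = lastblock - 1;
    last[last_count++] = lastblock + 1;
    if (lastblock >= 2)
        last[last_count++] = lastblock - 2;
    if (lastblock >= 150)
        last[last_count++] = lastblock - 150;
    if (lastblock >= 152)
        last[last_count++] = lastblock - 152;

    for (i = 0; i < last_count; i++) {
        if (last[i] >= nr_blocks)       /* candidate past end of device */
            continue;
        if (probe(last[i]))
            return last[i];
        if (last[i] >= 256 && probe(last[i] - 256))
            return last[i];
    }

    if (probe(session + 512))
        return last[0];
    return 0;                           /* no anchor found */
}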
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
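
Both udf_open_lvid() and udf_close_lvid() end by refreshing descTag.descCRC via crc_itu_t() over the bytes that follow the tag. For reference, a bit-at-a-time sketch of that checksum (CRC16 ITU-T V.41, polynomial 0x1021, MSB first, zero seed; the kernel's lib/crc-itu-t.c is a table-driven equivalent):

#include <stdint.h>
#include <stddef.h>

/* CRC16 ITU-T V.41 (x^16 + x^12 + x^5 + 1), MSB-first. For a UDF
 * descriptor it runs over descCRCLength bytes starting just past the
 * 16-byte tag, with an initial value of 0. */
static uint16_t crc_itu_t_sketch(uint16_t crc, const uint8_t *buf, size_t len)
{
    size_t i;
    int bit;

    for (i = 0; i < len; i++) {
        crc ^= (uint16_t)buf[i] << 8;
        for (bit = 0; bit < 8; bit++)
            crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x1021)
                                 : (uint16_t)(crc << 1);
    }
    return crc;
}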
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1892 sbi->s_uid = uopt.uid; 1893 sbi->s_uid = uopt.uid;
1893 sbi->s_gid = uopt.gid; 1894 sbi->s_gid = uopt.gid;
1894 sbi->s_umask = uopt.umask; 1895 sbi->s_umask = uopt.umask;
1896 sbi->s_fmode = uopt.fmode;
1897 sbi->s_dmode = uopt.dmode;
1895 sbi->s_nls_map = uopt.nls_map; 1898 sbi->s_nls_map = uopt.nls_map;
1896 1899
1897 /* Set the block size for all transfers */
1898 if (!sb_min_blocksize(sb, uopt.blocksize)) {
1899 udf_debug("Bad block size (%d)\n", uopt.blocksize);
1900 printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
1901 goto error_out;
1902 }
1903
1904 if (uopt.session == 0xFFFFFFFF) 1900 if (uopt.session == 0xFFFFFFFF)
1905 sbi->s_session = udf_get_last_session(sb); 1901 sbi->s_session = udf_get_last_session(sb);
1906 else 1902 else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1908 1904
1909 udf_debug("Multi-session=%d\n", sbi->s_session); 1905 udf_debug("Multi-session=%d\n", sbi->s_session);
1910 1906
1911 sbi->s_last_block = uopt.lastblock;
1912 sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
1913 sbi->s_anchor[2] = uopt.anchor;
1914
1915 if (udf_check_valid(sb, uopt.novrs, silent)) {
1916 /* read volume recognition sequences */
1917 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1918 goto error_out;
1919 }
1920
1921 udf_find_anchor(sb);
1922
1923 /* Fill in the rest of the superblock */ 1907 /* Fill in the rest of the superblock */
1924 sb->s_op = &udf_sb_ops; 1908 sb->s_op = &udf_sb_ops;
1925 sb->s_export_op = &udf_export_ops; 1909 sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1928 sb->s_magic = UDF_SUPER_MAGIC; 1912 sb->s_magic = UDF_SUPER_MAGIC;
1929 sb->s_time_gran = 1000; 1913 sb->s_time_gran = 1000;
1930 1914
1931 if (udf_load_sequence(sb, &fileset)) { 1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent)
1922 printk(KERN_NOTICE
1923 "UDF-fs: Rescanning with blocksize "
1924 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1925 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1926 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1927 }
1928 }
1929 if (!ret) {
1932 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1930 printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
1933 goto error_out; 1931 goto error_out;
1934 } 1932 }
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1978 } 1976 }
1979 1977
1980 if (!silent) { 1978 if (!silent) {
1981 timestamp ts; 1979 struct timestamp ts;
1982 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 1980 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
1983 udf_info("UDF: Mounting volume '%s', " 1981 udf_info("UDF: Mounting volume '%s', "
1984 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 1982 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1991 /* Assign the root inode */ 1989 /* Assign the root inode */
1992 /* assign inodes by physical block number */ 1990 /* assign inodes by physical block number */
1993 /* perhaps it's not extensible enough, but for now ... */ 1991 /* perhaps it's not extensible enough, but for now ... */
1994 inode = udf_iget(sb, rootdir); 1992 inode = udf_iget(sb, &rootdir);
1995 if (!inode) { 1993 if (!inode) {
1996 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 1994 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
1997 "partition=%d\n", 1995 "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
2081 sb->s_fs_info = NULL; 2079 sb->s_fs_info = NULL;
2082} 2080}
2083 2081
2082static int udf_sync_fs(struct super_block *sb, int wait)
2083{
2084 struct udf_sb_info *sbi = UDF_SB(sb);
2085
2086 mutex_lock(&sbi->s_alloc_mutex);
2087 if (sbi->s_lvid_dirty) {
2088 /*
2089 * Blockdevice will be synced later so we don't have to submit
2090 * the buffer for IO
2091 */
2092 mark_buffer_dirty(sbi->s_lvid_bh);
2093 sb->s_dirt = 0;
2094 sbi->s_lvid_dirty = 0;
2095 }
2096 mutex_unlock(&sbi->s_alloc_mutex);
2097
2098 return 0;
2099}
2100
2084static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) 2101static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2085{ 2102{
2086 struct super_block *sb = dentry->d_sb; 2103 struct super_block *sb = dentry->d_sb;
2087 struct udf_sb_info *sbi = UDF_SB(sb); 2104 struct udf_sb_info *sbi = UDF_SB(sb);
2088 struct logicalVolIntegrityDescImpUse *lvidiu; 2105 struct logicalVolIntegrityDescImpUse *lvidiu;
2106 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
2089 2107
2090 if (sbi->s_lvid_bh != NULL) 2108 if (sbi->s_lvid_bh != NULL)
2091 lvidiu = udf_sb_lvidiu(sbi); 2109 lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2101 le32_to_cpu(lvidiu->numDirs)) : 0) 2119 le32_to_cpu(lvidiu->numDirs)) : 0)
2102 + buf->f_bfree; 2120 + buf->f_bfree;
2103 buf->f_ffree = buf->f_bfree; 2121 buf->f_ffree = buf->f_bfree;
2104 /* __kernel_fsid_t f_fsid */
2105 buf->f_namelen = UDF_NAME_LEN - 2; 2122 buf->f_namelen = UDF_NAME_LEN - 2;
2123 buf->f_fsid.val[0] = (u32)id;
2124 buf->f_fsid.val[1] = (u32)(id >> 32);
2106 2125
2107 return 0; 2126 return 0;
2108} 2127}
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2114 unsigned int accum = 0; 2133 unsigned int accum = 0;
2115 int index; 2134 int index;
2116 int block = 0, newblock; 2135 int block = 0, newblock;
2117 kernel_lb_addr loc; 2136 struct kernel_lb_addr loc;
2118 uint32_t bytes; 2137 uint32_t bytes;
2119 uint8_t *ptr; 2138 uint8_t *ptr;
2120 uint16_t ident; 2139 uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2124 2143
2125 loc.logicalBlockNum = bitmap->s_extPosition; 2144 loc.logicalBlockNum = bitmap->s_extPosition;
2126 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2145 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2127 bh = udf_read_ptagged(sb, loc, 0, &ident); 2146 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2128 2147
2129 if (!bh) { 2148 if (!bh) {
2130 printk(KERN_ERR "udf: udf_count_free failed\n"); 2149 printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2147 bytes -= cur_bytes; 2166 bytes -= cur_bytes;
2148 if (bytes) { 2167 if (bytes) {
2149 brelse(bh); 2168 brelse(bh);
2150 newblock = udf_get_lb_pblock(sb, loc, ++block); 2169 newblock = udf_get_lb_pblock(sb, &loc, ++block);
2151 bh = udf_tread(sb, newblock); 2170 bh = udf_tread(sb, newblock);
2152 if (!bh) { 2171 if (!bh) {
2153 udf_debug("read failed\n"); 2172 udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2170{ 2189{
2171 unsigned int accum = 0; 2190 unsigned int accum = 0;
2172 uint32_t elen; 2191 uint32_t elen;
2173 kernel_lb_addr eloc; 2192 struct kernel_lb_addr eloc;
2174 int8_t etype; 2193 int8_t etype;
2175 struct extent_position epos; 2194 struct extent_position epos;
2176 2195
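
A note on the udf_fill_super() flow above: when no blocksize option was given, the mount first probes with the device hardware sector size and rescans once with the UDF default of 2048 bytes. A hedged sketch of that retry shape (load_vrs() and struct opts are stand-ins for udf_load_vrs() and udf_options):

#include <stdio.h>

#define UDF_DEFAULT_BLOCKSIZE 2048

struct opts { int blocksize; };

/* Stand-in for udf_load_vrs(): 1 on success, 0 on failure. */
static int load_vrs(struct opts *o) { return o->blocksize == 2048; }

static int mount_with_retry(int user_blocksize, int hardsect_size)
{
    struct opts o;
    int ret;

    if (user_blocksize) {               /* explicit bs= option: use it */
        o.blocksize = user_blocksize;
        return load_vrs(&o);
    }
    o.blocksize = hardsect_size;        /* first try the sector size */
    ret = load_vrs(&o);
    if (!ret && o.blocksize != UDF_DEFAULT_BLOCKSIZE) {
        printf("rescanning with blocksize %d\n", UDF_DEFAULT_BLOCKSIZE);
        o.blocksize = UDF_DEFAULT_BLOCKSIZE;
        ret = load_vrs(&o);             /* one retry with the default */
    }
    return ret;
}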
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30static void extent_trunc(struct inode *inode, struct extent_position *epos, 30static void extent_trunc(struct inode *inode, struct extent_position *epos,
31 kernel_lb_addr eloc, int8_t etype, uint32_t elen, 31 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
32 uint32_t nelen) 32 uint32_t nelen)
33{ 33{
34 kernel_lb_addr neloc = {}; 34 struct kernel_lb_addr neloc = {};
35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >> 35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
36 inode->i_sb->s_blocksize_bits; 36 inode->i_sb->s_blocksize_bits;
37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> 37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
43 last_block); 43 last_block);
44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); 44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
45 } else 45 } else
46 neloc = eloc; 46 neloc = *eloc;
47 nelen = (etype << 30) | nelen; 47 nelen = (etype << 30) | nelen;
48 } 48 }
49 49
50 if (elen != nelen) { 50 if (elen != nelen) {
51 udf_write_aext(inode, epos, neloc, nelen, 0); 51 udf_write_aext(inode, epos, &neloc, nelen, 0);
52 if (last_block - first_block > 0) { 52 if (last_block - first_block > 0) {
53 if (etype == (EXT_RECORDED_ALLOCATED >> 30)) 53 if (etype == (EXT_RECORDED_ALLOCATED >> 30))
54 mark_inode_dirty(inode); 54 mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
68void udf_truncate_tail_extent(struct inode *inode) 68void udf_truncate_tail_extent(struct inode *inode)
69{ 69{
70 struct extent_position epos = {}; 70 struct extent_position epos = {};
71 kernel_lb_addr eloc; 71 struct kernel_lb_addr eloc;
72 uint32_t elen, nelen; 72 uint32_t elen, nelen;
73 uint64_t lbcount = 0; 73 uint64_t lbcount = 0;
74 int8_t etype = -1, netype; 74 int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
83 return; 83 return;
84 84
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 adsize = sizeof(short_ad); 86 adsize = sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
88 adsize = sizeof(long_ad); 88 adsize = sizeof(struct long_ad);
89 else 89 else
90 BUG(); 90 BUG();
91 91
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
106 (unsigned)elen); 106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 107 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 108 epos.offset -= adsize;
109 extent_trunc(inode, &epos, eloc, etype, elen, nelen); 109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 110 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 112 printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
124void udf_discard_prealloc(struct inode *inode) 124void udf_discard_prealloc(struct inode *inode)
125{ 125{
126 struct extent_position epos = { NULL, 0, {0, 0} }; 126 struct extent_position epos = { NULL, 0, {0, 0} };
127 kernel_lb_addr eloc; 127 struct kernel_lb_addr eloc;
128 uint32_t elen; 128 uint32_t elen;
129 uint64_t lbcount = 0; 129 uint64_t lbcount = 0;
130 int8_t etype = -1, netype; 130 int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
136 return; 136 return;
137 137
138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
139 adsize = sizeof(short_ad); 139 adsize = sizeof(struct short_ad);
140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
141 adsize = sizeof(long_ad); 141 adsize = sizeof(struct long_ad);
142 else 142 else
143 adsize = 0; 143 adsize = 0;
144 144
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
153 epos.offset -= adsize; 153 epos.offset -= adsize;
154 lbcount -= elen; 154 lbcount -= elen;
155 extent_trunc(inode, &epos, eloc, etype, elen, 0); 155 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
156 if (!epos.bh) { 156 if (!epos.bh) {
157 iinfo->i_lenAlloc = 157 iinfo->i_lenAlloc =
158 epos.offset - 158 epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
200void udf_truncate_extents(struct inode *inode) 200void udf_truncate_extents(struct inode *inode)
201{ 201{
202 struct extent_position epos; 202 struct extent_position epos;
203 kernel_lb_addr eloc, neloc = {}; 203 struct kernel_lb_addr eloc, neloc = {};
204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; 204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
205 int8_t etype; 205 int8_t etype;
206 struct super_block *sb = inode->i_sb; 206 struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
210 struct udf_inode_info *iinfo = UDF_I(inode); 210 struct udf_inode_info *iinfo = UDF_I(inode);
211 211
212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
213 adsize = sizeof(short_ad); 213 adsize = sizeof(struct short_ad);
214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
215 adsize = sizeof(long_ad); 215 adsize = sizeof(struct long_ad);
216 else 216 else
217 BUG(); 217 BUG();
218 218
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
221 (inode->i_size & (sb->s_blocksize - 1)); 221 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 222 if (etype != -1) {
223 epos.offset -= adsize; 223 epos.offset -= adsize;
224 extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); 224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
225 epos.offset += adsize; 225 epos.offset += adsize;
226 if (byte_offset) 226 if (byte_offset)
227 lenalloc = epos.offset; 227 lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
236 while ((etype = udf_current_aext(inode, &epos, &eloc, 236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) { 237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { 238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, neloc, nelen, 0); 239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) { 240 if (indirect_ext_len) {
241 /* We managed to free all extents in the 241 /* We managed to free all extents in the
242 * indirect extent - free it too */ 242 * indirect extent - free it too */
243 BUG_ON(!epos.bh); 243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, epos.block, 244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len); 245 0, indirect_ext_len);
246 } else if (!epos.bh) { 246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc; 247 iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
253 epos.offset = sizeof(struct allocExtDesc); 253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc; 254 epos.block = eloc;
255 epos.bh = udf_tread(sb, 255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, eloc, 0)); 256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen) 257 if (elen)
258 indirect_ext_len = 258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >> 259 (elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
261 else 261 else
262 indirect_ext_len = 1; 262 indirect_ext_len = 1;
263 } else { 263 } else {
264 extent_trunc(inode, &epos, eloc, etype, 264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0); 265 elen, 0);
266 epos.offset += adsize; 266 epos.offset += adsize;
267 } 267 }
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
269 269
270 if (indirect_ext_len) { 270 if (indirect_ext_len) {
271 BUG_ON(!epos.bh); 271 BUG_ON(!epos.bh);
272 udf_free_blocks(sb, inode, epos.block, 0, 272 udf_free_blocks(sb, inode, &epos.block, 0,
273 indirect_ext_len); 273 indirect_ext_len);
274 } else if (!epos.bh) { 274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc; 275 iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc); 278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) { 279 } else if (inode->i_size) {
280 if (byte_offset) { 280 if (byte_offset) {
281 kernel_long_ad extent; 281 struct kernel_long_ad extent;
282 282
283 /* 283 /*
284 * OK, there is not extent covering inode->i_size and 284 * OK, there is not extent covering inode->i_size and
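
extent_trunc() above computes first_block and last_block by rounding byte lengths up to whole filesystem blocks. The ceil-division idiom in isolation (in the kernel the shift count comes from sb->s_blocksize_bits):

#include <stdint.h>

/* (len + blocksize - 1) >> blocksize_bits, as used for last_block
 * and first_block in extent_trunc(). */
static uint64_t bytes_to_blocks(uint64_t len, unsigned blocksize_bits)
{
    uint64_t blocksize = 1ULL << blocksize_bits;
    return (len + blocksize - 1) >> blocksize_bits;
}

/* Example: with 2048-byte blocks (blocksize_bits == 11),
 * bytes_to_blocks(1, 11) == 1 and bytes_to_blocks(4096, 11) == 2,
 * so the blocks between first_block and last_block can be freed. */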
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
4struct udf_inode_info { 4struct udf_inode_info {
5 struct timespec i_crtime; 5 struct timespec i_crtime;
6 /* Physical address of inode */ 6 /* Physical address of inode */
7 kernel_lb_addr i_location; 7 struct kernel_lb_addr i_location;
8 __u64 i_unique; 8 __u64 i_unique;
9 __u32 i_lenEAttr; 9 __u32 i_lenEAttr;
10 __u32 i_lenAlloc; 10 __u32 i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
17 unsigned i_strat4096 : 1; 17 unsigned i_strat4096 : 1;
18 unsigned reserved : 26; 18 unsigned reserved : 26;
19 union { 19 union {
20 short_ad *i_sad; 20 struct short_ad *i_sad;
21 long_ad *i_lad; 21 struct long_ad *i_lad;
22 __u8 *i_data; 22 __u8 *i_data;
23 } i_ext; 23 } i_ext;
24 struct inode vfs_inode; 24 struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
30#define UDF_FLAG_GID_SET 16 30#define UDF_FLAG_GID_SET 16
31#define UDF_FLAG_SESSION_SET 17 31#define UDF_FLAG_SESSION_SET 17
32#define UDF_FLAG_LASTBLOCK_SET 18 32#define UDF_FLAG_LASTBLOCK_SET 18
33#define UDF_FLAG_BLOCKSIZE_SET 19
33 34
34#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 35#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
35#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 36#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
@@ -48,6 +49,8 @@
48#define UDF_SPARABLE_MAP15 0x1522U 49#define UDF_SPARABLE_MAP15 0x1522U
49#define UDF_METADATA_MAP25 0x2511U 50#define UDF_METADATA_MAP25 0x2511U
50 51
52#define UDF_INVALID_MODE ((mode_t)-1)
53
51#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 54#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
52 55
53struct udf_meta_data { 56struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
114 117
115 /* Sector headers */ 118 /* Sector headers */
116 __s32 s_session; 119 __s32 s_session;
117 __u32 s_anchor[3]; 120 __u32 s_anchor;
118 __u32 s_last_block; 121 __u32 s_last_block;
119 122
120 struct buffer_head *s_lvid_bh; 123 struct buffer_head *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
123 mode_t s_umask; 126 mode_t s_umask;
124 gid_t s_gid; 127 gid_t s_gid;
125 uid_t s_uid; 128 uid_t s_uid;
129 mode_t s_fmode;
130 mode_t s_dmode;
126 131
127 /* Root Info */ 132 /* Root Info */
128 struct timespec s_record_time; 133 struct timespec s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
143 struct inode *s_vat_inode; 148 struct inode *s_vat_inode;
144 149
145 struct mutex s_alloc_mutex; 150 struct mutex s_alloc_mutex;
151 /* Protected by s_alloc_mutex */
152 unsigned int s_lvid_dirty;
146}; 153};
147 154
148static inline struct udf_sb_info *UDF_SB(struct super_block *sb) 155static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
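
The new UDF_INVALID_MODE sentinel above marks "no fmode=/dmode= override given". A sketch of how such a sentinel is typically consumed when permissions are computed (the helper name is illustrative, not from the patch):

#include <sys/types.h>

#define UDF_INVALID_MODE ((mode_t)-1)

/* (mode_t)-1 cannot collide with a real mode, so it can flag
 * "keep the mode read from the media". */
static mode_t apply_mode_override(mode_t on_disk, mode_t override)
{
    return (override == UDF_INVALID_MODE) ? on_disk : override;
}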
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
62 return 0; 62 return 0;
63} 63}
64 64
65#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
66
67/* computes tag checksum */ 65/* computes tag checksum */
68u8 udf_tag_checksum(const tag *t); 66u8 udf_tag_checksum(const struct tag *t);
69 67
70struct dentry; 68struct dentry;
71struct inode; 69struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
95}; 93};
96 94
97struct generic_desc { 95struct generic_desc {
98 tag descTag; 96 struct tag descTag;
99 __le32 volDescSeqNum; 97 __le32 volDescSeqNum;
100}; 98};
101 99
@@ -108,11 +106,22 @@ struct ustr {
108struct extent_position { 106struct extent_position {
109 struct buffer_head *bh; 107 struct buffer_head *bh;
110 uint32_t offset; 108 uint32_t offset;
111 kernel_lb_addr block; 109 struct kernel_lb_addr block;
112}; 110};
113 111
114/* super.c */ 112/* super.c */
115extern void udf_warning(struct super_block *, const char *, const char *, ...); 113extern void udf_warning(struct super_block *, const char *, const char *, ...);
114static inline void udf_updated_lvid(struct super_block *sb)
115{
116 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
117
118 BUG_ON(!bh);
119 WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
120 bh->b_data)->integrityType !=
121 cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
122 sb->s_dirt = 1;
123 UDF_SB(sb)->s_lvid_dirty = 1;
124}
116 125
117/* namei.c */ 126/* namei.c */
118extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 127extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
124 unsigned long); 133 unsigned long);
125 134
126/* inode.c */ 135/* inode.c */
127extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
128extern int udf_sync_inode(struct inode *); 137extern int udf_sync_inode(struct inode *);
129extern void udf_expand_file_adinicb(struct inode *, int, int *); 138extern void udf_expand_file_adinicb(struct inode *, int, int *);
130extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 139extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
136extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, int);
137extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
138extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
139 kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
140extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
141 kernel_lb_addr *, uint32_t *, sector_t *); 150 struct kernel_lb_addr *, uint32_t *, sector_t *);
142extern int8_t udf_add_aext(struct inode *, struct extent_position *, 151extern int8_t udf_add_aext(struct inode *, struct extent_position *,
143 kernel_lb_addr, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
144extern int8_t udf_write_aext(struct inode *, struct extent_position *, 153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
145 kernel_lb_addr, uint32_t, int); 154 struct kernel_lb_addr *, uint32_t, int);
146extern int8_t udf_delete_aext(struct inode *, struct extent_position, 155extern int8_t udf_delete_aext(struct inode *, struct extent_position,
147 kernel_lb_addr, uint32_t); 156 struct kernel_lb_addr, uint32_t);
148extern int8_t udf_next_aext(struct inode *, struct extent_position *, 157extern int8_t udf_next_aext(struct inode *, struct extent_position *,
149 kernel_lb_addr *, uint32_t *, int); 158 struct kernel_lb_addr *, uint32_t *, int);
150extern int8_t udf_current_aext(struct inode *, struct extent_position *, 159extern int8_t udf_current_aext(struct inode *, struct extent_position *,
151 kernel_lb_addr *, uint32_t *, int); 160 struct kernel_lb_addr *, uint32_t *, int);
152 161
153/* misc.c */ 162/* misc.c */
154extern struct buffer_head *udf_tgetblk(struct super_block *, int); 163extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
160extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, 169extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
161 uint32_t, uint16_t *); 170 uint32_t, uint16_t *);
162extern struct buffer_head *udf_read_ptagged(struct super_block *, 171extern struct buffer_head *udf_read_ptagged(struct super_block *,
163 kernel_lb_addr, uint32_t, 172 struct kernel_lb_addr *, uint32_t,
164 uint16_t *); 173 uint16_t *);
165extern void udf_update_tag(char *, int); 174extern void udf_update_tag(char *, int);
166extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); 175extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
182 uint32_t); 191 uint32_t);
183extern int udf_relocate_blocks(struct super_block *, long, long *); 192extern int udf_relocate_blocks(struct super_block *, long, long *);
184 193
194static inline uint32_t
195udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
196 uint32_t offset)
197{
198 return udf_get_pblock(sb, loc->logicalBlockNum,
199 loc->partitionReferenceNum, offset);
200}
201
185/* unicode.c */ 202/* unicode.c */
186extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); 203extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
187extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 204extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
200 217
201/* balloc.c */ 218/* balloc.c */
202extern void udf_free_blocks(struct super_block *, struct inode *, 219extern void udf_free_blocks(struct super_block *, struct inode *,
203 kernel_lb_addr, uint32_t, uint32_t); 220 struct kernel_lb_addr *, uint32_t, uint32_t);
204extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, 221extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
205 uint32_t, uint32_t); 222 uint32_t, uint32_t);
206extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
214 struct udf_fileident_bh *, 231 struct udf_fileident_bh *,
215 struct fileIdentDesc *, 232 struct fileIdentDesc *,
216 struct extent_position *, 233 struct extent_position *,
217 kernel_lb_addr *, uint32_t *, 234 struct kernel_lb_addr *, uint32_t *,
218 sector_t *); 235 sector_t *);
219extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, 236extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
220 int *offset); 237 int *offset);
221extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); 238extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
222extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); 239extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
223 240
224/* udftime.c */ 241/* udftime.c */
225extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, 242extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
226 timestamp src); 243 struct timestamp src);
227extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); 244extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
228 245
229#endif /* __UDF_DECL_H */ 246#endif /* __UDF_DECL_H */
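
One detail of the udfdecl.h hunk worth calling out: udf_get_lb_pblock() changes from a #define to a static inline, gaining type checking and a pointer argument. The same conversion in miniature (types and the stub mapping are illustrative):

#include <stdint.h>

struct lb { uint32_t block, part; };

/* Stub: the real udf_get_pblock() consults the partition map. */
static uint32_t get_pblock(uint32_t block, uint32_t part, uint32_t offset)
{
    (void)part;
    return block + offset;
}

/* Before: a macro, arguments unchecked and expanded textually. */
#define GET_LB_PBLOCK(loc, offset) \
    get_pblock((loc).block, (loc).part, (offset))

/* After: a static inline taking a pointer, so the compiler checks the
 * argument type and callers pass &loc like the rest of the new API. */
static inline uint32_t get_lb_pblock(struct lb *loc, uint32_t offset)
{
    return get_pblock(loc->block, loc->part, offset);
}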
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
4#include <asm/byteorder.h> 4#include <asm/byteorder.h>
5#include <linux/string.h> 5#include <linux/string.h>
6 6
7static inline kernel_lb_addr lelb_to_cpu(lb_addr in) 7static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
8{ 8{
9 kernel_lb_addr out; 9 struct kernel_lb_addr out;
10 10
11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); 11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); 12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
14 return out; 14 return out;
15} 15}
16 16
17static inline lb_addr cpu_to_lelb(kernel_lb_addr in) 17static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
18{ 18{
19 lb_addr out; 19 struct lb_addr out;
20 20
21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); 21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); 22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
24 return out; 24 return out;
25} 25}
26 26
27static inline short_ad lesa_to_cpu(short_ad in) 27static inline struct short_ad lesa_to_cpu(struct short_ad in)
28{ 28{
29 short_ad out; 29 struct short_ad out;
30 30
31 out.extLength = le32_to_cpu(in.extLength); 31 out.extLength = le32_to_cpu(in.extLength);
32 out.extPosition = le32_to_cpu(in.extPosition); 32 out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
34 return out; 34 return out;
35} 35}
36 36
37static inline short_ad cpu_to_lesa(short_ad in) 37static inline struct short_ad cpu_to_lesa(struct short_ad in)
38{ 38{
39 short_ad out; 39 struct short_ad out;
40 40
41 out.extLength = cpu_to_le32(in.extLength); 41 out.extLength = cpu_to_le32(in.extLength);
42 out.extPosition = cpu_to_le32(in.extPosition); 42 out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
44 return out; 44 return out;
45} 45}
46 46
47static inline kernel_long_ad lela_to_cpu(long_ad in) 47static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
48{ 48{
49 kernel_long_ad out; 49 struct kernel_long_ad out;
50 50
51 out.extLength = le32_to_cpu(in.extLength); 51 out.extLength = le32_to_cpu(in.extLength);
52 out.extLocation = lelb_to_cpu(in.extLocation); 52 out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
54 return out; 54 return out;
55} 55}
56 56
57static inline long_ad cpu_to_lela(kernel_long_ad in) 57static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
58{ 58{
59 long_ad out; 59 struct long_ad out;
60 60
61 out.extLength = cpu_to_le32(in.extLength); 61 out.extLength = cpu_to_le32(in.extLength);
62 out.extLocation = cpu_to_lelb(in.extLocation); 62 out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
64 return out; 64 return out;
65} 65}
66 66
67static inline kernel_extent_ad leea_to_cpu(extent_ad in) 67static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
68{ 68{
69 kernel_extent_ad out; 69 struct kernel_extent_ad out;
70 70
71 out.extLength = le32_to_cpu(in.extLength); 71 out.extLength = le32_to_cpu(in.extLength);
72 out.extLocation = le32_to_cpu(in.extLocation); 72 out.extLocation = le32_to_cpu(in.extLocation);
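
The udfend.h helpers above convert little-endian on-disk structures field by field. Outside the kernel the equivalent is explicit byte assembly, which stays correct on any host endianness; a sketch:

#include <stdint.h>

/* Userspace equivalents of le32_to_cpu()/le16_to_cpu(). */
static uint32_t get_le32(const uint8_t *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static uint16_t get_le16(const uint8_t *p)
{
    return (uint16_t)(p[0] | (p[1] << 8));
}

struct lb_sketch { uint32_t logicalBlockNum; uint16_t partitionReferenceNum; };

/* lelb_to_cpu() over a raw 6-byte on-disk lb_addr. */
static struct lb_sketch lelb_from_disk(const uint8_t *raw)
{
    struct lb_sketch out;
    out.logicalBlockNum = get_le32(raw);
    out.partitionReferenceNum = get_le16(raw + 4);
    return out;
}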
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
85#define SECS_PER_HOUR (60 * 60) 85#define SECS_PER_HOUR (60 * 60)
86#define SECS_PER_DAY (SECS_PER_HOUR * 24) 86#define SECS_PER_DAY (SECS_PER_HOUR * 24)
87 87
88struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) 88struct timespec *
89udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
89{ 90{
90 int yday; 91 int yday;
91 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); 92 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
116 return dest; 117 return dest;
117} 118}
118 119
119timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) 120struct timestamp *
121udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
120{ 122{
121 long int days, rem, y; 123 long int days, rem, y;
122 const unsigned short int *ip; 124 const unsigned short int *ip;
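
udf_disk_stamp_to_time() begins by pulling typeAndTimezone out of the on-disk timestamp. In ECMA-167 that field packs a 4-bit type with a signed 12-bit timezone offset in minutes, so decoding needs a sign extension; a sketch (the -2047 "not specified" convention is an assumption from the spec, not shown in this hunk):

/* Bits 15..12: type; bits 11..0: signed UTC offset in minutes
 * (-2047 conventionally means "offset not specified"). */
static int udf_tz_minutes(unsigned typeAndTimezone)
{
    int tz = typeAndTimezone & 0x0FFF;
    if (tz & 0x0800)
        tz -= 0x1000;   /* sign-extend the 12-bit field */
    return tz;
}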
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
254{ 254{
255 const uint8_t *ocu; 255 const uint8_t *ocu;
256 uint8_t cmp_id, ocu_len; 256 uint8_t cmp_id, ocu_len;
257 int i; 257 int i, len;
258 258
259 259
260 ocu_len = ocu_i->u_len; 260 ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
279 if (cmp_id == 16) 279 if (cmp_id == 16)
280 c = (c << 8) | ocu[i++]; 280 c = (c << 8) | ocu[i++];
281 281
282 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 282 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
283 UDF_NAME_LEN - utf_o->u_len); 283 UDF_NAME_LEN - utf_o->u_len);
284 /* Valid character? */
285 if (len >= 0)
286 utf_o->u_len += len;
287 else
288 utf_o->u_name[utf_o->u_len++] = '?';
284 } 289 }
285 utf_o->u_cmpID = 8; 290 utf_o->u_cmpID = 8;
286 291
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
290static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 295static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
291 int length) 296 int length)
292{ 297{
293 unsigned len, i, max_val; 298 int len;
299 unsigned i, max_val;
294 uint16_t uni_char; 300 uint16_t uni_char;
295 int u_len; 301 int u_len;
296 302
@@ -302,8 +308,13 @@ try_again:
302 u_len = 0U; 308 u_len = 0U;
303 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0U; i < uni->u_len; i++) {
304 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 310 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
305 if (len <= 0) 311 if (!len)
306 continue; 312 continue;
313 /* Invalid character, deal with it */
314 if (len < 0) {
315 len = 1;
316 uni_char = '?';
317 }
307 318
308 if (uni_char > max_val) { 319 if (uni_char > max_val) {
309 max_val = 0xffffU; 320 max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
324int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, 335int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
325 int flen) 336 int flen)
326{ 337{
327 struct ustr filename, unifilename; 338 struct ustr *filename, *unifilename;
328 int len; 339 int len = 0;
329 340
330 if (udf_build_ustr_exact(&unifilename, sname, flen)) 341 filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
342 if (!filename)
331 return 0; 343 return 0;
332 344
345 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 if (!unifilename)
347 goto out1;
348
349 if (udf_build_ustr_exact(unifilename, sname, flen))
350 goto out2;
351
333 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 352 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
334 if (!udf_CS0toUTF8(&filename, &unifilename)) { 353 if (!udf_CS0toUTF8(filename, unifilename)) {
335 udf_debug("Failed in udf_get_filename: sname = %s\n", 354 udf_debug("Failed in udf_get_filename: sname = %s\n",
336 sname); 355 sname);
337 return 0; 356 goto out2;
338 } 357 }
339 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
340 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, 359 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
341 &unifilename)) { 360 unifilename)) {
342 udf_debug("Failed in udf_get_filename: sname = %s\n", 361 udf_debug("Failed in udf_get_filename: sname = %s\n",
343 sname); 362 sname);
344 return 0; 363 goto out2;
345 } 364 }
346 } else 365 } else
347 return 0; 366 goto out2;
348 367
349 len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 368 len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
350 unifilename.u_name, unifilename.u_len); 369 unifilename->u_name, unifilename->u_len);
351 if (len) 370out2:
352 return len; 371 kfree(unifilename);
353 372out1:
354 return 0; 373 kfree(filename);
374 return len;
355} 375}
356 376
357int udf_put_filename(struct super_block *sb, const uint8_t *sname, 377int udf_put_filename(struct super_block *sb, const uint8_t *sname,
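
The unicode.c fix above makes both conversion loops tolerate characters the NLS table cannot map: a negative return is no longer added to the output length, and '?' is emitted instead. The same pattern standalone (convert() stands in for nls->uni2char()):

#include <stddef.h>

/* Stand-in for nls->uni2char(): bytes written, or -1 if the character
 * has no mapping in the target charset. */
static int convert(unsigned c, char *out, size_t room)
{
    if (room < 1 || c > 0x7F)
        return -1;      /* toy table: ASCII only */
    *out = (char)c;
    return 1;
}

static size_t to_charset(const unsigned *uni, size_t n, char *out, size_t room)
{
    size_t i, len = 0;

    for (i = 0; i < n && len < room; i++) {
        int ret = convert(uni[i], out + len, room - len);
        if (ret >= 0)
            len += (size_t)ret;         /* valid character */
        else
            out[len++] = '?';           /* invalid character, deal with it */
    }
    return len;
}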
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee239..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1268 struct ufs_super_block_first *usb1; 1268 struct ufs_super_block_first *usb1;
1269 struct ufs_super_block_second *usb2; 1269 struct ufs_super_block_second *usb2;
1270 struct ufs_super_block_third *usb3; 1270 struct ufs_super_block_third *usb3;
1271 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1271 1272
1272 lock_kernel(); 1273 lock_kernel();
1273 1274
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1290 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1291 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
1291 buf->f_files = uspi->s_ncg * uspi->s_ipg; 1292 buf->f_files = uspi->s_ncg * uspi->s_ipg;
1292 buf->f_namelen = UFS_MAXNAMLEN; 1293 buf->f_namelen = UFS_MAXNAMLEN;
1294 buf->f_fsid.val[0] = (u32)id;
1295 buf->f_fsid.val[1] = (u32)(id >> 32);
1293 1296
1294 unlock_kernel(); 1297 unlock_kernel();
1295 1298
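
Both the UDF and UFS statfs hunks in this series fill f_fsid the same way, by splitting the 64-bit encoded device number into two 32-bit halves:

#include <stdint.h>

struct fsid_sketch { uint32_t val[2]; };

/* As in the statfs hunks above: id comes from huge_encode_dev() on the
 * backing block device. */
static struct fsid_sketch fsid_from_dev(uint64_t id)
{
    struct fsid_sketch f;
    f.val[0] = (uint32_t)id;
    f.val[1] = (uint32_t)(id >> 32);
    return f;
}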
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff89..60f107e47fe9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
33 xfs_qm_syscalls.o \ 33 xfs_qm_syscalls.o \
34 xfs_qm_bhv.o \ 34 xfs_qm_bhv.o \
35 xfs_qm.o) 35 xfs_qm.o)
36xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36 37
37ifeq ($(CONFIG_XFS_QUOTA),y) 38ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o 39xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc2..000000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_MUTEX_H__
19#define __XFS_SUPPORT_MUTEX_H__
20
21#include <linux/mutex.h>
22
23typedef struct mutex mutex_t;
24
25#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771e..c13f67300fe7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
1623 .bmap = xfs_vm_bmap, 1623 .bmap = xfs_vm_bmap,
1624 .direct_IO = xfs_vm_direct_IO, 1624 .direct_IO = xfs_vm_direct_IO,
1625 .migratepage = buffer_migrate_page, 1625 .migratepage = buffer_migrate_page,
1626 .is_partially_uptodate = block_is_partially_uptodate,
1626}; 1627};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f33..d0b499418a7d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
34#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_ioctl.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_rtalloc.h" 40#include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
78 int hsize; 79 int hsize;
79 xfs_handle_t handle; 80 xfs_handle_t handle;
80 struct inode *inode; 81 struct inode *inode;
82 struct file *file = NULL;
83 struct path path;
84 int error;
85 struct xfs_inode *ip;
81 86
82 memset((char *)&handle, 0, sizeof(handle)); 87 if (cmd == XFS_IOC_FD_TO_HANDLE) {
83 88 file = fget(hreq->fd);
84 switch (cmd) { 89 if (!file)
85 case XFS_IOC_PATH_TO_FSHANDLE: 90 return -EBADF;
86 case XFS_IOC_PATH_TO_HANDLE: { 91 inode = file->f_path.dentry->d_inode;
87 struct path path; 92 } else {
88 int error = user_lpath((const char __user *)hreq->path, &path); 93 error = user_lpath((const char __user *)hreq->path, &path);
89 if (error) 94 if (error)
90 return error; 95 return error;
91 96 inode = path.dentry->d_inode;
92 ASSERT(path.dentry);
93 ASSERT(path.dentry->d_inode);
94 inode = igrab(path.dentry->d_inode);
95 path_put(&path);
96 break;
97 } 97 }
98 ip = XFS_I(inode);
98 99
99 case XFS_IOC_FD_TO_HANDLE: { 100 /*
100 struct file *file; 101 * We can only generate handles for inodes residing on a XFS filesystem,
101 102 * and only for regular files, directories or symbolic links.
102 file = fget(hreq->fd); 103 */
103 if (!file) 104 error = -EINVAL;
104 return -EBADF; 105 if (inode->i_sb->s_magic != XFS_SB_MAGIC)
106 goto out_put;
105 107
106 ASSERT(file->f_path.dentry); 108 error = -EBADF;
107 ASSERT(file->f_path.dentry->d_inode); 109 if (!S_ISREG(inode->i_mode) &&
108 inode = igrab(file->f_path.dentry->d_inode); 110 !S_ISDIR(inode->i_mode) &&
109 fput(file); 111 !S_ISLNK(inode->i_mode))
110 break; 112 goto out_put;
111 }
112 113
113 default:
114 ASSERT(0);
115 return -XFS_ERROR(EINVAL);
116 }
117 114
118 if (inode->i_sb->s_magic != XFS_SB_MAGIC) { 115 memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
119 /* we're not in XFS anymore, Toto */
120 iput(inode);
121 return -XFS_ERROR(EINVAL);
122 }
123 116
124 switch (inode->i_mode & S_IFMT) { 117 if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
125 case S_IFREG: 118 /*
126 case S_IFDIR: 119 * This handle only contains an fsid, zero the rest.
127 case S_IFLNK: 120 */
128 break; 121 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
129 default: 122 hsize = sizeof(xfs_fsid_t);
130 iput(inode); 123 } else {
131 return -XFS_ERROR(EBADF);
132 }
133
134 /* now we can grab the fsid */
135 memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
136 sizeof(xfs_fsid_t));
137 hsize = sizeof(xfs_fsid_t);
138
139 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
140 xfs_inode_t *ip = XFS_I(inode);
141 int lock_mode; 124 int lock_mode;
142 125
143 /* need to get access to the xfs_inode to read the generation */
144 lock_mode = xfs_ilock_map_shared(ip); 126 lock_mode = xfs_ilock_map_shared(ip);
145
146 /* fill in fid section of handle from inode */
147 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 127 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
148 sizeof(handle.ha_fid.fid_len); 128 sizeof(handle.ha_fid.fid_len);
149 handle.ha_fid.fid_pad = 0; 129 handle.ha_fid.fid_pad = 0;
150 handle.ha_fid.fid_gen = ip->i_d.di_gen; 130 handle.ha_fid.fid_gen = ip->i_d.di_gen;
151 handle.ha_fid.fid_ino = ip->i_ino; 131 handle.ha_fid.fid_ino = ip->i_ino;
152
153 xfs_iunlock_map_shared(ip, lock_mode); 132 xfs_iunlock_map_shared(ip, lock_mode);
154 133
155 hsize = XFS_HSIZE(handle); 134 hsize = XFS_HSIZE(handle);
156 } 135 }
157 136
158 /* now copy our handle into the user buffer & write out the size */ 137 error = -EFAULT;
159 if (copy_to_user(hreq->ohandle, &handle, hsize) || 138 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
160 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { 139 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
161 iput(inode); 140 goto out_put;
162 return -XFS_ERROR(EFAULT);
163 }
164 141
165 iput(inode); 142 error = 0;
166 return 0; 143
144 out_put:
145 if (cmd == XFS_IOC_FD_TO_HANDLE)
146 fput(file);
147 else
148 path_put(&path);
149 return error;
167} 150}
168 151
169/* 152/*
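
The xfs_find_handle() rewrite above replaces per-branch iput()/return pairs with one out_put label that drops whichever reference the function took. The cleanup shape in stub form (all names here are illustrative, not XFS APIs):

#include <errno.h>
#include <string.h>

struct handle_sketch { char fsid[8]; char fid[24]; };

static int acquire(int from_fd, void **res) { (void)from_fd; *res = (void *)""; return 0; }
static void release_fd(void *res) { (void)res; }
static void release_path(void *res) { (void)res; }

static int find_handle(int use_fd, struct handle_sketch *out)
{
    void *res;
    int error;

    error = acquire(use_fd, &res);      /* fget() or user_lpath() */
    if (error)
        return error;

    error = -EINVAL;
    if (!out)                           /* all checks funnel to out_put */
        goto out_put;

    memset(out, 0, sizeof(*out));
    error = 0;

out_put:
    /* One exit point releases whichever reference was taken. */
    if (use_fd)
        release_fd(res);
    else
        release_path(res);
    return error;
}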
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67f..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
 	 * Irix uses Missed'em'V split, but doesn't want to see
 	 * the upper 5 bits of (14bit) major.
 	 */
-	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
-		return -EINVAL;
+	if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+			return -EINVAL;
+		rdev = sysv_encode_dev(rdev);
+	} else {
+		rdev = 0;
+	}
 
 	if (test_default_acl && test_default_acl(dir)) {
 		if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
 		}
 	}
 
-	xfs_dentry_to_name(&name, dentry);
-
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
-
-	switch (mode & S_IFMT) {
-	case S_IFCHR:
-	case S_IFBLK:
-	case S_IFIFO:
-	case S_IFSOCK:
-		rdev = sysv_encode_dev(rdev);
-	case S_IFREG:
-		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
-		break;
-	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
-		break;
-	default:
-		error = EINVAL;
-		break;
-	}
+		mode &= ~current_umask();
 
+	xfs_dentry_to_name(&name, dentry);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -416,7 +404,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
 	stat->uid = ip->i_d.di_uid;
 	stat->gid = ip->i_d.di_gid;
 	stat->ino = ip->i_ino;
-#if XFS_BIG_INUMS
-	stat->ino += mp->m_inoadd;
-#endif
 	stat->atime = inode->i_atime;
 	stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 	stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dccd..f65a53f8752f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
 #include <kmem.h>
 #include <mrlock.h>
 #include <sv.h>
-#include <mutex.h>
 #include <time.h>
 
 #include <support/ktrace.h>
@@ -51,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/file.h>
 #include <linux/swap.h>
 #include <linux/errno.h>
@@ -147,17 +147,6 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-/*
- * IRIX (BSD) quotactl makes use of separate commands for user/group,
- * whereas on Linux the syscall encodes this information into the cmd
- * field (see the QCMD macro in quota.h). These macros help keep the
- * code portable - they are not visible from the syscall interface.
- */
-#define Q_XSETGQLIM	XQM_CMD(8)	/* set groups disk limits */
-#define Q_XGETGQUOTA	XQM_CMD(9)	/* get groups disk limits */
-#define Q_XSETPQLIM	XQM_CMD(10)	/* set projects disk limits */
-#define Q_XGETPQUOTA	XQM_CMD(11)	/* get projects disk limits */
-
 #define dfltprid	0
 #define MAXPATHLEN	1024
 
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 000000000000..94d9a633d3d9
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_dmapi.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return XFS_DQ_USER;
+	case GRPQUOTA:
+		return XFS_DQ_GROUP;
+	default:
+		return XFS_DQ_PROJ;
+	}
+}
+
+STATIC int
+xfs_fs_quota_sync(
+	struct super_block	*sb,
+	int			type)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_sync_inodes(mp, SYNC_DELWRI);
+}
+
+STATIC int
+xfs_fs_get_xstate(
+	struct super_block	*sb,
+	struct fs_quota_stat	*fqs)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+	struct super_block	*sb,
+	unsigned int		uflags,
+	int			op)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	unsigned int		flags = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (uflags & XFS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_PDQ_ACCT)
+		flags |= XFS_PQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+		flags |= XFS_OQUOTA_ENFD;
+
+	switch (op) {
+	case Q_XQUOTAON:
+		return -xfs_qm_scall_quotaon(mp, flags);
+	case Q_XQUOTAOFF:
+		if (!XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_quotaoff(mp, flags);
+	case Q_XQUOTARM:
+		if (XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_trunc_qfiles(mp, flags);
+	}
+
+	return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+
+	return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+struct quotactl_ops xfs_quotactl_operations = {
+	.quota_sync		= xfs_fs_quota_sync,
+	.get_xstate		= xfs_fs_get_xstate,
+	.set_xstate		= xfs_fs_set_xstate,
+	.get_xquota		= xfs_fs_get_xquota,
+	.set_xquota		= xfs_fs_set_xquota,
+};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 32ae5028e96b..bb685269f832 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
 #include <linux/freezer.h>
 #include <linux/parser.h>
 
-static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
 #define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
 #define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
-#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
 #define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
 #define MNTOPT_SWALLOC	"swalloc"	/* turn on stripe width allocation */
 #define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
 	int			dswidth = 0;
 	int			iosize = 0;
 	int			dmapi_implies_ikeep = 1;
-	uchar_t			iosizelog = 0;
+	__uint8_t		iosizelog = 0;
 
 	/*
 	 * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
 		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
-		} else if (!strcmp(this_char, MNTOPT_INO64)) {
-#if XFS_BIG_INUMS
-			mp->m_flags |= XFS_MOUNT_INO64;
-			mp->m_inoadd = XFS_INO64_OFFSET;
-#else
-			cmn_err(CE_WARN,
-				"XFS: %s option not allowed on this system",
-				this_char);
-			return EINVAL;
-#endif
 		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
 			mp->m_flags |= XFS_MOUNT_NOALIGN;
 		} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
 		/* the few simple ones we can get from the mount struct */
 		{ XFS_MOUNT_IKEEP,		"," MNTOPT_IKEEP },
 		{ XFS_MOUNT_WSYNC,		"," MNTOPT_WSYNC },
-		{ XFS_MOUNT_INO64,		"," MNTOPT_INO64 },
 		{ XFS_MOUNT_NOALIGN,		"," MNTOPT_NOALIGN },
 		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
-int
+STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
 	return -error;
 }
 
-void
+STATIC void
 xfs_blkdev_put(
 	struct block_device	*bdev)
 {
@@ -872,7 +859,7 @@ xfsaild_wakeup(
 	wake_up_process(ailp->xa_task);
 }
 
-int
+STATIC int
 xfsaild(
 	void	*data)
 {
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
 	int			sync)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	int			error = 0;
-	int			flags = 0;
 
 	xfs_itrace_entry(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (sync) {
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
-			goto out_error;
-		flags |= FLUSH_SYNC;
+			goto out;
 	}
-	error = xfs_inode_flush(ip, flags);
 
-out_error:
+	/*
+	 * Bypass inodes which have already been cleaned by
+	 * the inode flush clustering code inside xfs_iflush
+	 */
+	if (xfs_inode_clean(ip))
+		goto out;
+
+	/*
+	 * We make this non-blocking if the inode is contended, return
+	 * EAGAIN to indicate to the caller that they did not succeed.
+	 * This prevents the flush path from blocking on inodes inside
+	 * another operation right now, they get caught later by xfs_sync.
+	 */
+	if (sync) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		xfs_iflock(ip);
+
+		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+	} else {
+		error = EAGAIN;
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+			goto out;
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+			goto out_unlock;
+
+		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	}
+
+ out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
 	/*
 	 * if we failed to write out the inode then mark
 	 * it dirty again so we'll try again later.
 	 */
 	if (error)
 		xfs_mark_inode_dirty_sync(ip);
-
 	return -error;
 }
 
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
 	statp->f_bfree = statp->f_bavail =
 				sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 	fakeinos = statp->f_bfree << sbp->sb_inopblog;
-#if XFS_BIG_INUMS
-	fakeinos += mp->m_inoadd;
-#endif
 	statp->f_files =
 	    MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
 	if (mp->m_maxicount)
-#if XFS_BIG_INUMS
-		if (!mp->m_inoadd)
-#endif
-			statp->f_files = min_t(typeof(statp->f_files),
-					       statp->f_files,
-					       mp->m_maxicount);
+		statp->f_files = min_t(typeof(statp->f_files),
+				       statp->f_files,
+				       mp->m_maxicount);
 	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	spin_unlock(&mp->m_sb_lock);
 
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
 	return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
 }
 
-STATIC int
-xfs_fs_quotasync(
-	struct super_block	*sb,
-	int			type)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
-}
-
-STATIC int
-xfs_fs_getxstate(
-	struct super_block	*sb,
-	struct fs_quota_stat	*fqs)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-}
-
-STATIC int
-xfs_fs_setxstate(
-	struct super_block	*sb,
-	unsigned int		flags,
-	int			op)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
-}
-
-STATIC int
-xfs_fs_getxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XGETQUOTA :
-				  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
-					Q_XGETPQUOTA), id, (caddr_t)fdq);
-}
-
-STATIC int
-xfs_fs_setxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XSETQLIM :
-				  ((type == GRPQUOTA) ? Q_XSETGQLIM :
-					Q_XSETPQLIM), id, (caddr_t)fdq);
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
 	sb_min_blocksize(sb, BBSIZE);
 	sb->s_xattr = xfs_xattr_handlers;
 	sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
 	sb->s_qcop = &xfs_quotactl_operations;
+#endif
 	sb->s_op = &xfs_super_operations;
 
 	error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
 	.show_options		= xfs_fs_show_options,
 };
 
-static struct quotactl_ops xfs_quotactl_operations = {
-	.quota_sync		= xfs_fs_quotasync,
-	.get_xstate		= xfs_fs_getxstate,
-	.set_xstate		= xfs_fs_setxstate,
-	.get_xquota		= xfs_fs_getxquota,
-	.set_xquota		= xfs_fs_setxquota,
-};
-
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd67..5a2ea3a21781 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
 extern struct xattr_handler *xfs_xattr_handlers[];
+extern struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f6..04f058c848ae 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
 #define XFS_SYNC_H 1
 
 struct xfs_mount;
+struct xfs_perag;
 
 typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d3..ad7fbead4c97 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for xfs_inode_flush
- */
-#define FLUSH_SYNC		1	/* wait for flush to complete */
-
-/*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
 #define FI_NONE			0	/* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
 	the operation completes. */
 
 /*
- * Dealing with bad inodes
- */
-static inline int VN_BAD(struct inode *vp)
-{
-	return is_bad_inode(vp);
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
-{
-	bs_atime->tv_sec = vp->i_atime.tv_sec;
-	bs_atime->tv_nsec = vp->i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
-{
-	*ts = vp->i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
-{
-	*tt = vp->i_atime.tv_sec;
-}
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b29753..e4babcc63423 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
 	uint			flist_locked;
 	xfs_dquot_t		*d;
 
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 
 	flist_locked = B_FALSE;
 
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			if (dqp->HL_PREVP != &qh->qh_next) {
 				xfs_dqtrace_entry(dqp,
 						  "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
 			}
 			xfs_dqtrace_entry(dqp, "LOOKUP END");
 			*O_dqpp = dqp;
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			return (0);
 		}
 	}
 
 	*O_dqpp = NULL;
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 	return (1);
 }
 
@@ -956,7 +956,7 @@ xfs_qm_dqget(
 		ASSERT(ip->i_gdquot == NULL);
 	}
 #endif
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 
 	/*
 	 * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
 		 */
 		ASSERT(*O_dqpp);
 		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		XFS_DQ_HASH_UNLOCK(h);
+		mutex_unlock(&h->qh_lock);
 		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
 		return (0);	/* success */
 	}
@@ -991,7 +991,7 @@ xfs_qm_dqget(
 	 * we don't keep the lock across a disk read
 	 */
 	version = h->qh_version;
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
 
 	/*
 	 * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
 	/*
 	 * Hashlock comes after ilock in lock order
 	 */
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 	if (version != h->qh_version) {
 		xfs_dquot_t *tmpdqp;
 		/*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
 			 * and start over.
 			 */
 			xfs_qm_dqput(tmpdqp);
-			XFS_DQ_HASH_UNLOCK(h);
+			mutex_unlock(&h->qh_lock);
 			xfs_qm_dqdestroy(dqp);
 			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
 			goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
 	 * Put the dquot at the beginning of the hash-chain and mp's list
 	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
 	 */
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
 	XQM_HASHLIST_INSERT(h, dqp);
 
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
 	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
 
 	xfs_qm_mplist_unlock(mp);
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
 	xfs_mount_t	*mp = dqp->q_mount;
 
 	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
 	/*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
 	 */
 	if (dqp->q_nrefs != 0) {
 		xfs_dqunlock(dqp);
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
 		return (1);
 	}
 
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	XFS_DQ_HASH_UNLOCK(thishash);
+	mutex_unlock(&thishash->qh_lock);
 	return (0);
 }
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b4331..de0f402ddb4c 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
  */
 typedef struct xfs_dqhash {
 	struct xfs_dquot *qh_next;
-	mutex_t		  qh_lock;
+	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
 	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
 	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
 	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
-	mutex_t		 q_qlock;	/* quota lock */
+	struct mutex	 q_qlock;	/* quota lock */
 	struct completion q_flush;	/* flush completion queue */
 	atomic_t	 q_pincount;	/* dquot pin count */
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
 
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
-	if (mutex_trylock(&dqp->q_qlock)) {
-		mutex_unlock(&dqp->q_qlock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-
 /*
  * Manage the q_flush completion queue embedded in the dquot. This completion
  * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 	complete(&dqp->q_flush);
 }
 
+#define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314f..5b6695049e00 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
  * quota functionality, including maintaining the freelist and hash
  * tables of dquots.
  */
-mutex_t		xfs_Gqm_lock;
+struct mutex	xfs_Gqm_lock;
 struct xfs_qm	*xfs_Gqm;
 uint		ndquot;
 
@@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
 STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int	xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int	xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
 
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
 };
 
 #ifdef DEBUG
-extern mutex_t	qcheck_lock;
+extern struct mutex	qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
 	 * the structure could disappear between the entry to this routine and
 	 * a HOLD operation if not locked.
	 */
-	XFS_QM_LOCK(xfs_Gqm);
+	mutex_lock(&xfs_Gqm_lock);
 
 	if (xfs_Gqm == NULL)
 		xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
 	 * debugging and statistical purposes, but ...
 	 * Just take a reference and get out.
 	 */
-	XFS_QM_HOLD(xfs_Gqm);
-	XFS_QM_UNLOCK(xfs_Gqm);
+	xfs_Gqm->qm_nrefs++;
+	mutex_unlock(&xfs_Gqm_lock);
 
 	return 0;
 }
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
 	 * be restarted.
 	 */
-	XFS_QM_LOCK(xfs_Gqm);
-	XFS_QM_RELE(xfs_Gqm);
-	if (xfs_Gqm->qm_nrefs == 0) {
+	mutex_lock(&xfs_Gqm_lock);
+	if (--xfs_Gqm->qm_nrefs == 0) {
 		xfs_qm_destroy(xfs_Gqm);
 		xfs_Gqm = NULL;
 	}
-	XFS_QM_UNLOCK(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm_lock);
 }
 
 /*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			nrecl = XFS_QI_MPLRECLAIMS(mp);
 			xfs_qm_mplist_unlock(mp);
-			XFS_DQ_HASH_LOCK(dqp->q_hash);
+			mutex_lock(&dqp->q_hash->qh_lock);
 			xfs_qm_mplist_lock(mp);
 
 			/*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
 			 * this point, but somebody might be taking things off.
 			 */
 			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
-				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
 		}
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
 	xfs_dqid_t	id,
 	uint		type,
 	uint		doalloc,
-	uint		dolock,
 	xfs_dquot_t	*udqhint, /* hint */
 	xfs_dquot_t	**IO_idqpp)
 {
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	error = 0;
+
 	/*
 	 * See if we already have it in the inode itself. IO_idqpp is
 	 * &i_udquot or &i_gdquot. This made the code look weird, but
 	 * made the logic a lot simpler.
 	 */
-	if ((dqp = *IO_idqpp)) {
-		if (dolock)
-			xfs_dqlock(dqp);
+	dqp = *IO_idqpp;
+	if (dqp) {
 		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
-		goto done;
+		return 0;
 	}
 
 	/*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
 	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
 	 * the user dquot.
 	 */
-	ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-	if (udqhint && !dolock)
-		xfs_dqlock(udqhint);
-
-	/*
-	 * No need to take dqlock to look at the id.
-	 * The ID can't change until it gets reclaimed, and it won't
-	 * be reclaimed as long as we have a ref from inode and we hold
-	 * the ilock.
-	 */
-	if (udqhint &&
-	    (dqp = udqhint->q_gdquot) &&
-	    (be32_to_cpu(dqp->q_core.d_id) == id)) {
-		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-		xfs_dqlock(dqp);
-		XFS_DQHOLD(dqp);
-		ASSERT(*IO_idqpp == NULL);
-		*IO_idqpp = dqp;
-		if (!dolock) {
-			xfs_dqunlock(dqp);
-			xfs_dqunlock(udqhint);
-		}
-		goto done;
-	}
-	/*
-	 * We can't hold a dquot lock when we call the dqget code.
-	 * We'll deadlock in no time, because of (not conforming to)
-	 * lock ordering - the inodelock comes before any dquot lock,
-	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
-	 */
-	if (udqhint)
-		xfs_dqunlock(udqhint);
+	if (udqhint) {
+		ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
+		xfs_dqlock(udqhint);
+
+		/*
+		 * No need to take dqlock to look at the id.
+		 *
+		 * The ID can't change until it gets reclaimed, and it won't
+		 * be reclaimed as long as we have a ref from inode and we
+		 * hold the ilock.
+		 */
+		dqp = udqhint->q_gdquot;
+		if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+			xfs_dqlock(dqp);
+			XFS_DQHOLD(dqp);
+			ASSERT(*IO_idqpp == NULL);
+			*IO_idqpp = dqp;
+
+			xfs_dqunlock(dqp);
+			xfs_dqunlock(udqhint);
+			return 0;
+		}
+
+		/*
+		 * We can't hold a dquot lock when we call the dqget code.
+		 * We'll deadlock in no time, because of (not conforming to)
+		 * lock ordering - the inodelock comes before any dquot lock,
+		 * and we may drop and reacquire the ilock in xfs_qm_dqget().
+		 */
+		xfs_dqunlock(udqhint);
+	}
+
 	/*
 	 * Find the dquot from somewhere. This bumps the
 	 * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
 	 * disk and we didn't ask it to allocate;
 	 * ESRCH if quotas got turned off suddenly.
 	 */
-	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
-				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
-		if (udqhint && dolock)
-			xfs_dqlock(udqhint);
-		goto done;
-	}
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+	if (error)
+		return error;
 
 	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+
 	/*
 	 * dqget may have dropped and re-acquired the ilock, but it guarantees
 	 * that the dquot returned is the one that should go in the inode.
 	 */
 	*IO_idqpp = dqp;
-	ASSERT(dqp);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! dolock) {
-		xfs_dqunlock(dqp);
-		goto done;
-	}
-	if (! udqhint)
-		goto done;
-
-	ASSERT(udqhint);
-	ASSERT(dolock);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! xfs_qm_dqlock_nowait(udqhint)) {
-		xfs_dqunlock(dqp);
-		xfs_dqlock(udqhint);
-		xfs_dqlock(dqp);
-	}
- done:
-#ifdef QUOTADEBUG
-	if (udqhint) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-	}
-	if (! error) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	}
-#endif
-	return error;
+	xfs_dqunlock(dqp);
+	return 0;
 }
 
 
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
 STATIC void
 xfs_qm_dqattach_grouphint(
 	xfs_dquot_t	*udq,
-	xfs_dquot_t	*gdq,
-	uint		locked)
+	xfs_dquot_t	*gdq)
 {
 	xfs_dquot_t	*tmp;
 
-#ifdef QUOTADEBUG
-	if (locked) {
-		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		ASSERT(XFS_DQ_IS_LOCKED(gdq));
-	}
-#endif
-	if (! locked)
-		xfs_dqlock(udq);
+	xfs_dqlock(udq);
 
 	if ((tmp = udq->q_gdquot)) {
 		if (tmp == gdq) {
-			if (! locked)
-				xfs_dqunlock(udq);
+			xfs_dqunlock(udq);
 			return;
 		}
 
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
 		 * because the freelist lock comes before dqlocks.
 		 */
 		xfs_dqunlock(udq);
-		if (locked)
-			xfs_dqunlock(gdq);
 		/*
 		 * we took a hard reference once upon a time in dqget,
 		 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
 
 	} else {
 		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		if (! locked) {
-			xfs_dqlock(gdq);
-		}
+		xfs_dqlock(gdq);
 	}
 
 	ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
 		XFS_DQHOLD(gdq);
 		udq->q_gdquot = gdq;
 	}
-	if (! locked) {
-		xfs_dqunlock(gdq);
-		xfs_dqunlock(udq);
-	}
+
+	xfs_dqunlock(gdq);
+	xfs_dqunlock(udq);
 }
 
 
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
 * Inode may get unlocked and relocked in here, and the caller must deal with
 * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
 	if (XFS_IS_UQUOTA_ON(mp)) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						NULL, &ip->i_udquot);
 		if (error)
 			goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
 		error = XFS_IS_GQUOTA_ON(mp) ?
 			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot) :
 			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot);
 	/*
 	 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
 		/*
 		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
 		 */
-		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
-					  flags & XFS_QMOPT_DQLOCK);
+		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
 	}
 
  done:
 
 #ifdef QUOTADEBUG
 	if (! error) {
-		if (ip->i_udquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
-		}
-		if (ip->i_gdquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
-		}
 		if (XFS_IS_UQUOTA_ON(mp))
 			ASSERT(ip->i_udquot);
 		if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
 		 * a dqlookup process that holds the hashlock that is
 		 * waiting for the freelist lock.
 		 */
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
 			dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
 			/* XXX put a sentinel so that we can come back here */
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			XFS_DQ_HASH_UNLOCK(hash);
+			mutex_unlock(&hash->qh_lock);
 			xfs_qm_freelist_unlock(xfs_Gqm);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
 				return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
 		XQM_HASHLIST_REMOVE(hash, dqp);
 		xfs_dqfunlock(dqp);
 		xfs_qm_mplist_unlock(dqp->q_mount);
-		XFS_DQ_HASH_UNLOCK(hash);
+		mutex_unlock(&hash->qh_lock);
 
  off_freelist:
 		XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp))
+		if (!mutex_trylock(&dqp->q_hash->qh_lock))
 			goto mplistunlock;
 
 		ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
 		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
 		XQM_FREELIST_REMOVE(dqp);
 		dqpout = dqp;
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
  mplistunlock:
 		xfs_qm_mplist_unlock(dqp->q_mount);
 		xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
 {
 	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
 }
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int		locked;
-
-	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
-	return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
-	xfs_qm_t *xqm)
-{
-	int		locked;
-
-	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
-	return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
-	xfs_mount_t	*mp)
-{
-	int		locked;
-
-	ASSERT(mp->m_quotainfo);
-	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
-	return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387c..a371954cae1b 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
 struct xfs_inode;
 
 extern uint		ndquot;
-extern mutex_t		xfs_Gqm_lock;
+extern struct mutex	xfs_Gqm_lock;
 extern struct xfs_qm	*xfs_Gqm;
 extern kmem_zone_t	*qm_dqzone;
 extern kmem_zone_t	*qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t;
 typedef struct xfs_frlist {
 	struct xfs_dquot *qh_next;
 	struct xfs_dquot *qh_prev;
-	mutex_t		 qh_lock;
+	struct mutex	 qh_lock;
 	uint		 qh_version;
 	uint		 qh_nelems;
 } xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
 	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for blks warnings */
 	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for inodes warnings */
 	xfs_qwarncnt_t	 qi_rtbwarnlimit;/* limit for rt blks warnings */
-	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
+	struct mutex	 qi_quotaofflock;/* to serialize quotaoff */
 	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
 	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
 	xfs_qcnt_t	 qi_bhardlimit;	 /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT	5
 #define XFS_QM_RTBWARNLIMIT	5
 
-#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
-
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern void		xfs_qm_mount_quotas(xfs_mount_t *);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *);
 extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
+/* quota ops */
+extern int		xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
 /* vop stuff */
 extern int		xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
 					uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
 /* list stuff */
 extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int		xfs_qm_quotactl(struct xfs_mount *, int, int,
-				xfs_caddr_t);
 
 #ifdef DEBUG
 extern int		xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e12..63037c689a4b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
 	.xfs_dqvopchownresv	= xfs_qm_vop_chown_reserve,
 	.xfs_dqstatvfs		= xfs_qm_statvfs,
 	.xfs_dqsync		= xfs_qm_sync,
-	.xfs_quotactl		= xfs_qm_quotactl,
 	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
 };
 EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aede..c7b66f6506ce 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
 # define qdprintk(s, args...)	do { } while (0)
 #endif
 
-STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
 					uint);
-STATIC uint	xfs_qm_import_flags(uint);
 STATIC uint	xfs_qm_export_flags(uint);
-STATIC uint	xfs_qm_import_qtype_flags(uint);
 STATIC uint	xfs_qm_export_qtype_flags(uint);
 STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
 					fs_disk_quota_t *);
 
 
 /*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
-	xfs_mount_t	*mp,
-	int		cmd,
-	int		id,
-	xfs_caddr_t	addr)
-{
-	int		error;
-
-	ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
-	/*
-	 * The following commands are valid even when quotaoff.
-	 */
-	switch (cmd) {
-	case Q_XQUOTARM:
-		/*
-		 * Truncate quota files. quota must be off.
-		 */
-		if (XFS_IS_QUOTA_ON(mp))
-			return XFS_ERROR(EINVAL);
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_trunc_qfiles(mp,
-			       xfs_qm_import_qtype_flags(*(uint *)addr)));
-
-	case Q_XGETQSTAT:
-		/*
-		 * Get quota status information.
-		 */
-		return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
-	case Q_XQUOTAON:
-		/*
-		 * QUOTAON - enabling quota enforcement.
-		 * Quota accounting must be turned on at mount time.
-		 */
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_quotaon(mp,
-					  xfs_qm_import_flags(*(uint *)addr)));
-
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		break;
-
-	case Q_XQUOTASYNC:
-		return xfs_sync_inodes(mp, SYNC_DELWRI);
-
-	default:
-		break;
-	}
-
-	if (! XFS_IS_QUOTA_ON(mp))
-		return XFS_ERROR(ESRCH);
-
-	switch (cmd) {
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_quotaoff(mp,
-					    xfs_qm_import_flags(*(uint *)addr),
-					    B_FALSE);
-		break;
-
-	case Q_XGETQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETGQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETPQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					(fs_disk_quota_t *)addr);
-		break;
-
-	case Q_XSETQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETGQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETPQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					     (fs_disk_quota_t *)addr);
-		break;
-
-	default:
-		error = XFS_ERROR(EINVAL);
-		break;
-	}
-
-	return (error);
-}
-
-/*
  * Turn off quota accounting and/or enforcement for all udquots and/or
  * gdquots. Called only at unmount time.
  *
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
 * incore, and modifies the ondisk dquot directly. Therefore, for example,
 * it is an error to call this twice, without purging the cache.
 */
-STATIC int
+int
 xfs_qm_scall_quotaoff(
 	xfs_mount_t		*mp,
-	uint			flags,
-	boolean_t		force)
+	uint			flags)
 {
 	uint			dqtype;
 	int			error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
 	xfs_qoff_logitem_t	*qoffstart;
 	int			nculprits;
 
-	if (!force && !capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	/*
 	 * No file system can have quotas enabled on disk but not in core.
 	 * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
 	return (error);
 }
 
-STATIC int
+int
 xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
 	int		error = 0, error2 = 0;
 	xfs_inode_t	*qip;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
 * effect immediately.
 * (Switching on quota accounting must be done at mount time.)
 */
-STATIC int
+int
 xfs_qm_scall_quotaon(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
 	uint		accflags;
 	__int64_t	sbflags;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
 	 * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
 /*
 * Return quota status information, such as uquota-off, enforcements, etc.
 */
-STATIC int
+int
 xfs_qm_scall_getqstat(
 	xfs_mount_t	*mp,
 	fs_quota_stat_t	*out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
 /*
 * Adjust quota limits, and start/stop timers accordingly.
 */
-STATIC int
+int
 xfs_qm_scall_setqlim(
 	xfs_mount_t		*mp,
 	xfs_dqid_t		id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
 	int			error;
 	xfs_qcnt_t		hard, soft;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	if ((newlim->d_fieldmask &
 	    (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
 		return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
 	return error;
 }
 
-STATIC int
+int
 xfs_qm_scall_getquota(
 	xfs_mount_t	*mp,
 	xfs_dqid_t	id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
935} 805}
936 806
937STATIC uint 807STATIC uint
938xfs_qm_import_qtype_flags(
939 uint uflags)
940{
941 uint oflags = 0;
942
943 /*
944 * Can't be more than one, or none.
945 */
946 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
947 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
948 ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
949 (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
950 ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
951 (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
952 ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
953 return (0);
954
955 oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
956 oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
957 oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
958 return oflags;
959}
960
961STATIC uint
962xfs_qm_export_qtype_flags( 808xfs_qm_export_qtype_flags(
963 uint flags) 809 uint flags)
964{ 810{
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
979} 825}
980 826
981STATIC uint 827STATIC uint
982xfs_qm_import_flags(
983 uint uflags)
984{
985 uint flags = 0;
986
987 if (uflags & XFS_QUOTA_UDQ_ACCT)
988 flags |= XFS_UQUOTA_ACCT;
989 if (uflags & XFS_QUOTA_PDQ_ACCT)
990 flags |= XFS_PQUOTA_ACCT;
991 if (uflags & XFS_QUOTA_GDQ_ACCT)
992 flags |= XFS_GQUOTA_ACCT;
993 if (uflags & XFS_QUOTA_UDQ_ENFD)
994 flags |= XFS_UQUOTA_ENFD;
995 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
996 flags |= XFS_OQUOTA_ENFD;
997 return (flags);
998}
999
1000
1001STATIC uint
1002xfs_qm_export_flags( 828xfs_qm_export_flags(
1003 uint flags) 829 uint flags)
1004{ 830{
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
1134xfs_dqhash_t *qmtest_gdqtab; 960xfs_dqhash_t *qmtest_gdqtab;
1135int qmtest_hashmask; 961int qmtest_hashmask;
1136int qmtest_nfails; 962int qmtest_nfails;
1137mutex_t qcheck_lock; 963struct mutex qcheck_lock;
1138 964
1139#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ 965#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1140 (__psunsigned_t)(id)) & \ 966 (__psunsigned_t)(id)) & \
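The hunks above make the xfs_qm_scall_* entry points non-STATIC and delete their per-function capable(CAP_SYS_ADMIN) checks, which only adds up if one common caller now performs the permission check before dispatching. A minimal sketch of that caller-side shape, assuming a hypothetical dispatcher (its name, the command numbering, and the errno negation are illustrative; the scall signatures are taken from the hunks):

/*
 * Sketch only: the real dispatcher lives outside this diff. The point
 * is that CAP_SYS_ADMIN is now checked once, centrally, because the
 * checks were removed from the individual helpers above.
 */
static int
xfs_quota_dispatch(xfs_mount_t *mp, unsigned int cmd, uint flags)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;			/* single permission gate */

	switch (cmd) {
	case 0:					/* illustrative command ids */
		return -xfs_qm_scall_quotaon(mp, flags);
	case 1:
		return -xfs_qm_scall_trunc_qfiles(mp, flags);
	default:
		return -EINVAL;
	}
}

(The negation reflects XFS's internal positive-errno convention visible in XFS_ERROR(EPERM) above.)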
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc2..8286b2842b6b 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) 42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43 43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) 44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
46#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) 45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
47#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) 46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
48 47
 49#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock)))
 50#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
 51#ifdef DEBUG
 52struct xfs_dqhash;
 53static inline int XQMISLCKD(struct xfs_dqhash *h)
 54{
 55	if (mutex_trylock(&h->qh_lock)) {
 56		mutex_unlock(&h->qh_lock);
 57		return 0;
 58	}
 59	return 1;
 60}
 61#endif
 62
 63#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
 64#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
 65#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
 66
 67#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
 68#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
 69#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
 70
 71#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
 72#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
 48#define xfs_qm_mplist_lock(mp) \
 49	mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 50#define xfs_qm_mplist_nowait(mp) \
 51	mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 52#define xfs_qm_mplist_unlock(mp) \
 53	mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
 54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
 55	mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
 56
 57#define xfs_qm_freelist_lock(qm) \
 58	mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
 59#define xfs_qm_freelist_lock_nowait(qm) \
 60	mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
 61#define xfs_qm_freelist_unlock(qm) \
 62	mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
73 63
74/* 64/*
75 * Hash into a bucket in the dquot hash table, based on <mp, id>. 65 * Hash into a bucket in the dquot hash table, based on <mp, id>.
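With the XQMLCK()/XQMUNLCK()/XQMISLCKD() indirection gone, the quota list locks are ordinary struct mutex operations, and the new *_nowait variants expose mutex_trylock() directly. A small usage sketch under the new helpers (the caller is hypothetical; only the macros come from the hunk):

/*
 * Hypothetical caller: skip the mount's dquot list when contended
 * instead of sleeping, using the new trylock-based helper.
 */
STATIC int
xfs_qm_try_walk_mplist(xfs_mount_t *mp)
{
	if (!xfs_qm_mplist_nowait(mp))	/* mutex_trylock underneath */
		return EAGAIN;		/* busy; caller may retry */

	/* ... walk XFS_QI_MPLNEXT(mp) while the list is pinned ... */

	xfs_qm_mplist_unlock(mp);
	return 0;
}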
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e740..447173bcf96d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
624 xfs_qcnt_t *resbcountp; 624 xfs_qcnt_t *resbcountp;
625 xfs_quotainfo_t *q = mp->m_quotainfo; 625 xfs_quotainfo_t *q = mp->m_quotainfo;
626 626
627 if (! (flags & XFS_QMOPT_DQLOCK)) { 627
628 xfs_dqlock(dqp); 628 xfs_dqlock(dqp);
629 } 629
630 ASSERT(XFS_DQ_IS_LOCKED(dqp));
631 if (flags & XFS_TRANS_DQ_RES_BLKS) { 630 if (flags & XFS_TRANS_DQ_RES_BLKS) {
632 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 631 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
633 if (!hardlimit) 632 if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
740 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 739 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
741 740
742error_return: 741error_return:
743 if (! (flags & XFS_QMOPT_DQLOCK)) { 742 xfs_dqunlock(dqp);
744 xfs_dqunlock(dqp); 743 return error;
745 }
746 return (error);
747} 744}
748 745
749 746
@@ -753,8 +750,7 @@ error_return:
753 * grp/prj quotas is important, because this follows a both-or-nothing 750 * grp/prj quotas is important, because this follows a both-or-nothing
754 * approach. 751 * approach.
755 * 752 *
756 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. 753 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
757 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
758 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. 754 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
759 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks 755 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
760 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks 756 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
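Dropping XFS_QMOPT_DQLOCK changes the locking contract of xfs_trans_dqresv(): it now unconditionally takes and releases the dquot lock itself, so callers must always pass the dquot unlocked. Sketched as a before/after comment (the argument list is assumed from the surrounding code; only the flag names appear in this diff):

/*
 * before:	xfs_dqlock(dqp);
 *		error = xfs_trans_dqresv(tp, mp, dqp, nblks, ninos,
 *					 flags | XFS_QMOPT_DQLOCK);
 *
 * after:	error = xfs_trans_dqresv(tp, mp, dqp, nblks, ninos, flags);
 *
 * The dquot must now be handed over unlocked; the reservation path
 * owns dq_lock for its entire hardlimit/softlimit check.
 */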
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae5482965424..3f3610a7ee05 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h" 25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 26#include "xfs_mount.h"
27#include "xfs_error.h"
27 28
28static char message[1024]; /* keep it off the stack */ 29static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock); 30static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7e..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
17 */ 17 */
18#include <xfs.h> 18#include <xfs.h>
19 19
20static DEFINE_MUTEX(uuid_monitor);
21static int uuid_table_size;
22static uuid_t *uuid_table;
23
24/* IRIX interpretation of an uuid_t */ 20/* IRIX interpretation of an uuid_t */
25typedef struct { 21typedef struct {
26 __be32 uu_timelow; 22 __be32 uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
46 fsid[1] = be32_to_cpu(uup->uu_timelow); 42 fsid[1] = be32_to_cpu(uup->uu_timelow);
47} 43}
48 44
49void
50uuid_create_nil(uuid_t *uuid)
51{
52 memset(uuid, 0, sizeof(*uuid));
53}
54
55int 45int
56uuid_is_nil(uuid_t *uuid) 46uuid_is_nil(uuid_t *uuid)
57{ 47{
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
71{ 61{
72 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; 62 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
73} 63}
74
75/*
76 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
77 * 64-bit words. NOTE: This function can not be changed EVER. Although
78 * brain-dead, some applications depend on this 64-bit value remaining
79 * persistent. Specifically, DMI vendors store the value as a persistent
80 * filehandle.
81 */
82__uint64_t
83uuid_hash64(uuid_t *uuid)
84{
85 __uint64_t *sp = (__uint64_t *)uuid;
86
87 return sp[0] + sp[1];
88}
89
90int
91uuid_table_insert(uuid_t *uuid)
92{
93 int i, hole;
94
95 mutex_lock(&uuid_monitor);
96 for (i = 0, hole = -1; i < uuid_table_size; i++) {
97 if (uuid_is_nil(&uuid_table[i])) {
98 hole = i;
99 continue;
100 }
101 if (uuid_equal(uuid, &uuid_table[i])) {
102 mutex_unlock(&uuid_monitor);
103 return 0;
104 }
105 }
106 if (hole < 0) {
107 uuid_table = kmem_realloc(uuid_table,
108 (uuid_table_size + 1) * sizeof(*uuid_table),
109 uuid_table_size * sizeof(*uuid_table),
110 KM_SLEEP);
111 hole = uuid_table_size++;
112 }
113 uuid_table[hole] = *uuid;
114 mutex_unlock(&uuid_monitor);
115 return 1;
116}
117
118void
119uuid_table_remove(uuid_t *uuid)
120{
121 int i;
122
123 mutex_lock(&uuid_monitor);
124 for (i = 0; i < uuid_table_size; i++) {
125 if (uuid_is_nil(&uuid_table[i]))
126 continue;
127 if (!uuid_equal(uuid, &uuid_table[i]))
128 continue;
129 uuid_create_nil(&uuid_table[i]);
130 break;
131 }
132 ASSERT(i < uuid_table_size);
133 mutex_unlock(&uuid_monitor);
134}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d445..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
22 unsigned char __u_bits[16]; 22 unsigned char __u_bits[16];
23} uuid_t; 23} uuid_t;
24 24
25extern void uuid_create_nil(uuid_t *uuid);
26extern int uuid_is_nil(uuid_t *uuid); 25extern int uuid_is_nil(uuid_t *uuid);
27extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
28extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
29extern __uint64_t uuid_hash64(uuid_t *uuid);
30extern int uuid_table_insert(uuid_t *uuid);
31extern void uuid_table_remove(uuid_t *uuid);
32 28
33#endif /* __XFS_SUPPORT_UUID_H__ */ 29#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20a..c8641f713caa 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) 223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
224#define XFS_MIN_FREELIST_PAG(pag,mp) \ 224#define XFS_MIN_FREELIST_PAG(pag,mp) \
225 (XFS_MIN_FREELIST_RAW( \ 225 (XFS_MIN_FREELIST_RAW( \
226 (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ 226 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
227 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) 227 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
228 228
229#define XFS_AGB_TO_FSB(mp,agno,agbno) \ 229#define XFS_AGB_TO_FSB(mp,agno,agbno) \
230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) 230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea9..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
1872} 1872}
1873 1873
1874/* 1874/*
1875 * Find the length of the longest extent in an AG.
1876 */
1877xfs_extlen_t
1878xfs_alloc_longest_free_extent(
1879 struct xfs_mount *mp,
1880 struct xfs_perag *pag)
1881{
1882 xfs_extlen_t need, delta = 0;
1883
1884 need = XFS_MIN_FREELIST_PAG(pag, mp);
1885 if (need > pag->pagf_flcount)
1886 delta = need - pag->pagf_flcount;
1887
1888 if (pag->pagf_longest > delta)
1889 return pag->pagf_longest - delta;
1890 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1891}
1892
1893/*
1875 * Decide whether to use this allocation group for this allocation. 1894 * Decide whether to use this allocation group for this allocation.
1876 * If so, fix up the btree freelist's size. 1895 * If so, fix up the btree freelist's size.
1877 */ 1896 */
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
1923 } 1942 }
1924 1943
1925 if (!(flags & XFS_ALLOC_FLAG_FREEING)) { 1944 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1926 need = XFS_MIN_FREELIST_PAG(pag, mp);
1927 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1928 /* 1945 /*
1929 * If it looks like there isn't a long enough extent, or enough 1946 * If it looks like there isn't a long enough extent, or enough
1930 * total blocks, reject it. 1947 * total blocks, reject it.
1931 */ 1948 */
1932 longest = (pag->pagf_longest > delta) ? 1949 need = XFS_MIN_FREELIST_PAG(pag, mp);
1933 (pag->pagf_longest - delta) : 1950 longest = xfs_alloc_longest_free_extent(mp, pag);
1934 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
1935 if ((args->minlen + args->alignment + args->minalignslop - 1) > 1951 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1936 longest || 1952 longest ||
1937 ((int)(pag->pagf_freeblks + pag->pagf_flcount - 1953 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
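The new xfs_alloc_longest_free_extent() helper centralizes a computation that was open-coded here and, as later hunks show, in xfs_bmap_btalloc() and _xfs_filestream_pick_ag(): the longest allocatable extent in an AG, discounted by however many blocks the freelist still needs to refill. A standalone demo of the arithmetic (plain user-space C, made-up field values):

#include <stdio.h>

typedef unsigned int xfs_extlen_t;

/* Mirrors the helper added above, with the per-AG fields as params. */
static xfs_extlen_t
longest_free_extent(xfs_extlen_t need, xfs_extlen_t flcount,
		    xfs_extlen_t longest)
{
	xfs_extlen_t delta = need > flcount ? need - flcount : 0;

	if (longest > delta)
		return longest - delta;
	/* degrade to a 0/1 "anything free at all" answer */
	return flcount > 0 || longest > 0;
}

int main(void)
{
	/* freelist needs 6 blocks, holds 4: delta 2, so 100 -> 98 */
	printf("%u\n", longest_free_extent(6, 4, 100));
	/* fully depleted AG: 0 */
	printf("%u\n", longest_free_extent(6, 0, 0));
	return 0;
}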
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7b..e704caee10df 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ 100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ 101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
102 102
103/*
104 * Find the length of the longest extent in an AG.
105 */
106xfs_extlen_t
107xfs_alloc_longest_free_extent(struct xfs_mount *mp,
108 struct xfs_perag *pag);
103 109
104#ifdef __KERNEL__ 110#ifdef __KERNEL__
105 111
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd1..afdc8911637d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
155 * minimum offset only needs to be the space required for 155 * minimum offset only needs to be the space required for
156 * the btree root. 156 * the btree root.
157 */ 157 */
158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) 158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
159 xfs_default_attroffset(dp))
159 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 160 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
160 break; 161 break;
161 162
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
298} 299}
299 300
300/* 301/*
302 * After the last attribute is removed revert to original inode format,
303 * making all literal area available to the data fork once more.
304 */
305STATIC void
306xfs_attr_fork_reset(
307 struct xfs_inode *ip,
308 struct xfs_trans *tp)
309{
310 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
311 ip->i_d.di_forkoff = 0;
312 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
313
314 ASSERT(ip->i_d.di_anextents == 0);
315 ASSERT(ip->i_afp == NULL);
316
317 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
318 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
319}
320
321/*
301 * Remove an attribute from the shortform attribute list structure. 322 * Remove an attribute from the shortform attribute list structure.
302 */ 323 */
303int 324int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
344 */ 365 */
345 totsize -= size; 366 totsize -= size;
346 if (totsize == sizeof(xfs_attr_sf_hdr_t) && 367 if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
347 !(args->op_flags & XFS_DA_OP_ADDNAME) && 368 (mp->m_flags & XFS_MOUNT_ATTR2) &&
348 (mp->m_flags & XFS_MOUNT_ATTR2) && 369 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
349 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { 370 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
350 /* 371 xfs_attr_fork_reset(dp, args->trans);
351 * Last attribute now removed, revert to original
352 * inode format making all literal area available
353 * to the data fork once more.
354 */
355 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
356 dp->i_d.di_forkoff = 0;
357 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
358 ASSERT(dp->i_d.di_anextents == 0);
359 ASSERT(dp->i_afp == NULL);
360 dp->i_df.if_ext_max =
361 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
362 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
363 } else { 372 } else {
364 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 373 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
365 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 374 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
786 if (forkoff == -1) { 795 if (forkoff == -1) {
787 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); 796 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
788 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); 797 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
789 798 xfs_attr_fork_reset(dp, args->trans);
790 /*
791 * Last attribute was removed, revert to original
792 * inode format making all literal area available
793 * to the data fork once more.
794 */
795 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
796 dp->i_d.di_forkoff = 0;
797 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
798 ASSERT(dp->i_d.di_anextents == 0);
799 ASSERT(dp->i_afp == NULL);
800 dp->i_df.if_ext_max =
801 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
802 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
803 goto out; 799 goto out;
804 } 800 }
805 801
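xfs_attr_fork_reset() deduplicates the two identical "last attribute removed" blocks above. Its trigger in the shortform-remove path, reproduced from the hunk with annotations added (only the comments are new):

	totsize -= size;
	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&	/* only the header is left */
	    (mp->m_flags & XFS_MOUNT_ATTR2) &&		/* ATTR2 mount */
	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && /* data fork not btree */
	    !(args->op_flags & XFS_DA_OP_ADDNAME))	/* pure remove, not a replace */
		xfs_attr_fork_reset(dp, args->trans);	/* literal area back to data fork */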
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aaea..3a6ed426327a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2480 /* 2480 /*
2481 * If allocating at eof, and there's a previous real block, 2481 * If allocating at eof, and there's a previous real block,
2482 * try to use it's last block as our starting point. 2482 * try to use its last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !isnullstartblock(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
2712 xfs_agnumber_t startag; 2712 xfs_agnumber_t startag;
2713 xfs_alloc_arg_t args; 2713 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2714 xfs_extlen_t blen;
2715 xfs_extlen_t delta;
2716 xfs_extlen_t longest;
2717 xfs_extlen_t need;
2718 xfs_extlen_t nextminlen = 0; 2715 xfs_extlen_t nextminlen = 0;
2719 xfs_perag_t *pag; 2716 xfs_perag_t *pag;
2720 int nullfb; /* true if ap->firstblock isn't set */ 2717 int nullfb; /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
2796 * See xfs_alloc_fix_freelist... 2793 * See xfs_alloc_fix_freelist...
2797 */ 2794 */
2798 if (pag->pagf_init) { 2795 if (pag->pagf_init) {
2799 need = XFS_MIN_FREELIST_PAG(pag, mp); 2796 xfs_extlen_t longest;
2800 delta = need > pag->pagf_flcount ? 2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2801 need - pag->pagf_flcount : 0;
2802 longest = (pag->pagf_longest > delta) ?
2803 (pag->pagf_longest - delta) :
2804 (pag->pagf_flcount > 0 ||
2805 pag->pagf_longest > 0);
2806 if (blen < longest) 2798 if (blen < longest)
2807 blen = longest; 2799 blen = longest;
2808 } else 2800 } else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
3577} 3569}
3578 3570
3579/* 3571/*
3572 * Calculate the default attribute fork offset for newly created inodes.
3573 */
3574uint
3575xfs_default_attroffset(
3576 struct xfs_inode *ip)
3577{
3578 struct xfs_mount *mp = ip->i_mount;
3579 uint offset;
3580
3581 if (mp->m_sb.sb_inodesize == 256) {
3582 offset = XFS_LITINO(mp) -
3583 XFS_BMDR_SPACE_CALC(MINABTPTRS);
3584 } else {
3585 offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
3586 }
3587
3588 ASSERT(offset < XFS_LITINO(mp));
3589 return offset;
3590}
3591
3592/*
3580 * Helper routine to reset inode di_forkoff field when switching 3593 * Helper routine to reset inode di_forkoff field when switching
3581 * attribute fork from local to extent format - we reset it where 3594 * attribute fork from local to extent format - we reset it where
3582 * possible to make space available for inline data fork extents. 3595 * possible to make space available for inline data fork extents.
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
3588 int whichfork) 3601 int whichfork)
3589{ 3602{
3590 if (whichfork == XFS_ATTR_FORK && 3603 if (whichfork == XFS_ATTR_FORK &&
3591	    (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
3592	    (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
3593	    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3594	    ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
3595		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
3596		ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
3597					(uint)sizeof(xfs_bmbt_rec_t);
3598		ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
3599					(uint)sizeof(xfs_bmbt_rec_t);
3604	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
3605	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
3606	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3607		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3608
3609		if (dfl_forkoff > ip->i_d.di_forkoff) {
3610			ip->i_d.di_forkoff = dfl_forkoff;
3611			ip->i_df.if_ext_max =
3612				XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3613			ip->i_afp->if_ext_max =
3614				XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3615		}
3600 } 3616 }
3601} 3617}
3602 3618
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
4065 case XFS_DINODE_FMT_BTREE: 4081 case XFS_DINODE_FMT_BTREE:
4066 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); 4082 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
4067 if (!ip->i_d.di_forkoff) 4083 if (!ip->i_d.di_forkoff)
4068 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 4084 ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
4069 else if (mp->m_flags & XFS_MOUNT_ATTR2) 4085 else if (mp->m_flags & XFS_MOUNT_ATTR2)
4070 version = 2; 4086 version = 2;
4071 break; 4087 break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
4212 * (a signed 16-bit number, xfs_aextnum_t). 4228 * (a signed 16-bit number, xfs_aextnum_t).
4213 * 4229 *
4214 * Note that we can no longer assume that if we are in ATTR1 that 4230 * Note that we can no longer assume that if we are in ATTR1 that
4215 * the fork offset of all the inodes will be (m_attroffset >> 3) 4231 * the fork offset of all the inodes will be
4216 * because we could have mounted with ATTR2 and then mounted back 4232 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
4217 * with ATTR1, keeping the di_forkoff's fixed but probably at 4233 * with ATTR2 and then mounted back with ATTR1, keeping the
4218 * various positions. Therefore, for both ATTR1 and ATTR2 4234 * di_forkoff's fixed but probably at various positions. Therefore,
4219 * we have to assume the worst case scenario of a minimum size 4235 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
4220 * available. 4236 * of a minimum size available.
4221 */ 4237 */
4222 if (whichfork == XFS_DATA_FORK) { 4238 if (whichfork == XFS_DATA_FORK) {
4223 maxleafents = MAXEXTNUM; 4239 maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
4804 xfs_extlen_t minlen; /* min allocation size */ 4820 xfs_extlen_t minlen; /* min allocation size */
4805 xfs_mount_t *mp; /* xfs mount structure */ 4821 xfs_mount_t *mp; /* xfs mount structure */
4806 int n; /* current extent index */ 4822 int n; /* current extent index */
4807 int nallocs; /* number of extents alloc\'d */ 4823 int nallocs; /* number of extents alloc'd */
4808 xfs_extnum_t nextents; /* number of extents in file */ 4824 xfs_extnum_t nextents; /* number of extents in file */
4809 xfs_fileoff_t obno; /* old block number (offset) */ 4825 xfs_fileoff_t obno; /* old block number (offset) */
4810 xfs_bmbt_irec_t prev; /* previous file extent record */ 4826 xfs_bmbt_irec_t prev; /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
6204 return(bp); 6220 return(bp);
6205} 6221}
6206 6222
6207void 6223STATIC void
6208xfs_check_block( 6224xfs_check_block(
6209 struct xfs_btree_block *block, 6225 struct xfs_btree_block *block,
6210 xfs_mount_t *mp, 6226 xfs_mount_t *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
6494 block = XFS_BUF_TO_BLOCK(bp); 6510 block = XFS_BUF_TO_BLOCK(bp);
6495 6511
6496 if (--level) { 6512 if (--level) {
6497 /* Not at node above leafs, count this level of nodes */ 6513 /* Not at node above leaves, count this level of nodes */
6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6514 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6499 while (nextbno != NULLFSBLOCK) { 6515 while (nextbno != NULLFSBLOCK) {
6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6516 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
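Both call sites of xfs_default_attroffset() above shift the result right by 3 before storing it, because di_forkoff records the attribute fork offset in 8-byte units. A quick standalone illustration of that convention (the byte offset is made up):

#include <stdio.h>

int main(void)
{
	unsigned int offset_bytes = 160;		/* hypothetical attr offset */
	unsigned char di_forkoff = offset_bytes >> 3;	/* stored in 8-byte units */

	printf("byte offset %u -> di_forkoff %u (x8 = %u bytes)\n",
	       offset_bytes, di_forkoff, (unsigned int)di_forkoff << 3);
	return 0;
}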
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d32..1b8ff9256bd0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ 125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
126 xfs_extlen_t alen; /* i/o length asked/allocated */ 126 xfs_extlen_t alen; /* i/o length asked/allocated */
127 xfs_extlen_t total; /* total blocks needed for xaction */ 127 xfs_extlen_t total; /* total blocks needed for xaction */
128 xfs_extlen_t minlen; /* mininum allocation size (blocks) */ 128 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
129 xfs_extlen_t minleft; /* amount must be left after alloc */ 129 xfs_extlen_t minleft; /* amount must be left after alloc */
130 char eof; /* set if allocating past last extent */ 130 char eof; /* set if allocating past last extent */
131 char wasdel; /* replacing a delayed allocation */ 131 char wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
338 xfs_extnum_t idx, 338 xfs_extnum_t idx,
339 xfs_extnum_t num); 339 xfs_extnum_t num);
340 340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);
344
341#ifdef __KERNEL__ 345#ifdef __KERNEL__
342 346
343/* 347/*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23f..e9df99574829 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
1883 1883
1884 /* 1884 /*
1885 * We add one entry to the left side and remove one for the right side. 1885 * We add one entry to the left side and remove one for the right side.
1886 * Accout for it here, the changes will be updated on disk and logged 1886 * Account for it here, the changes will be updated on disk and logged
1887 * later. 1887 * later.
1888 */ 1888 */
1889 lrecs++; 1889 lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
3535 XFS_BTREE_STATS_INC(cur, join); 3535 XFS_BTREE_STATS_INC(cur, join);
3536 3536
3537 /* 3537 /*
3538 * Fix up the the number of records and right block pointer in the 3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it. 3539 * surviving block, and log it.
3540 */ 3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs); 3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2f..4f852b735b96 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone;
41/* 41/*
42 * Generic btree header. 42 * Generic btree header.
43 * 43 *
44 * This is a comination of the actual format used on disk for short and long 44 * This is a combination of the actual format used on disk for short and long
45 * format btrees. The first three fields are shared by both format, but 45 * format btrees. The first three fields are shared by both format, but
46 * the pointers are different and should be used with care. 46 * the pointers are different and should be used with care.
47 * 47 *
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5b..9ff6e57a5075 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1503 * This is implemented with some source-level loop unrolling. 1503 * This is implemented with some source-level loop unrolling.
1504 */ 1504 */
1505xfs_dahash_t 1505xfs_dahash_t
1506xfs_da_hashname(const uchar_t *name, int namelen) 1506xfs_da_hashname(const __uint8_t *name, int namelen)
1507{ 1507{
1508 xfs_dahash_t hash; 1508 xfs_dahash_t hash;
1509 1509
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792d..8c536167bf75 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
91 * Structure to ease passing around component names. 91 * Structure to ease passing around component names.
92 */ 92 */
93typedef struct xfs_da_args { 93typedef struct xfs_da_args {
94 const uchar_t *name; /* string (maybe not NULL terminated) */ 94 const __uint8_t *name; /* string (maybe not NULL terminated) */
95 int namelen; /* length of string (maybe no NULL) */ 95 int namelen; /* length of string (maybe no NULL) */
96 uchar_t *value; /* set of bytes (maybe contain NULLs) */ 96 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
97 int valuelen; /* length of value */ 97 int valuelen; /* length of value */
98 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 98 int flags; /* argument flags (eg: ATTR_NOCREATE) */
99 xfs_dahash_t hashval; /* hash value of name */ 99 xfs_dahash_t hashval; /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
185 unsigned char inleaf; /* insert into 1->lf, 0->splf */ 185 unsigned char inleaf; /* insert into 1->lf, 0->splf */
186 unsigned char extravalid; /* T/F: extrablk is in use */ 186 unsigned char extravalid; /* T/F: extrablk is in use */
187 unsigned char extraafter; /* T/F: extrablk is after new */ 187 unsigned char extraafter; /* T/F: extrablk is after new */
188 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ 188 xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
189 /* for dirv2 extrablk is data */ 189 /* for dirv2 extrablk is data */
190} xfs_da_state_t; 190} xfs_da_state_t;
191 191
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
252 xfs_dabuf_t *dead_buf); 252 xfs_dabuf_t *dead_buf);
253 253
254uint xfs_da_hashname(const uchar_t *name_string, int name_length); 254uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 256 const char *name, int len);
257 257
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
268 268
269extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
270extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
271extern const struct xfs_nameops xfs_default_nameops;
271 272
272#endif /* __XFS_DA_BTREE_H__ */ 273#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d3..e6d839bddbf0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
79 goto out_put_target_file; 79 goto out_put_target_file;
80 } 80 }
81 81
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file;
86 }
87
82 ip = XFS_I(file->f_path.dentry->d_inode); 88 ip = XFS_I(file->f_path.dentry->d_inode);
83 tip = XFS_I(target_file->f_path.dentry->d_inode); 89 tip = XFS_I(target_file->f_path.dentry->d_inode);
84 90
@@ -118,19 +124,17 @@ xfs_swap_extents(
118 xfs_bstat_t *sbp = &sxp->sx_stat; 124 xfs_bstat_t *sbp = &sxp->sx_stat;
119 xfs_ifork_t *tempifp, *ifp, *tifp; 125 xfs_ifork_t *tempifp, *ifp, *tifp;
120 int ilf_fields, tilf_fields; 126 int ilf_fields, tilf_fields;
121 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
122 int error = 0; 127 int error = 0;
123 int aforkblks = 0; 128 int aforkblks = 0;
124 int taforkblks = 0; 129 int taforkblks = 0;
125 __uint64_t tmp; 130 __uint64_t tmp;
126 char locked = 0;
127 131
128 mp = ip->i_mount; 132 mp = ip->i_mount;
129 133
130 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 134 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
131 if (!tempifp) { 135 if (!tempifp) {
132 error = XFS_ERROR(ENOMEM); 136 error = XFS_ERROR(ENOMEM);
133 goto error0; 137 goto out;
134 } 138 }
135 139
136 sbp = &sxp->sx_stat; 140 sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
143 */ 147 */
144 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 148 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
145 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 149 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
146 locked = 1;
147 150
148 /* Verify that both files have the same format */ 151 /* Verify that both files have the same format */
149 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 152 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
150 error = XFS_ERROR(EINVAL); 153 error = XFS_ERROR(EINVAL);
151 goto error0; 154 goto out_unlock;
152 } 155 }
153 156
154 /* Verify both files are either real-time or non-realtime */ 157 /* Verify both files are either real-time or non-realtime */
155 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 158 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
156 error = XFS_ERROR(EINVAL); 159 error = XFS_ERROR(EINVAL);
157 goto error0; 160 goto out_unlock;
158 } 161 }
159 162
160 /* Should never get a local format */ 163 /* Should never get a local format */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
162 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
163 error = XFS_ERROR(EINVAL); 166 error = XFS_ERROR(EINVAL);
164 goto error0; 167 goto out_unlock;
165 } 168 }
166 169
167 if (VN_CACHED(VFS_I(tip)) != 0) { 170 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
169 error = xfs_flushinval_pages(tip, 0, -1, 172 error = xfs_flushinval_pages(tip, 0, -1,
170 FI_REMAPF_LOCKED); 173 FI_REMAPF_LOCKED);
171 if (error) 174 if (error)
172 goto error0; 175 goto out_unlock;
173 } 176 }
174 177
175 /* Verify O_DIRECT for ftmp */ 178 /* Verify O_DIRECT for ftmp */
176 if (VN_CACHED(VFS_I(tip)) != 0) { 179 if (VN_CACHED(VFS_I(tip)) != 0) {
177 error = XFS_ERROR(EINVAL); 180 error = XFS_ERROR(EINVAL);
178 goto error0; 181 goto out_unlock;
179 } 182 }
180 183
181 /* Verify all data are being swapped */ 184 /* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
183 sxp->sx_length != ip->i_d.di_size || 186 sxp->sx_length != ip->i_d.di_size ||
184 sxp->sx_length != tip->i_d.di_size) { 187 sxp->sx_length != tip->i_d.di_size) {
185 error = XFS_ERROR(EFAULT); 188 error = XFS_ERROR(EFAULT);
186 goto error0; 189 goto out_unlock;
187 } 190 }
188 191
189 /* 192 /*
@@ -193,7 +196,7 @@ xfs_swap_extents(
193 */ 196 */
194 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
195 error = XFS_ERROR(EINVAL); 198 error = XFS_ERROR(EINVAL);
196 goto error0; 199 goto out_unlock;
197 } 200 }
198 201
199 /* 202 /*
@@ -208,7 +211,7 @@ xfs_swap_extents(
208 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || 211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
209 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { 212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
210 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
211 goto error0; 214 goto out_unlock;
212 } 215 }
213 216
214 /* We need to fail if the file is memory mapped. Once we have tossed 217 /* We need to fail if the file is memory mapped. Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
219 */ 222 */
220 if (VN_MAPPED(VFS_I(ip))) { 223 if (VN_MAPPED(VFS_I(ip))) {
221 error = XFS_ERROR(EBUSY); 224 error = XFS_ERROR(EBUSY);
222 goto error0; 225 goto out_unlock;
223 } 226 }
224 227
225 xfs_iunlock(ip, XFS_ILOCK_EXCL); 228 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
242 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 245 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
243 xfs_iunlock(tip, XFS_IOLOCK_EXCL); 246 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
244 xfs_trans_cancel(tp, 0); 247 xfs_trans_cancel(tp, 0);
245 locked = 0; 248 goto out;
246 goto error0;
247 } 249 }
248 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 250 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
249 251
@@ -253,19 +255,15 @@ xfs_swap_extents(
253 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && 255 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
254 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 256 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
255 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); 257 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
256 if (error) { 258 if (error)
257 xfs_trans_cancel(tp, 0); 259 goto out_trans_cancel;
258 goto error0;
259 }
260 } 260 }
261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && 261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, 263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
264 &taforkblks); 264 &taforkblks);
265 if (error) { 265 if (error)
266 xfs_trans_cancel(tp, 0); 266 goto out_trans_cancel;
267 goto error0;
268 }
269 } 267 }
270 268
271 /* 269 /*
@@ -332,10 +330,10 @@ xfs_swap_extents(
332 330
333 331
334 IHOLD(ip); 332 IHOLD(ip);
335 xfs_trans_ijoin(tp, ip, lock_flags); 333 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
336 334
337 IHOLD(tip); 335 IHOLD(tip);
338 xfs_trans_ijoin(tp, tip, lock_flags); 336 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
339 337
340 xfs_trans_log_inode(tp, ip, ilf_fields); 338 xfs_trans_log_inode(tp, ip, ilf_fields);
341 xfs_trans_log_inode(tp, tip, tilf_fields); 339 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
344 * If this is a synchronous mount, make sure that the 342 * If this is a synchronous mount, make sure that the
345 * transaction goes to disk before returning to the user. 343 * transaction goes to disk before returning to the user.
346 */ 344 */
347 if (mp->m_flags & XFS_MOUNT_WSYNC) { 345 if (mp->m_flags & XFS_MOUNT_WSYNC)
348 xfs_trans_set_sync(tp); 346 xfs_trans_set_sync(tp);
349 }
350 347
351 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
352 locked = 0;
353 349
 354 error0:
 355	if (locked) {
 356		xfs_iunlock(ip, lock_flags);
 357		xfs_iunlock(tip, lock_flags);
 358	}
 359	if (tempifp != NULL)
 360		kmem_free(tempifp);
 350out_unlock:
 351	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 352	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 353out:
 354	kmem_free(tempifp);
361 return error; 355 return error;
356
357out_trans_cancel:
358 xfs_trans_cancel(tp, 0);
359 goto out_unlock;
362} 360}
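The xfs_swap_extents() rewrite replaces the locked/lock_flags bookkeeping with a conventional goto ladder: each failure jumps to the label that unwinds exactly what has been acquired so far, and the transaction-cancel path jumps back into the common unlock path. The skeleton of the idiom, reduced to standalone C (resource names are generic, not from the diff):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int
example_ladder(int fail_validate, int fail_mid_trans)
{
	void *tmp;
	int error = 0;

	tmp = malloc(64);		/* like the tempifp allocation */
	if (!tmp) {
		error = ENOMEM;
		goto out;		/* nothing else to unwind yet */
	}

	/* take locks here, like xfs_lock_two_inodes() */
	if (fail_validate) {
		error = EINVAL;
		goto out_unlock;	/* drop locks, then the allocation */
	}

	/* start a transaction here */
	if (fail_mid_trans) {
		error = EIO;
		goto out_trans_cancel;	/* deepest label unwinds the most */
	}
	/* commit */

out_unlock:
	/* drop locks here */
out:
	free(tmp);			/* free(NULL) is a no-op, like kmem_free() */
	return error;

out_trans_cancel:
	/* cancel the transaction here */
	goto out_unlock;
}

int main(void)
{
	printf("%d %d %d\n", example_ladder(0, 0),
	       example_ladder(1, 0), example_ladder(0, 1));
	return 0;
}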
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5e..e5b153b2e6a3 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
103/* 103/*
104 * Inode size for given fs. 104 * Inode size for given fs.
105 */ 105 */
106#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) \
107 ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
108
107#define XFS_BROOT_SIZE_ADJ \ 109#define XFS_BROOT_SIZE_ADJ \
108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) 110 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
109 111
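XFS_LITINO() is now computed on demand from the superblock inode size instead of being cached in m_litino at mount time. The arithmetic as a standalone demo (the dinode header size is mocked; the real value comes from the on-disk struct xfs_dinode layout):

#include <stdio.h>

#define MOCK_DINODE_SIZE 100	/* stand-in for sizeof(struct xfs_dinode) */

/* literal (fork) area = inode size - fixed dinode header */
static int litino(int sb_inodesize)
{
	return sb_inodesize - MOCK_DINODE_SIZE;
}

int main(void)
{
	printf("256-byte inodes: %d bytes of literal area\n", litino(256));
	printf("512-byte inodes: %d bytes of literal area\n", litino(512));
	return 0;
}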
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8d..c657bec6d951 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = {"..", 2};
48 48
49extern const struct xfs_nameops xfs_default_nameops;
50
51/* 49/*
52 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
53 * used in IRIX. 51 * used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf04..ab52e9e1c1ee 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
448 xfs_mount_t *mp; /* filesystem mount point */ 448 xfs_mount_t *mp; /* filesystem mount point */
449 char *ptr; /* current data entry */ 449 char *ptr; /* current data entry */
450 int wantoff; /* starting block offset */ 450 int wantoff; /* starting block offset */
451 xfs_ino_t ino;
452 xfs_off_t cook; 451 xfs_off_t cook;
453 452
454 mp = dp->i_mount; 453 mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
509 508
510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 509 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
511 (char *)dep - (char *)block); 510 (char *)dep - (char *)block);
512 ino = be64_to_cpu(dep->inumber);
513#if XFS_BIG_INUMS
514 ino += mp->m_inoadd;
515#endif
516 511
517 /* 512 /*
518 * If it didn't fit, set the final offset to here & return. 513 * If it didn't fit, set the final offset to here & return.
519 */ 514 */
520 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
521 ino, DT_UNKNOWN)) { 516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
522 *offset = cook & 0x7fffffff; 517 *offset = cook & 0x7fffffff;
523 xfs_da_brelse(NULL, bp); 518 xfs_da_brelse(NULL, bp);
524 return 0; 519 return 0;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e0252739..efbc290c7fec 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
38 38
39/* 39/*
40 * Directory address space divided into sections, 40 * Directory address space divided into sections,
41 * spaces separated by 32gb. 41 * spaces separated by 32GB.
42 */ 42 */
43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) 43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
44#define XFS_DIR2_DATA_SPACE 0 44#define XFS_DIR2_DATA_SPACE 0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374eec..fa913e459442 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
549 * Check the internal consistency of a leaf1 block. 549 * Check the internal consistency of a leaf1 block.
550 * Pop an assert if something is wrong. 550 * Pop an assert if something is wrong.
551 */ 551 */
552void 552STATIC void
553xfs_dir2_leaf_check( 553xfs_dir2_leaf_check(
554 xfs_inode_t *dp, /* incore directory inode */ 554 xfs_inode_t *dp, /* incore directory inode */
555 xfs_dabuf_t *bp) /* leaf's buffer */ 555 xfs_dabuf_t *bp) /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
780 int ra_index; /* *map index for read-ahead */ 780 int ra_index; /* *map index for read-ahead */
781 int ra_offset; /* map entry offset for ra */ 781 int ra_offset; /* map entry offset for ra */
782 int ra_want; /* readahead count wanted */ 782 int ra_want; /* readahead count wanted */
783 xfs_ino_t ino;
784 783
785 /* 784 /*
786 * If the offset is at or past the largest allowed value, 785 * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
1076 continue; 1075 continue;
1077 } 1076 }
1078 1077
1079 /*
1080 * Copy the entry into the putargs, and try formatting it.
1081 */
1082 dep = (xfs_dir2_data_entry_t *)ptr; 1078 dep = (xfs_dir2_data_entry_t *)ptr;
1083
1084 length = xfs_dir2_data_entsize(dep->namelen); 1079 length = xfs_dir2_data_entsize(dep->namelen);
1085 1080
1086 ino = be64_to_cpu(dep->inumber);
1087#if XFS_BIG_INUMS
1088 ino += mp->m_inoadd;
1089#endif
1090
1091 /*
1092 * Won't fit. Return to caller.
1093 */
1094 if (filldir(dirent, dep->name, dep->namelen, 1081 if (filldir(dirent, dep->name, dep->namelen,
1095 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1082 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1096 ino, DT_UNKNOWN)) 1083 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1097 break; 1084 break;
1098 1085
1099 /* 1086 /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc6..5a81ccd1045b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
1104 } 1104 }
1105 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1106 /* 1106 /*
1107 * Return indication of whether this leaf block is emtpy enough 1107 * Return indication of whether this leaf block is empty enough
1108 * to justify trying to join it with a neighbor. 1108 * to justify trying to join it with a neighbor.
1109 */ 1109 */
1110 *rval = 1110 *rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5b..e89734e84646 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
748 * Put . entry unless we're starting past it. 748 * Put . entry unless we're starting past it.
749 */ 749 */
750 if (*offset <= dot_offset) { 750 if (*offset <= dot_offset) {
751 ino = dp->i_ino; 751 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd;
754#endif
755 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
756 *offset = dot_offset & 0x7fffffff; 752 *offset = dot_offset & 0x7fffffff;
757 return 0; 753 return 0;
758 } 754 }
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
763 */ 759 */
764 if (*offset <= dotdot_offset) { 760 if (*offset <= dotdot_offset) {
765 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); 761 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
766#if XFS_BIG_INUMS
767 ino += mp->m_inoadd;
768#endif
769 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 762 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
770 *offset = dotdot_offset & 0x7fffffff; 763 *offset = dotdot_offset & 0x7fffffff;
771 return 0; 764 return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
786 } 779 }
787 780
788 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
789#if XFS_BIG_INUMS
790 ino += mp->m_inoadd;
791#endif
792
793 if (filldir(dirent, sfep->name, sfep->namelen, 782 if (filldir(dirent, sfep->name, sfep->namelen,
794 off & 0x7fffffff, ino, DT_UNKNOWN)) { 783 off & 0x7fffffff, ino, DT_UNKNOWN)) {
795 *offset = off & 0x7fffffff; 784 *offset = off & 0x7fffffff;
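All three getdents paths in this diff (block, leaf, and the shortform one above) drop the same #if XFS_BIG_INUMS adjustment, so readdir now reports the raw on-disk inode number. What the deleted lines used to do, reduced to arithmetic (the size and purpose of the bias are assumptions; m_inoadd is gone after this series):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ondisk_ino = 128;
	uint64_t m_inoadd = 1ULL << 32;		/* hypothetical bias */

	/* old behaviour: biased number handed to filldir */
	printf("old: %llu\n", (unsigned long long)(ondisk_ino + m_inoadd));
	/* new behaviour: the raw on-disk number */
	printf("new: %llu\n", (unsigned long long)ondisk_ino);
	return 0;
}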
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85f..0d22c56fdf64 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
33 * conversion routine. 33 * conversion routine.
34 */ 34 */
35 35
36#ifndef HAVE_FORMAT32
37typedef struct xfs_extent_32 { 36typedef struct xfs_extent_32 {
38 __uint64_t ext_start; 37 __uint64_t ext_start;
39 __uint32_t ext_len; 38 __uint32_t ext_len;
40} __attribute__((packed)) xfs_extent_32_t; 39} __attribute__((packed)) xfs_extent_32_t;
41#endif
42 40
43typedef struct xfs_extent_64 { 41typedef struct xfs_extent_64 {
44 __uint64_t ext_start; 42 __uint64_t ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
59 xfs_extent_t efi_extents[1]; /* array of extents to free */ 57 xfs_extent_t efi_extents[1]; /* array of extents to free */
60} xfs_efi_log_format_t; 58} xfs_efi_log_format_t;
61 59
62#ifndef HAVE_FORMAT32
63typedef struct xfs_efi_log_format_32 { 60typedef struct xfs_efi_log_format_32 {
64 __uint16_t efi_type; /* efi log item type */ 61 __uint16_t efi_type; /* efi log item type */
65 __uint16_t efi_size; /* size of this item */ 62 __uint16_t efi_size; /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
67 __uint64_t efi_id; /* efi identifier */ 64 __uint64_t efi_id; /* efi identifier */
68 xfs_extent_32_t efi_extents[1]; /* array of extents to free */ 65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
69} __attribute__((packed)) xfs_efi_log_format_32_t; 66} __attribute__((packed)) xfs_efi_log_format_32_t;
70#endif
71 67
72typedef struct xfs_efi_log_format_64 { 68typedef struct xfs_efi_log_format_64 {
73 __uint16_t efi_type; /* efi log item type */ 69 __uint16_t efi_type; /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
90 xfs_extent_t efd_extents[1]; /* array of extents freed */ 86 xfs_extent_t efd_extents[1]; /* array of extents freed */
91} xfs_efd_log_format_t; 87} xfs_efd_log_format_t;
92 88
93#ifndef HAVE_FORMAT32
94typedef struct xfs_efd_log_format_32 { 89typedef struct xfs_efd_log_format_32 {
95 __uint16_t efd_type; /* efd log item type */ 90 __uint16_t efd_type; /* efd log item type */
96 __uint16_t efd_size; /* size of this item */ 91 __uint16_t efd_size; /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
98 __uint64_t efd_efi_id; /* id of corresponding efi */ 93 __uint64_t efd_efi_id; /* id of corresponding efi */
99 xfs_extent_32_t efd_extents[1]; /* array of extents freed */ 94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
100} __attribute__((packed)) xfs_efd_log_format_32_t; 95} __attribute__((packed)) xfs_efd_log_format_32_t;
101#endif
102 96
103typedef struct xfs_efd_log_format_64 { 97typedef struct xfs_efd_log_format_64 {
104 __uint16_t efd_type; /* efd log item type */ 98 __uint16_t efd_type; /* efd log item type */
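Dropping the #ifndef HAVE_FORMAT32 guards makes the packed 32-bit log-format structs unconditionally visible, so log recovery can always parse items written by a kernel with the other word size. Why the _32 variants are packed, shown standalone (uintNN_t stands in for the kernel's __uintNN_t):

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t ext_start;
	uint32_t ext_len;
} __attribute__((packed)) extent_32_demo;	/* fixed 12-byte layout */

typedef struct {
	uint64_t ext_start;
	uint32_t ext_len;
} extent_unpacked_demo;				/* padded on 64-bit ABIs */

int main(void)
{
	printf("packed:   %zu bytes\n", sizeof(extent_32_demo));	/* 12 */
	printf("unpacked: %zu bytes\n", sizeof(extent_unpacked_demo));	/* usually 16 */
	return 0;
}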
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384e..6c87c8f304ef 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
140 xfs_extlen_t minlen) 140 xfs_extlen_t minlen)
141{ 141{
142 int err, trylock, nscan; 142 int err, trylock, nscan;
143 xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; 143 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
145 struct xfs_perag *pag; 145 struct xfs_perag *pag;
146 146
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
186 goto next_ag; 186 goto next_ag;
187 } 187 }
188 188
189 need = XFS_MIN_FREELIST_PAG(pag, mp); 189 longest = xfs_alloc_longest_free_extent(mp, pag);
190 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
191 longest = (pag->pagf_longest > delta) ?
192 (pag->pagf_longest - delta) :
193 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
194
195 if (((minlen && longest >= minlen) || 190 if (((minlen && longest >= minlen) ||
196 (!minlen && pag->pagf_freeblks >= minfree)) && 191 (!minlen && pag->pagf_freeblks >= minfree)) &&
197 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 192 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec932..8379e3bca26c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
576 if (fdblks_delta) { 576 if (fdblks_delta) {
577 /* 577 /*
578 * If we are putting blocks back here, m_resblks_avail is 578 * If we are putting blocks back here, m_resblks_avail is
579 * already at it's max so this will put it in the free pool. 579 * already at its max so this will put it in the free pool.
580 * 580 *
581 * If we need space, we'll either succeed in getting it 581 * If we need space, we'll either succeed in getting it
582 * from the free block count or we'll get an enospc. If 582 * from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7be..3120a3a5e20f 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
231 231
232 /* Allow space for the inode btree to split. */ 232 /* Allow space for the inode btree to split. */
233 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 233 args.minleft = args.mp->m_in_maxlevels - 1;
234 if ((error = xfs_alloc_vextent(&args))) 234 if ((error = xfs_alloc_vextent(&args)))
235 return error; 235 return error;
236 } else 236 } else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
270 /* 270 /*
271 * Allow space for the inode btree to split. 271 * Allow space for the inode btree to split.
272 */ 272 */
273 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 273 args.minleft = args.mp->m_in_maxlevels - 1;
274 if ((error = xfs_alloc_vextent(&args))) 274 if ((error = xfs_alloc_vextent(&args)))
275 return error; 275 return error;
276 } 276 }
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
349 * Initialize all inodes in this buffer and then log them. 349 * Initialize all inodes in this buffer and then log them.
350 * 350 *
351 * XXX: It would be much better if we had just one transaction to 351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic. 353 * transactions causing a lot of log traffic.
354 */ 354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
944 XFS_INODES_PER_CHUNK) == 0); 944 XFS_INODES_PER_CHUNK) == 0);
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 XFS_INOBT_CLR_FREE(&rec, offset); 946 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 947 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
949 rec.ir_free))) 949 rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
1105 */ 1105 */
1106 off = agino - rec.ir_startino; 1106 off = agino - rec.ir_startino;
1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); 1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1108 ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); 1108 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1109 /* 1109 /*
1110 * Mark the inode free & increment the count. 1110 * Mark the inode free & increment the count.
1111 */ 1111 */
1112 XFS_INOBT_SET_FREE(&rec, off); 1112 rec.ir_free |= XFS_INOBT_MASK(off);
1113 rec.ir_freecount++; 1113 rec.ir_freecount++;
1114 1114
1115 /* 1115 /*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8e..c282a9af5393 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
164} 164}
165 165
166/* 166/*
167 * intial value of ptr for lookup 167 * initial value of ptr for lookup
168 */ 168 */
169STATIC void 169STATIC void
170xfs_inobt_init_ptr_from_cur( 170xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff06..f782ad0c4769 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ 32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
33 33
34typedef __uint64_t xfs_inofree_t; 34typedef __uint64_t xfs_inofree_t;
35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) 35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
38 39
39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
40{ 41{
41 return (((n) >= XFS_INODES_PER_CHUNK ? \ 42 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
42 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
43} 43}
44 44
45/* 45/*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
69typedef __be32 xfs_inobt_ptr_t; 69typedef __be32 xfs_inobt_ptr_t;
70 70
71/* 71/*
72 * Bit manipulations for ir_free.
73 */
74#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
75#define XFS_INOBT_IS_FREE(rp,i) \
76 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
77#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
78#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
79
80/*
81 * Maximum number of inode btree levels.
82 */
83#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
84
85/*
86 * block numbers in the AG. 72 * block numbers in the AG.
87 */ 73 */
88#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) 74#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
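The rewritten xfs_inobt_maskn() builds a run of n set bits starting at bit i. The ternary guards the n == XFS_INODES_PER_CHUNK case: shifting a 64-bit value by 64 is undefined, so the helper lets 0 - 1 wrap to the all-ones mask instead. A self-contained check of both paths (values chosen for illustration; i and n are assumed to lie in [0, 64]):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t xfs_inofree_t;
#define NBBY			8
#define XFS_INODES_PER_CHUNK	(NBBY * sizeof(xfs_inofree_t))	/* 64 */
#define XFS_INOBT_MASK(i)	((xfs_inofree_t)1 << (i))

static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
	/* n == 64: 0 - 1 wraps to all ones; otherwise (1 << n) - 1 */
	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}

int main(void)
{
	/* 4 bits starting at bit 2: 0b111100 */
	printf("%llx\n", (unsigned long long)xfs_inobt_maskn(2, 4));	/* 3c */
	/* full-width run: every bit set */
	printf("%llx\n", (unsigned long long)xfs_inobt_maskn(0, 64));	/* ffffffffffffffff */
	return 0;
}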
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b22..f879c1bc4b96 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
122 122
123/* 123/*
124 * NOTE: This structure must be kept identical to struct xfs_dinode 124 * NOTE: This structure must be kept identical to struct xfs_dinode
125 * in xfs_dinode.h except for the endianess annotations. 125 * in xfs_dinode.h except for the endianness annotations.
126 */ 126 */
127typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 128 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d54..a52ac125f055 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
40 __int32_t ilf_boffset; /* off of inode in buffer */ 40 __int32_t ilf_boffset; /* off of inode in buffer */
41} xfs_inode_log_format_t; 41} xfs_inode_log_format_t;
42 42
43#ifndef HAVE_FORMAT32
44typedef struct xfs_inode_log_format_32 { 43typedef struct xfs_inode_log_format_32 {
45 __uint16_t ilf_type; /* inode log item type */ 44 __uint16_t ilf_type; /* inode log item type */
46 __uint16_t ilf_size; /* size of this item */ 45 __uint16_t ilf_size; /* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
56 __int32_t ilf_len; /* len of inode buffer */ 55 __int32_t ilf_len; /* len of inode buffer */
57 __int32_t ilf_boffset; /* off of inode in buffer */ 56 __int32_t ilf_boffset; /* off of inode in buffer */
58} __attribute__((packed)) xfs_inode_log_format_32_t; 57} __attribute__((packed)) xfs_inode_log_format_32_t;
59#endif
60 58
61typedef struct xfs_inode_log_format_64 { 59typedef struct xfs_inode_log_format_64 {
62 __uint16_t ilf_type; /* inode log item type */ 60 __uint16_t ilf_type; /* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc2..a1cc1322fc0f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
63 */ 63 */
64 64
65typedef struct xfs_iomap { 65typedef struct xfs_iomap {
66 xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ 66 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
67 xfs_buftarg_t *iomap_target; 67 xfs_buftarg_t *iomap_target;
68 xfs_off_t iomap_offset; /* offset of mapping, bytes */ 68 xfs_off_t iomap_offset; /* offset of mapping, bytes */
69 xfs_off_t iomap_bsize; /* size of mapping, bytes */ 69 xfs_off_t iomap_bsize; /* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec90..aeb2d2221c7d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
83 buf->bs_uid = dic->di_uid; 83 buf->bs_uid = dic->di_uid;
84 buf->bs_gid = dic->di_gid; 84 buf->bs_gid = dic->di_gid;
85 buf->bs_size = dic->di_size; 85 buf->bs_size = dic->di_size;
86 vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); 86 /*
87 * We are reading the atime from the Linux inode because the
88 * dinode might not be uptodate.
89 */
90 buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
91 buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
87 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 92 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
88 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 93 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
89 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 94 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
579 * first inode of the cluster. 584 * first inode of the cluster.
580 * 585 *
581 * Careful with clustidx. There can be 586 * Careful with clustidx. There can be
582 * multple clusters per chunk, a single 587 * multiple clusters per chunk, a single
583 * cluster per chunk or a cluster that has 588 * cluster per chunk or a cluster that has
584 * inodes represented from several different 589 * inodes represented from several different
585 * chunks (if blocksize is large). 590 * chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9e..f76c6d7cea21 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
574 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
575 if (error) { 575 if (error) {
576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
577 goto error; 577 goto out_free_log;
578 } 578 }
579 mp->m_log->l_ailp = mp->m_ail; 579 mp->m_log->l_ailp = mp->m_ail;
580 580
@@ -594,20 +594,22 @@ xfs_log_mount(
594 mp->m_flags |= XFS_MOUNT_RDONLY; 594 mp->m_flags |= XFS_MOUNT_RDONLY;
595 if (error) { 595 if (error) {
596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 596 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
597 goto error; 597 goto out_destroy_ail;
598 } 598 }
599 } 599 }
600 600
601 /* Normal transactions can now occur */ 601 /* Normal transactions can now occur */
602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 602 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
603 603
604 /* End mounting message in xfs_log_mount_finish */
605 return 0; 604 return 0;
606error: 605
607 xfs_log_unmount_dealloc(mp); 606out_destroy_ail:
607 xfs_trans_ail_destroy(mp);
608out_free_log:
609 xlog_dealloc_log(mp->m_log);
608out: 610out:
609 return error; 611 return error;
610} /* xfs_log_mount */ 612}
611 613
612/* 614/*
613 * Finish the recovery of the file system. This is separate from 615 * Finish the recovery of the file system. This is separate from
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
633} 635}
634 636
635/* 637/*
636 * Unmount processing for the log.
637 */
638int
639xfs_log_unmount(xfs_mount_t *mp)
640{
641 int error;
642
643 error = xfs_log_unmount_write(mp);
644 xfs_log_unmount_dealloc(mp);
645 return error;
646}
647
648/*
649 * Final log writes as part of unmount. 638 * Final log writes as part of unmount.
650 * 639 *
651 * Mark the filesystem clean as unmount happens. Note that during relocation 640 * Mark the filesystem clean as unmount happens. Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
795 * and deallocate the log as the aild references the log. 784 * and deallocate the log as the aild references the log.
796 */ 785 */
797void 786void
798xfs_log_unmount_dealloc(xfs_mount_t *mp) 787xfs_log_unmount(xfs_mount_t *mp)
799{ 788{
800 xfs_trans_ail_destroy(mp); 789 xfs_trans_ail_destroy(mp);
801 xlog_dealloc_log(mp->m_log); 790 xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
1109/* 1098/*
1110 * Return size of each in-core log record buffer. 1099 * Return size of each in-core log record buffer.
1111 * 1100 *
1112 * All machines get 8 x 32KB buffers by default, unless tuned otherwise. 1101 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1113 * 1102 *
1114 * If the filesystem blocksize is too large, we may need to choose a 1103 * If the filesystem blocksize is too large, we may need to choose a
1115 * larger size since the directory code currently logs entire blocks. 1104 * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1139 } 1128 }
1140 1129
1141 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1130 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1142 /* # headers = size / 32K 1131 /* # headers = size / 32k
1143 * one header holds cycles from 32K of data 1132 * one header holds cycles from 32k of data
1144 */ 1133 */
1145 1134
1146 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; 1135 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1156 goto done; 1145 goto done;
1157 } 1146 }
1158 1147
1159 /* All machines use 32KB buffers by default. */ 1148 /* All machines use 32kB buffers by default. */
1160 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; 1149 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1161 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; 1150 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1162 1151
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1164 log->l_iclog_hsize = BBSIZE; 1153 log->l_iclog_hsize = BBSIZE;
1165 log->l_iclog_heads = 1; 1154 log->l_iclog_heads = 1;
1166 1155
1167 /* 1156done:
1168 * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use 1157 /* are we being asked to make the sizes selected above visible? */
1169 * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
1170 */
1171 if (mp->m_sb.sb_blocksize >= 16*1024) {
1172 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1173 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1174 if (mp->m_logbufs <= 0) {
1175 switch (mp->m_sb.sb_blocksize) {
1176 case 16*1024: /* 16 KB */
1177 log->l_iclog_bufs = 3;
1178 break;
1179 case 32*1024: /* 32 KB */
1180 log->l_iclog_bufs = 4;
1181 break;
1182 case 64*1024: /* 64 KB */
1183 log->l_iclog_bufs = 8;
1184 break;
1185 default:
1186 xlog_panic("XFS: Invalid blocksize");
1187 break;
1188 }
1189 }
1190 }
1191
1192done: /* are we being asked to make the sizes selected above visible? */
1193 if (mp->m_logbufs == 0) 1158 if (mp->m_logbufs == 0)
1194 mp->m_logbufs = log->l_iclog_bufs; 1159 mp->m_logbufs = log->l_iclog_bufs;
1195 if (mp->m_logbsize == 0) 1160 if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3214 */ 3179 */
3215 3180
3216/* 3181/*
3217 * Free a used ticket when it's refcount falls to zero. 3182 * Free a used ticket when its refcount falls to zero.
3218 */ 3183 */
3219void 3184void
3220xfs_log_ticket_put( 3185xfs_log_ticket_put(
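The xfs_log_mount() hunks above swap a catch-all error label for the standard kernel unwind idiom: one label per completed setup step, falling through in reverse order of initialisation, so each failure site names only the most recent step to undo. A schematic of the pattern with hypothetical step names (the real labels unwind xfs_trans_ail_init() and the log allocation):

#include <stdio.h>

/* hypothetical setup/teardown pairs standing in for the patch's
 * xfs_trans_ail_destroy()/xlog_dealloc_log() cleanups */
static int  step_a_init(void)    { return 0; }
static void step_a_destroy(void) { puts("undo a"); }
static int  step_b_init(void)    { return 0; }
static void step_b_destroy(void) { puts("undo b"); }
static int  step_c_init(void)    { return -1; }	/* simulate a failure */

static int mount_like(void)
{
	int error;

	error = step_a_init();
	if (error)
		goto out;		/* nothing to unwind yet */
	error = step_b_init();
	if (error)
		goto out_destroy_a;	/* unwind step a only */
	error = step_c_init();
	if (error)
		goto out_destroy_b;	/* unwind b, then a, in reverse order */
	return 0;

out_destroy_b:
	step_b_destroy();
out_destroy_a:
	step_a_destroy();
out:
	return error;
}

int main(void) { return mount_like() ? 1 : 0; }

The fall-through is the point: jumping to out_destroy_b also runs step_a_destroy(), which is exactly the property the old single error label (unwinding everything regardless of progress) did not have.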
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a3..d0c9baa50b1a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp,
170 int nentries, 170 int nentries,
171 xfs_log_ticket_t ticket, 171 xfs_log_ticket_t ticket,
172 xfs_lsn_t *start_lsn); 172 xfs_lsn_t *start_lsn);
173int xfs_log_unmount(struct xfs_mount *mp);
174int xfs_log_unmount_write(struct xfs_mount *mp); 173int xfs_log_unmount_write(struct xfs_mount *mp);
175void xfs_log_unmount_dealloc(struct xfs_mount *mp); 174void xfs_log_unmount(struct xfs_mount *mp);
176int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 175int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
177int xfs_log_need_covered(struct xfs_mount *mp); 176int xfs_log_need_covered(struct xfs_mount *mp);
178 177
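This header change reflects an API split rather than a plain rename: writing the unmount record and freeing the log structures used to happen inside one xfs_log_unmount() call, and are now sequenced explicitly by the caller (the xfs_unmountfs() hunk further down shows the real sequence). A sketch of the resulting call order, assuming the usual mount pointer:

/* sketch: unmount sequencing after the split */
static void unmount_log_sketch(struct xfs_mount *mp)
{
	xfs_log_unmount_write(mp);	/* write the final "clean" record */
	xfs_log_unmount(mp);		/* then tear down AIL and iclogs
					 * (the renamed _dealloc variant) */
}

One plausible motivation for the split: a caller tearing down after a failure can skip the record write and still reclaim the log.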
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0efb..bcad5f4c1fd1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
359 int ic_size; 359 int ic_size;
360 int ic_offset; 360 int ic_offset;
361 int ic_bwritecnt; 361 int ic_bwritecnt;
362 ushort_t ic_state; 362 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 363 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE 364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace; 365 struct ktrace *ic_trace;
@@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log);
455 455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 456extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 457extern void xlog_put_bp(struct xfs_buf *);
458extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
459 458
460extern kmem_zone_t *xfs_log_ticket_zone; 459extern kmem_zone_t *xfs_log_ticket_zone;
461 460
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 61af610d79b3..7ba450116d4f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
94 xfs_buf_free(bp); 94 xfs_buf_free(bp);
95} 95}
96 96
97STATIC xfs_caddr_t
98xlog_align(
99 xlog_t *log,
100 xfs_daddr_t blk_no,
101 int nbblks,
102 xfs_buf_t *bp)
103{
104 xfs_caddr_t ptr;
105
106 if (!log->l_sectbb_log)
107 return XFS_BUF_PTR(bp);
108
109 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
110 ASSERT(XFS_BUF_SIZE(bp) >=
111 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
112 return ptr;
113}
114
97 115
98/* 116/*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 117 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */ 118 */
101int 119STATIC int
102xlog_bread( 120xlog_bread_noalign(
103 xlog_t *log, 121 xlog_t *log,
104 xfs_daddr_t blk_no, 122 xfs_daddr_t blk_no,
105 int nbblks, 123 int nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
137 return error; 155 return error;
138} 156}
139 157
158STATIC int
159xlog_bread(
160 xlog_t *log,
161 xfs_daddr_t blk_no,
162 int nbblks,
163 xfs_buf_t *bp,
164 xfs_caddr_t *offset)
165{
166 int error;
167
168 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
169 if (error)
170 return error;
171
172 *offset = xlog_align(log, blk_no, nbblks, bp);
173 return 0;
174}
175
140/* 176/*
141 * Write out the buffer at the given block for the given number of blocks. 177 * Write out the buffer at the given block for the given number of blocks.
142 * The buffer is kept locked across the write and is returned locked. 178 * The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
180 return error; 216 return error;
181} 217}
182 218
183STATIC xfs_caddr_t
184xlog_align(
185 xlog_t *log,
186 xfs_daddr_t blk_no,
187 int nbblks,
188 xfs_buf_t *bp)
189{
190 xfs_caddr_t ptr;
191
192 if (!log->l_sectbb_log)
193 return XFS_BUF_PTR(bp);
194
195 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
196 ASSERT(XFS_BUF_SIZE(bp) >=
197 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
198 return ptr;
199}
200
201#ifdef DEBUG 219#ifdef DEBUG
202/* 220/*
203 * dump debug superblock and log record information 221 * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
211 229
212 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
213 for (b = 0; b < 16; b++) 231 for (b = 0; b < 16; b++)
214 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
215 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
216 cmn_err(CE_DEBUG, " log : uuid = "); 234 cmn_err(CE_DEBUG, " log : uuid = ");
217 for (b = 0; b < 16; b++) 235 for (b = 0; b < 16; b++)
218 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
219 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
220} 238}
221#else 239#else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
321 339
322 mid_blk = BLK_AVG(first_blk, *last_blk); 340 mid_blk = BLK_AVG(first_blk, *last_blk);
323 while (mid_blk != first_blk && mid_blk != *last_blk) { 341 while (mid_blk != first_blk && mid_blk != *last_blk) {
324 if ((error = xlog_bread(log, mid_blk, 1, bp))) 342 error = xlog_bread(log, mid_blk, 1, bp, &offset);
343 if (error)
325 return error; 344 return error;
326 offset = xlog_align(log, mid_blk, 1, bp);
327 mid_cycle = xlog_get_cycle(offset); 345 mid_cycle = xlog_get_cycle(offset);
328 if (mid_cycle == cycle) { 346 if (mid_cycle == cycle) {
329 *last_blk = mid_blk; 347 *last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
379 397
380 bcount = min(bufblks, (start_blk + nbblks - i)); 398 bcount = min(bufblks, (start_blk + nbblks - i));
381 399
382 if ((error = xlog_bread(log, i, bcount, bp))) 400 error = xlog_bread(log, i, bcount, bp, &buf);
401 if (error)
383 goto out; 402 goto out;
384 403
385 buf = xlog_align(log, i, bcount, bp);
386 for (j = 0; j < bcount; j++) { 404 for (j = 0; j < bcount; j++) {
387 cycle = xlog_get_cycle(buf); 405 cycle = xlog_get_cycle(buf);
388 if (cycle == stop_on_cycle_no) { 406 if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
436 return ENOMEM; 454 return ENOMEM;
437 smallmem = 1; 455 smallmem = 1;
438 } else { 456 } else {
439 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 457 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
458 if (error)
440 goto out; 459 goto out;
441 offset = xlog_align(log, start_blk, num_blks, bp);
442 offset += ((num_blks - 1) << BBSHIFT); 460 offset += ((num_blks - 1) << BBSHIFT);
443 } 461 }
444 462
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
453 } 471 }
454 472
455 if (smallmem) { 473 if (smallmem) {
456 if ((error = xlog_bread(log, i, 1, bp))) 474 error = xlog_bread(log, i, 1, bp, &offset);
475 if (error)
457 goto out; 476 goto out;
458 offset = xlog_align(log, i, 1, bp);
459 } 477 }
460 478
461 head = (xlog_rec_header_t *)offset; 479 head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
559 bp = xlog_get_bp(log, 1); 577 bp = xlog_get_bp(log, 1);
560 if (!bp) 578 if (!bp)
561 return ENOMEM; 579 return ENOMEM;
562 if ((error = xlog_bread(log, 0, 1, bp))) 580
581 error = xlog_bread(log, 0, 1, bp, &offset);
582 if (error)
563 goto bp_err; 583 goto bp_err;
564 offset = xlog_align(log, 0, 1, bp); 584
565 first_half_cycle = xlog_get_cycle(offset); 585 first_half_cycle = xlog_get_cycle(offset);
566 586
567 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 587 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
568 if ((error = xlog_bread(log, last_blk, 1, bp))) 588 error = xlog_bread(log, last_blk, 1, bp, &offset);
589 if (error)
569 goto bp_err; 590 goto bp_err;
570 offset = xlog_align(log, last_blk, 1, bp); 591
571 last_half_cycle = xlog_get_cycle(offset); 592 last_half_cycle = xlog_get_cycle(offset);
572 ASSERT(last_half_cycle != 0); 593 ASSERT(last_half_cycle != 0);
573 594
@@ -817,9 +838,10 @@ xlog_find_tail(
817 if (!bp) 838 if (!bp)
818 return ENOMEM; 839 return ENOMEM;
819 if (*head_blk == 0) { /* special case */ 840 if (*head_blk == 0) { /* special case */
820 if ((error = xlog_bread(log, 0, 1, bp))) 841 error = xlog_bread(log, 0, 1, bp, &offset);
842 if (error)
821 goto bread_err; 843 goto bread_err;
822 offset = xlog_align(log, 0, 1, bp); 844
823 if (xlog_get_cycle(offset) == 0) { 845 if (xlog_get_cycle(offset) == 0) {
824 *tail_blk = 0; 846 *tail_blk = 0;
825 /* leave all other log inited values alone */ 847 /* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
832 */ 854 */
833 ASSERT(*head_blk < INT_MAX); 855 ASSERT(*head_blk < INT_MAX);
834 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 856 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
835 if ((error = xlog_bread(log, i, 1, bp))) 857 error = xlog_bread(log, i, 1, bp, &offset);
858 if (error)
836 goto bread_err; 859 goto bread_err;
837 offset = xlog_align(log, i, 1, bp); 860
838 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 861 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
839 found = 1; 862 found = 1;
840 break; 863 break;
@@ -848,9 +871,10 @@ xlog_find_tail(
848 */ 871 */
849 if (!found) { 872 if (!found) {
850 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 873 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
851 if ((error = xlog_bread(log, i, 1, bp))) 874 error = xlog_bread(log, i, 1, bp, &offset);
875 if (error)
852 goto bread_err; 876 goto bread_err;
853 offset = xlog_align(log, i, 1, bp); 877
854 if (XLOG_HEADER_MAGIC_NUM == 878 if (XLOG_HEADER_MAGIC_NUM ==
855 be32_to_cpu(*(__be32 *)offset)) { 879 be32_to_cpu(*(__be32 *)offset)) {
856 found = 2; 880 found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
922 if (*head_blk == after_umount_blk && 946 if (*head_blk == after_umount_blk &&
923 be32_to_cpu(rhead->h_num_logops) == 1) { 947 be32_to_cpu(rhead->h_num_logops) == 1) {
924 umount_data_blk = (i + hblks) % log->l_logBBsize; 948 umount_data_blk = (i + hblks) % log->l_logBBsize;
925 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 949 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
950 if (error)
926 goto bread_err; 951 goto bread_err;
927 } 952
928 offset = xlog_align(log, umount_data_blk, 1, bp);
929 op_head = (xlog_op_header_t *)offset; 953 op_head = (xlog_op_header_t *)offset;
930 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 954 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
931 /* 955 /*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
1017 bp = xlog_get_bp(log, 1); 1041 bp = xlog_get_bp(log, 1);
1018 if (!bp) 1042 if (!bp)
1019 return ENOMEM; 1043 return ENOMEM;
1020 if ((error = xlog_bread(log, 0, 1, bp))) 1044 error = xlog_bread(log, 0, 1, bp, &offset);
1045 if (error)
1021 goto bp_err; 1046 goto bp_err;
1022 offset = xlog_align(log, 0, 1, bp); 1047
1023 first_cycle = xlog_get_cycle(offset); 1048 first_cycle = xlog_get_cycle(offset);
1024 if (first_cycle == 0) { /* completely zeroed log */ 1049 if (first_cycle == 0) { /* completely zeroed log */
1025 *blk_no = 0; 1050 *blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
1028 } 1053 }
1029 1054
1030 /* check partially zeroed log */ 1055 /* check partially zeroed log */
1031 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1056 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1057 if (error)
1032 goto bp_err; 1058 goto bp_err;
1033 offset = xlog_align(log, log_bbnum-1, 1, bp); 1059
1034 last_cycle = xlog_get_cycle(offset); 1060 last_cycle = xlog_get_cycle(offset);
1035 if (last_cycle != 0) { /* log completely written to */ 1061 if (last_cycle != 0) { /* log completely written to */
1036 xlog_put_bp(bp); 1062 xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
1152 */ 1178 */
1153 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1179 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1154 if (balign != start_block) { 1180 if (balign != start_block) {
1155 if ((error = xlog_bread(log, start_block, 1, bp))) { 1181 error = xlog_bread_noalign(log, start_block, 1, bp);
1156 xlog_put_bp(bp); 1182 if (error)
1157 return error; 1183 goto out_put_bp;
1158 } 1184
1159 j = start_block - balign; 1185 j = start_block - balign;
1160 } 1186 }
1161 1187
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
1175 balign = BBTOB(ealign - start_block); 1201 balign = BBTOB(ealign - start_block);
1176 error = XFS_BUF_SET_PTR(bp, offset + balign, 1202 error = XFS_BUF_SET_PTR(bp, offset + balign,
1177 BBTOB(sectbb)); 1203 BBTOB(sectbb));
1178 if (!error) 1204 if (error)
1179 error = xlog_bread(log, ealign, sectbb, bp); 1205 break;
1180 if (!error) 1206
1181 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1207 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1208 if (error)
1209 break;
1210
1211 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1182 if (error) 1212 if (error)
1183 break; 1213 break;
1184 } 1214 }
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
1195 start_block += endcount; 1225 start_block += endcount;
1196 j = 0; 1226 j = 0;
1197 } 1227 }
1228
1229 out_put_bp:
1198 xlog_put_bp(bp); 1230 xlog_put_bp(bp);
1199 return error; 1231 return error;
1200} 1232}
@@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans(
2511 } 2543 }
2512 2544
2513write_inode_buffer: 2545write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2546 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2515 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2547 bp->b_mount = mp;
2516 bp->b_mount = mp; 2548 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2517 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2549 xfs_bdwrite(mp, bp);
2518 xfs_bdwrite(mp, bp);
2519 } else {
2520 XFS_BUF_STALE(bp);
2521 error = xfs_bwrite(mp, bp);
2522 }
2523
2524error: 2550error:
2525 if (need_free) 2551 if (need_free)
2526 kmem_free(in_f); 2552 kmem_free(in_f);
@@ -2769,51 +2795,48 @@ xlog_recover_do_trans(
2769 int error = 0; 2795 int error = 0;
2770 xlog_recover_item_t *item, *first_item; 2796 xlog_recover_item_t *item, *first_item;
2771 2797
2772 if ((error = xlog_recover_reorder_trans(trans))) 2798 error = xlog_recover_reorder_trans(trans);
2799 if (error)
2773 return error; 2800 return error;
2801
2774 first_item = item = trans->r_itemq; 2802 first_item = item = trans->r_itemq;
2775 do { 2803 do {
2776 /* 2804 switch (ITEM_TYPE(item)) {
2777 * we don't need to worry about the block number being 2805 case XFS_LI_BUF:
2778 * truncated in > 1 TB buffers because in user-land, 2806 error = xlog_recover_do_buffer_trans(log, item, pass);
2779 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so 2807 break;
2780 * the blknos will get through the user-mode buffer 2808 case XFS_LI_INODE:
2781 * cache properly. The only bad case is o32 kernels 2809 error = xlog_recover_do_inode_trans(log, item, pass);
2782 * where xfs_daddr_t is 32-bits but mount will warn us 2810 break;
2783 * off a > 1 TB filesystem before we get here. 2811 case XFS_LI_EFI:
2784 */ 2812 error = xlog_recover_do_efi_trans(log, item,
2785 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2813 trans->r_lsn, pass);
2786 if ((error = xlog_recover_do_buffer_trans(log, item, 2814 break;
2787 pass))) 2815 case XFS_LI_EFD:
2788 break;
2789 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2790 if ((error = xlog_recover_do_inode_trans(log, item,
2791 pass)))
2792 break;
2793 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2794 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2795 pass)))
2796 break;
2797 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2798 xlog_recover_do_efd_trans(log, item, pass); 2816 xlog_recover_do_efd_trans(log, item, pass);
2799 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2817 error = 0;
2800 if ((error = xlog_recover_do_dquot_trans(log, item, 2818 break;
2801 pass))) 2819 case XFS_LI_DQUOT:
2802 break; 2820 error = xlog_recover_do_dquot_trans(log, item, pass);
2803 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2821 break;
2804 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2822 case XFS_LI_QUOTAOFF:
2805 pass))) 2823 error = xlog_recover_do_quotaoff_trans(log, item,
2806 break; 2824 pass);
2807 } else { 2825 break;
2808 xlog_warn("XFS: xlog_recover_do_trans"); 2826 default:
2827 xlog_warn(
2828 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2809 ASSERT(0); 2829 ASSERT(0);
2810 error = XFS_ERROR(EIO); 2830 error = XFS_ERROR(EIO);
2811 break; 2831 break;
2812 } 2832 }
2833
2834 if (error)
2835 return error;
2813 item = item->ri_next; 2836 item = item->ri_next;
2814 } while (first_item != item); 2837 } while (first_item != item);
2815 2838
2816 return error; 2839 return 0;
2817} 2840}
2818 2841
2819/* 2842/*
@@ -3490,9 +3513,11 @@ xlog_do_recovery_pass(
3490 hbp = xlog_get_bp(log, 1); 3513 hbp = xlog_get_bp(log, 1);
3491 if (!hbp) 3514 if (!hbp)
3492 return ENOMEM; 3515 return ENOMEM;
3493 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3516
3517 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3518 if (error)
3494 goto bread_err1; 3519 goto bread_err1;
3495 offset = xlog_align(log, tail_blk, 1, hbp); 3520
3496 rhead = (xlog_rec_header_t *)offset; 3521 rhead = (xlog_rec_header_t *)offset;
3497 error = xlog_valid_rec_header(log, rhead, tail_blk); 3522 error = xlog_valid_rec_header(log, rhead, tail_blk);
3498 if (error) 3523 if (error)
@@ -3526,9 +3551,10 @@ xlog_do_recovery_pass(
3526 memset(rhash, 0, sizeof(rhash)); 3551 memset(rhash, 0, sizeof(rhash));
3527 if (tail_blk <= head_blk) { 3552 if (tail_blk <= head_blk) {
3528 for (blk_no = tail_blk; blk_no < head_blk; ) { 3553 for (blk_no = tail_blk; blk_no < head_blk; ) {
3529 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3554 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3555 if (error)
3530 goto bread_err2; 3556 goto bread_err2;
3531 offset = xlog_align(log, blk_no, hblks, hbp); 3557
3532 rhead = (xlog_rec_header_t *)offset; 3558 rhead = (xlog_rec_header_t *)offset;
3533 error = xlog_valid_rec_header(log, rhead, blk_no); 3559 error = xlog_valid_rec_header(log, rhead, blk_no);
3534 if (error) 3560 if (error)
@@ -3536,10 +3562,11 @@ xlog_do_recovery_pass(
3536 3562
3537 /* blocks in data section */ 3563 /* blocks in data section */
3538 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3564 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3539 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3565 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3566 &offset);
3540 if (error) 3567 if (error)
3541 goto bread_err2; 3568 goto bread_err2;
3542 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3569
3543 xlog_unpack_data(rhead, offset, log); 3570 xlog_unpack_data(rhead, offset, log);
3544 if ((error = xlog_recover_process_data(log, 3571 if ((error = xlog_recover_process_data(log,
3545 rhash, rhead, offset, pass))) 3572 rhash, rhead, offset, pass)))
@@ -3562,10 +3589,10 @@ xlog_do_recovery_pass(
3562 wrapped_hblks = 0; 3589 wrapped_hblks = 0;
3563 if (blk_no + hblks <= log->l_logBBsize) { 3590 if (blk_no + hblks <= log->l_logBBsize) {
3564 /* Read header in one read */ 3591 /* Read header in one read */
3565 error = xlog_bread(log, blk_no, hblks, hbp); 3592 error = xlog_bread(log, blk_no, hblks, hbp,
3593 &offset);
3566 if (error) 3594 if (error)
3567 goto bread_err2; 3595 goto bread_err2;
3568 offset = xlog_align(log, blk_no, hblks, hbp);
3569 } else { 3596 } else {
3570 /* This LR is split across physical log end */ 3597 /* This LR is split across physical log end */
3571 if (blk_no != log->l_logBBsize) { 3598 if (blk_no != log->l_logBBsize) {
@@ -3573,12 +3600,13 @@ xlog_do_recovery_pass(
3573 ASSERT(blk_no <= INT_MAX); 3600 ASSERT(blk_no <= INT_MAX);
3574 split_hblks = log->l_logBBsize - (int)blk_no; 3601 split_hblks = log->l_logBBsize - (int)blk_no;
3575 ASSERT(split_hblks > 0); 3602 ASSERT(split_hblks > 0);
3576 if ((error = xlog_bread(log, blk_no, 3603 error = xlog_bread(log, blk_no,
3577 split_hblks, hbp))) 3604 split_hblks, hbp,
3605 &offset);
3606 if (error)
3578 goto bread_err2; 3607 goto bread_err2;
3579 offset = xlog_align(log, blk_no,
3580 split_hblks, hbp);
3581 } 3608 }
3609
3582 /* 3610 /*
3583 * Note: this black magic still works with 3611 * Note: this black magic still works with
3584 * large sector sizes (non-512) only because: 3612 * large sector sizes (non-512) only because:
@@ -3596,14 +3624,19 @@ xlog_do_recovery_pass(
3596 error = XFS_BUF_SET_PTR(hbp, 3624 error = XFS_BUF_SET_PTR(hbp,
3597 bufaddr + BBTOB(split_hblks), 3625 bufaddr + BBTOB(split_hblks),
3598 BBTOB(hblks - split_hblks)); 3626 BBTOB(hblks - split_hblks));
3599 if (!error) 3627 if (error)
3600 error = xlog_bread(log, 0, 3628 goto bread_err2;
3601 wrapped_hblks, hbp); 3629
3602 if (!error) 3630 error = xlog_bread_noalign(log, 0,
3603 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3631 wrapped_hblks, hbp);
3632 if (error)
3633 goto bread_err2;
3634
3635 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3604 BBTOB(hblks)); 3636 BBTOB(hblks));
3605 if (error) 3637 if (error)
3606 goto bread_err2; 3638 goto bread_err2;
3639
3607 if (!offset) 3640 if (!offset)
3608 offset = xlog_align(log, 0, 3641 offset = xlog_align(log, 0,
3609 wrapped_hblks, hbp); 3642 wrapped_hblks, hbp);
@@ -3619,10 +3652,10 @@ xlog_do_recovery_pass(
3619 3652
3620 /* Read in data for log record */ 3653 /* Read in data for log record */
3621 if (blk_no + bblks <= log->l_logBBsize) { 3654 if (blk_no + bblks <= log->l_logBBsize) {
3622 error = xlog_bread(log, blk_no, bblks, dbp); 3655 error = xlog_bread(log, blk_no, bblks, dbp,
3656 &offset);
3623 if (error) 3657 if (error)
3624 goto bread_err2; 3658 goto bread_err2;
3625 offset = xlog_align(log, blk_no, bblks, dbp);
3626 } else { 3659 } else {
3627 /* This log record is split across the 3660 /* This log record is split across the
3628 * physical end of log */ 3661 * physical end of log */
@@ -3636,12 +3669,13 @@ xlog_do_recovery_pass(
3636 split_bblks = 3669 split_bblks =
3637 log->l_logBBsize - (int)blk_no; 3670 log->l_logBBsize - (int)blk_no;
3638 ASSERT(split_bblks > 0); 3671 ASSERT(split_bblks > 0);
3639 if ((error = xlog_bread(log, blk_no, 3672 error = xlog_bread(log, blk_no,
3640 split_bblks, dbp))) 3673 split_bblks, dbp,
3674 &offset);
3675 if (error)
3641 goto bread_err2; 3676 goto bread_err2;
3642 offset = xlog_align(log, blk_no,
3643 split_bblks, dbp);
3644 } 3677 }
3678
3645 /* 3679 /*
3646 * Note: this black magic still works with 3680 * Note: this black magic still works with
3647 * large sector sizes (non-512) only because: 3681 * large sector sizes (non-512) only because:
@@ -3658,15 +3692,19 @@ xlog_do_recovery_pass(
3658 error = XFS_BUF_SET_PTR(dbp, 3692 error = XFS_BUF_SET_PTR(dbp,
3659 bufaddr + BBTOB(split_bblks), 3693 bufaddr + BBTOB(split_bblks),
3660 BBTOB(bblks - split_bblks)); 3694 BBTOB(bblks - split_bblks));
3661 if (!error)
3662 error = xlog_bread(log, wrapped_hblks,
3663 bblks - split_bblks,
3664 dbp);
3665 if (!error)
3666 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3667 h_size);
3668 if (error) 3695 if (error)
3669 goto bread_err2; 3696 goto bread_err2;
3697
3698 error = xlog_bread_noalign(log, wrapped_hblks,
3699 bblks - split_bblks,
3700 dbp);
3701 if (error)
3702 goto bread_err2;
3703
3704 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3705 if (error)
3706 goto bread_err2;
3707
3670 if (!offset) 3708 if (!offset)
3671 offset = xlog_align(log, wrapped_hblks, 3709 offset = xlog_align(log, wrapped_hblks,
3672 bblks - split_bblks, dbp); 3710 bblks - split_bblks, dbp);
@@ -3683,17 +3721,21 @@ xlog_do_recovery_pass(
3683 3721
3684 /* read first part of physical log */ 3722 /* read first part of physical log */
3685 while (blk_no < head_blk) { 3723 while (blk_no < head_blk) {
3686 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3724 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3725 if (error)
3687 goto bread_err2; 3726 goto bread_err2;
3688 offset = xlog_align(log, blk_no, hblks, hbp); 3727
3689 rhead = (xlog_rec_header_t *)offset; 3728 rhead = (xlog_rec_header_t *)offset;
3690 error = xlog_valid_rec_header(log, rhead, blk_no); 3729 error = xlog_valid_rec_header(log, rhead, blk_no);
3691 if (error) 3730 if (error)
3692 goto bread_err2; 3731 goto bread_err2;
3732
3693 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3733 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3694 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3734 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3735 &offset);
3736 if (error)
3695 goto bread_err2; 3737 goto bread_err2;
3696 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3738
3697 xlog_unpack_data(rhead, offset, log); 3739 xlog_unpack_data(rhead, offset, log);
3698 if ((error = xlog_recover_process_data(log, rhash, 3740 if ((error = xlog_recover_process_data(log, rhash,
3699 rhead, offset, pass))) 3741 rhead, offset, pass)))
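Most of this file's churn is mechanical: every xlog_bread() + xlog_align() pair collapses into the new xlog_bread(), which hands back the sector-aligned data pointer through an out parameter, while the XFS_BUF_SET_PTR buffer-splicing paths keep the raw xlog_bread_noalign(). A generic userspace sketch of the wrapper shape, with hypothetical types and stubbed I/O so it compiles on its own:

#include <stddef.h>

/* hypothetical stand-ins for xlog_t / xfs_buf_t and the two helpers */
struct log { int sect_log; };
struct buf { char data[512]; };

static int read_blocks(struct log *lg, long blk, int nblks, struct buf *bp)
{
	(void)lg; (void)blk; (void)nblks; (void)bp;
	return 0;			/* pretend the I/O succeeded */
}

static char *align_ptr(struct log *lg, long blk, int nblks, struct buf *bp)
{
	/* the real helper offsets into the buffer for sub-sector reads;
	 * the stub just returns the start of the data */
	(void)lg; (void)blk; (void)nblks;
	return bp->data;
}

/* the combined helper mirroring the new xlog_bread(): read, then hand
 * back the aligned pointer, so callers cannot forget the align step */
static int read_aligned(struct log *lg, long blk, int nblks,
			struct buf *bp, char **offset)
{
	int error = read_blocks(lg, blk, nblks, bp);
	if (error)
		return error;		/* *offset untouched on failure */
	*offset = align_ptr(lg, blk, nblks, bp);
	return 0;
}

int main(void)
{
	struct log lg = { 0 };
	struct buf bp;
	char *offset = NULL;

	if (read_aligned(&lg, 0, 1, &bp, &offset))
		return 1;
	return offset == bp.data ? 0 : 1;
}

Returning the pointer through an out parameter keeps the error return conventional while making it impossible to use the buffer contents before the read has succeeded.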
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86d..b101990df027 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_uuid_mount(xfs_mount_t *);
49STATIC void xfs_unmountfs_wait(xfs_mount_t *); 48STATIC void xfs_unmountfs_wait(xfs_mount_t *);
50 49
51 50
@@ -121,6 +120,84 @@ static const struct {
121 { sizeof(xfs_sb_t), 0 } 120 { sizeof(xfs_sb_t), 0 }
122}; 121};
123 122
123static DEFINE_MUTEX(xfs_uuid_table_mutex);
124static int xfs_uuid_table_size;
125static uuid_t *xfs_uuid_table;
126
127/*
128 * See if the UUID is unique among mounted XFS filesystems.
129 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
130 */
131STATIC int
132xfs_uuid_mount(
133 struct xfs_mount *mp)
134{
135 uuid_t *uuid = &mp->m_sb.sb_uuid;
136 int hole, i;
137
138 if (mp->m_flags & XFS_MOUNT_NOUUID)
139 return 0;
140
141 if (uuid_is_nil(uuid)) {
142 cmn_err(CE_WARN,
143 "XFS: Filesystem %s has nil UUID - can't mount",
144 mp->m_fsname);
145 return XFS_ERROR(EINVAL);
146 }
147
148 mutex_lock(&xfs_uuid_table_mutex);
149 for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
150 if (uuid_is_nil(&xfs_uuid_table[i])) {
151 hole = i;
152 continue;
153 }
154 if (uuid_equal(uuid, &xfs_uuid_table[i]))
155 goto out_duplicate;
156 }
157
158 if (hole < 0) {
159 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
160 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
161 xfs_uuid_table_size * sizeof(*xfs_uuid_table),
162 KM_SLEEP);
163 hole = xfs_uuid_table_size++;
164 }
165 xfs_uuid_table[hole] = *uuid;
166 mutex_unlock(&xfs_uuid_table_mutex);
167
168 return 0;
169
170 out_duplicate:
171 mutex_unlock(&xfs_uuid_table_mutex);
172 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
173 mp->m_fsname);
174 return XFS_ERROR(EINVAL);
175}
176
177STATIC void
178xfs_uuid_unmount(
179 struct xfs_mount *mp)
180{
181 uuid_t *uuid = &mp->m_sb.sb_uuid;
182 int i;
183
184 if (mp->m_flags & XFS_MOUNT_NOUUID)
185 return;
186
187 mutex_lock(&xfs_uuid_table_mutex);
188 for (i = 0; i < xfs_uuid_table_size; i++) {
189 if (uuid_is_nil(&xfs_uuid_table[i]))
190 continue;
191 if (!uuid_equal(uuid, &xfs_uuid_table[i]))
192 continue;
193 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
194 break;
195 }
196 ASSERT(i < xfs_uuid_table_size);
197 mutex_unlock(&xfs_uuid_table_mutex);
198}
199
200
124/* 201/*
125 * Free up the resources associated with a mount structure. Assume that 202 * Free up the resources associated with a mount structure. Assume that
126 * the structure was initially zeroed, so we can tell which fields got 203 * the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
256 return XFS_ERROR(ENOSYS); 333 return XFS_ERROR(ENOSYS);
257 } 334 }
258 335
336 /*
337 * Currently only very few inode sizes are supported.
338 */
339 switch (sbp->sb_inodesize) {
340 case 256:
341 case 512:
342 case 1024:
343 case 2048:
344 break;
345 default:
346 xfs_fs_mount_cmn_err(flags,
347 "inode size of %d bytes not supported",
348 sbp->sb_inodesize);
349 return XFS_ERROR(ENOSYS);
350 }
351
259 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 352 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
260 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 353 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
261 xfs_fs_mount_cmn_err(flags, 354 xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
574 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 667 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
575 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 668 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
576 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 669 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
577 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
578 mp->m_blockmask = sbp->sb_blocksize - 1; 670 mp->m_blockmask = sbp->sb_blocksize - 1;
579 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 671 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
580 mp->m_blockwmask = mp->m_blockwsize - 1; 672 mp->m_blockwmask = mp->m_blockwsize - 1;
581 673
582 /*
583 * Setup for attributes, in case they get created.
584 * This value is for inodes getting attributes for the first time,
585 * the per-inode value is for old attribute values.
586 */
587 ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
588 switch (sbp->sb_inodesize) {
589 case 256:
590 mp->m_attroffset = XFS_LITINO(mp) -
591 XFS_BMDR_SPACE_CALC(MINABTPTRS);
592 break;
593 case 512:
594 case 1024:
595 case 2048:
596 mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
597 break;
598 default:
599 ASSERT(0);
600 }
601 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
602
603 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); 674 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
604 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); 675 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
605 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; 676 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
645 for (index = 0; index < agcount; index++) { 716 for (index = 0; index < agcount; index++) {
646 /* 717 /*
647 * read the agf, then the agi. This gets us 718 * read the agf, then the agi. This gets us
648 * all the inforamtion we need and populates the 719 * all the information we need and populates the
649 * per-ag structures for us. 720 * per-ag structures for us.
650 */ 721 */
651 error = xfs_alloc_pagf_init(mp, NULL, index, 0); 722 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
886} 957}
887 958
888/* 959/*
889 * xfs_mountfs
890 *
891 * This function does the following on an initial mount of a file system: 960 * This function does the following on an initial mount of a file system:
892 * - reads the superblock from disk and init the mount struct 961 * - reads the superblock from disk and init the mount struct
893 * - if we're a 32-bit kernel, do a size check on the superblock 962 * - if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
905 xfs_inode_t *rip; 974 xfs_inode_t *rip;
906 __uint64_t resblks; 975 __uint64_t resblks;
907 uint quotamount, quotaflags; 976 uint quotamount, quotaflags;
908 int uuid_mounted = 0;
909 int error = 0; 977 int error = 0;
910 978
911 xfs_mount_common(mp, sbp); 979 xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
960 */ 1028 */
961 error = xfs_update_alignment(mp); 1029 error = xfs_update_alignment(mp);
962 if (error) 1030 if (error)
963 goto error1; 1031 goto out;
964 1032
965 xfs_alloc_compute_maxlevels(mp); 1033 xfs_alloc_compute_maxlevels(mp);
966 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); 1034 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
971 1039
972 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); 1040 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
973 1041
974 /* 1042 error = xfs_uuid_mount(mp);
975 * XFS uses the uuid from the superblock as the unique 1043 if (error)
976 * identifier for fsid. We can not use the uuid from the volume 1044 goto out;
977 * since a single partition filesystem is identical to a single
978 * partition volume/filesystem.
979 */
980 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
981 if (xfs_uuid_mount(mp)) {
982 error = XFS_ERROR(EINVAL);
983 goto error1;
984 }
985 uuid_mounted=1;
986 }
987 1045
988 /* 1046 /*
989 * Set the minimum read and write sizes 1047 * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
1007 */ 1065 */
1008 error = xfs_check_sizes(mp); 1066 error = xfs_check_sizes(mp);
1009 if (error) 1067 if (error)
1010 goto error1; 1068 goto out_remove_uuid;
1011 1069
1012 /* 1070 /*
1013 * Initialize realtime fields in the mount structure 1071 * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
1015 error = xfs_rtmount_init(mp); 1073 error = xfs_rtmount_init(mp);
1016 if (error) { 1074 if (error) {
1017 cmn_err(CE_WARN, "XFS: RT mount failed"); 1075 cmn_err(CE_WARN, "XFS: RT mount failed");
1018 goto error1; 1076 goto out_remove_uuid;
1019 } 1077 }
1020 1078
1021 /* 1079 /*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
1045 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1103 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
1046 KM_MAYFAIL); 1104 KM_MAYFAIL);
1047 if (!mp->m_perag) 1105 if (!mp->m_perag)
1048 goto error1; 1106 goto out_remove_uuid;
1049 1107
1050 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); 1108 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1051 1109
1110 if (!sbp->sb_logblocks) {
1111 cmn_err(CE_WARN, "XFS: no log defined");
1112 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1113 error = XFS_ERROR(EFSCORRUPTED);
1114 goto out_free_perag;
1115 }
1116
1052 /* 1117 /*
1053 * log's mount-time initialization. Perform 1st part recovery if needed 1118 * log's mount-time initialization. Perform 1st part recovery if needed
1054 */ 1119 */
1055 if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ 1120 error = xfs_log_mount(mp, mp->m_logdev_targp,
1056 error = xfs_log_mount(mp, mp->m_logdev_targp, 1121 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1057 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1122 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1058 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1123 if (error) {
1059 if (error) { 1124 cmn_err(CE_WARN, "XFS: log mount failed");
1060 cmn_err(CE_WARN, "XFS: log mount failed"); 1125 goto out_free_perag;
1061 goto error2;
1062 }
1063 } else { /* No log has been defined */
1064 cmn_err(CE_WARN, "XFS: no log defined");
1065 XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
1066 error = XFS_ERROR(EFSCORRUPTED);
1067 goto error2;
1068 } 1126 }
1069 1127
1070 /* 1128 /*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
1086 * If we are currently making the filesystem, the initialisation will 1144 * If we are currently making the filesystem, the initialisation will
1087 * fail as the perag data is in an undefined state. 1145 * fail as the perag data is in an undefined state.
1088 */ 1146 */
1089
1090 if (xfs_sb_version_haslazysbcount(&mp->m_sb) && 1147 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1091 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && 1148 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1092 !mp->m_sb.sb_inprogress) { 1149 !mp->m_sb.sb_inprogress) {
1093 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 1150 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1094 if (error) { 1151 if (error)
1095 goto error2; 1152 goto out_free_perag;
1096 }
1097 } 1153 }
1154
1098 /* 1155 /*
1099 * Get and sanity-check the root inode. 1156 * Get and sanity-check the root inode.
1100 * Save the pointer to it in the mount structure. 1157 * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
1102 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); 1159 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
1103 if (error) { 1160 if (error) {
1104 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1161 cmn_err(CE_WARN, "XFS: failed to read root inode");
1105 goto error3; 1162 goto out_log_dealloc;
1106 } 1163 }
1107 1164
1108 ASSERT(rip != NULL); 1165 ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
1116 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1173 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1117 mp); 1174 mp);
1118 error = XFS_ERROR(EFSCORRUPTED); 1175 error = XFS_ERROR(EFSCORRUPTED);
1119 goto error4; 1176 goto out_rele_rip;
1120 } 1177 }
1121 mp->m_rootip = rip; /* save it */ 1178 mp->m_rootip = rip; /* save it */
1122 1179
@@ -1131,7 +1188,7 @@ xfs_mountfs(
1131 * Free up the root inode. 1188 * Free up the root inode.
1132 */ 1189 */
1133 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1190 cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1134 goto error4; 1191 goto out_rele_rip;
1135 } 1192 }
1136 1193
1137 /* 1194 /*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
1143 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1200 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1144 if (error) { 1201 if (error) {
1145 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1202 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1146 goto error4; 1203 goto out_rtunmount;
1147 } 1204 }
1148 } 1205 }
1149 1206
@@ -1152,7 +1209,7 @@ xfs_mountfs(
1152 */ 1209 */
1153 error = XFS_QM_INIT(mp, &quotamount, &quotaflags); 1210 error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
1154 if (error) 1211 if (error)
1155 goto error4; 1212 goto out_rtunmount;
1156 1213
1157 /* 1214 /*
1158 * Finish recovering the file system. This part needed to be 1215 * Finish recovering the file system. This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
1162 error = xfs_log_mount_finish(mp); 1219 error = xfs_log_mount_finish(mp);
1163 if (error) { 1220 if (error) {
1164 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1221 cmn_err(CE_WARN, "XFS: log mount finish failed");
1165 goto error4; 1222 goto out_rtunmount;
1166 } 1223 }
1167 1224
1168 /* 1225 /*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
1170 */ 1227 */
1171 error = XFS_QM_MOUNT(mp, quotamount, quotaflags); 1228 error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
1172 if (error) 1229 if (error)
1173 goto error4; 1230 goto out_rtunmount;
1174 1231
1175 /* 1232 /*
1176 * Now we are mounted, reserve a small amount of unused space for 1233 * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
1194 1251
1195 return 0; 1252 return 0;
1196 1253
1197 error4: 1254 out_rtunmount:
1198 /* 1255 xfs_rtunmount_inodes(mp);
1199 * Free up the root inode. 1256 out_rele_rip:
1200 */
1201 IRELE(rip); 1257 IRELE(rip);
1202 error3: 1258 out_log_dealloc:
1203 xfs_log_unmount_dealloc(mp); 1259 xfs_log_unmount(mp);
1204 error2: 1260 out_free_perag:
1205 xfs_free_perag(mp); 1261 xfs_free_perag(mp);
1206 error1: 1262 out_remove_uuid:
1207 if (uuid_mounted) 1263 xfs_uuid_unmount(mp);
1208 uuid_table_remove(&mp->m_sb.sb_uuid); 1264 out:
1209 return error; 1265 return error;
1210} 1266}
1211 1267
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
1226 */ 1282 */
1227 XFS_QM_UNMOUNT(mp); 1283 XFS_QM_UNMOUNT(mp);
1228 1284
1229 if (mp->m_rbmip) 1285 xfs_rtunmount_inodes(mp);
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1233 IRELE(mp->m_rootip); 1286 IRELE(mp->m_rootip);
1234 1287
1235 /* 1288 /*
1236 * We can potentially deadlock here if we have an inode cluster 1289 * We can potentially deadlock here if we have an inode cluster
1237 * that has been freed has it's buffer still pinned in memory because 1290 * that has been freed has its buffer still pinned in memory because
1238 * the transaction is still sitting in a iclog. The stale inodes 1291 * the transaction is still sitting in a iclog. The stale inodes
1239 * on that buffer will have their flush locks held until the 1292 * on that buffer will have their flush locks held until the
1240 * transaction hits the disk and the callbacks run. the inode 1293 * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
1266 * Unreserve any blocks we have so that when we unmount we don't account 1319 * Unreserve any blocks we have so that when we unmount we don't account
1267 * the reserved free space as used. This is really only necessary for 1320 * the reserved free space as used. This is really only necessary for
1268 * lazy superblock counting because it trusts the incore superblock 1321 * lazy superblock counting because it trusts the incore superblock
1269 * counters to be aboslutely correct on clean unmount. 1322 * counters to be absolutely correct on clean unmount.
1270 * 1323 *
1271 * We don't bother correcting this elsewhere for lazy superblock 1324 * We don't bother correcting this elsewhere for lazy superblock
1272 * counting because on mount of an unclean filesystem we reconstruct the 1325 * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
1288 "Freespace may not be correct on next mount."); 1341 "Freespace may not be correct on next mount.");
1289 xfs_unmountfs_writesb(mp); 1342 xfs_unmountfs_writesb(mp);
1290 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1343 xfs_unmountfs_wait(mp); /* wait for async bufs */
1291 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1344 xfs_log_unmount_write(mp);
1292 1345 xfs_log_unmount(mp);
1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1346 xfs_uuid_unmount(mp);
1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1295 1347
1296#if defined(DEBUG) 1348#if defined(DEBUG)
1297 xfs_errortag_clearall(mp, 0); 1349 xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
1793} 1845}
1794 1846
1795/* 1847/*
1796 * See if the UUID is unique among mounted XFS filesystems.
1797 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1798 */
1799STATIC int
1800xfs_uuid_mount(
1801 xfs_mount_t *mp)
1802{
1803 if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1804 cmn_err(CE_WARN,
1805 "XFS: Filesystem %s has nil UUID - can't mount",
1806 mp->m_fsname);
1807 return -1;
1808 }
1809 if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1810 cmn_err(CE_WARN,
1811 "XFS: Filesystem %s has duplicate UUID - can't mount",
1812 mp->m_fsname);
1813 return -1;
1814 }
1815 return 0;
1816}
1817
1818/*
1819 * Used to log changes to the superblock unit and width fields which could 1848 * Used to log changes to the superblock unit and width fields which could
1820 * be altered by the mount options, as well as any potential sb_features2 1849 * be altered by the mount options, as well as any potential sb_features2
1821 * fixup. Only the first superblock is updated. 1850 * fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
1868 * we disable the per-cpu counter and go through the slow path. 1897 * we disable the per-cpu counter and go through the slow path.
1869 * 1898 *
1870 * The slow path is the current xfs_mod_incore_sb() function. This means that 1899 * The slow path is the current xfs_mod_incore_sb() function. This means that
1871 * when we disable a per-cpu counter, we need to drain it's resources back to 1900 * when we disable a per-cpu counter, we need to drain its resources back to
1872 * the global superblock. We do this after disabling the counter to prevent 1901 * the global superblock. We do this after disabling the counter to prevent
1873 * more threads from queueing up on the counter. 1902 * more threads from queueing up on the counter.
1874 * 1903 *
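The new UUID tracking above replaces the old uuid_table_insert()/uuid_table_remove() helpers with an open-coded, mutex-protected table: nil entries mark reusable holes, lookups are linear, and the array grows by one slot only when no hole is free. A reduced userspace sketch of the same hole-reuse scheme, with pthread_mutex and memcmp standing in for the kernel primitives (illustrative only):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

typedef struct { unsigned char b[16]; } my_uuid_t;

static pthread_mutex_t	table_mutex = PTHREAD_MUTEX_INITIALIZER;
static int		table_size;
static my_uuid_t	*table;

static int is_nil(const my_uuid_t *u)
{
	static const my_uuid_t nil;	/* all zeroes */
	return memcmp(u, &nil, sizeof(*u)) == 0;
}

/* returns 0 on success, -1 for a duplicate (caller rejects the mount) */
static int uuid_register(const my_uuid_t *uuid)
{
	int i, hole = -1;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (is_nil(&table[i])) {	/* remember a reusable hole */
			hole = i;
			continue;
		}
		if (memcmp(uuid, &table[i], sizeof(*uuid)) == 0) {
			pthread_mutex_unlock(&table_mutex);
			return -1;		/* already registered */
		}
	}
	if (hole < 0) {				/* no hole: grow by one slot */
		/* the kernel uses kmem_realloc(..., KM_SLEEP), which cannot
		 * fail; userspace realloc can, so check it here */
		my_uuid_t *tmp = realloc(table,
				(table_size + 1) * sizeof(*table));
		if (!tmp) {
			pthread_mutex_unlock(&table_mutex);
			return -1;
		}
		table = tmp;
		hole = table_size++;
	}
	table[hole] = *uuid;
	pthread_mutex_unlock(&table_mutex);
	return 0;
}

/* unregister: zero the slot so it becomes a hole for the next mount */
static void uuid_unregister(const my_uuid_t *uuid)
{
	int i;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (is_nil(&table[i]))
			continue;
		if (memcmp(uuid, &table[i], sizeof(*uuid)) == 0) {
			memset(&table[i], 0, sizeof(table[i]));
			break;
		}
	}
	pthread_mutex_unlock(&table_mutex);
}

Reusing holes means repeated mount/unmount cycles never grow the table beyond the peak number of concurrently mounted filesystems.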
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bdb..7af44adffc8f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
136 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
140 139
141typedef struct xfs_qmops { 140typedef struct xfs_qmops {
142 xfs_qminit_t xfs_qminit; 141 xfs_qminit_t xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
154 xfs_dqvopchownresv_t xfs_dqvopchownresv; 153 xfs_dqvopchownresv_t xfs_dqvopchownresv;
155 xfs_dqstatvfs_t xfs_dqstatvfs; 154 xfs_dqstatvfs_t xfs_dqstatvfs;
156 xfs_dqsync_t xfs_dqsync; 155 xfs_dqsync_t xfs_dqsync;
157 xfs_quotactl_t xfs_quotactl;
158 struct xfs_dqtrxops *xfs_dqtrxops; 156 struct xfs_dqtrxops *xfs_dqtrxops;
159} xfs_qmops_t; 157} xfs_qmops_t;
160 158
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
188 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) 186 (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
189#define XFS_QM_DQSYNC(mp, flags) \ 187#define XFS_QM_DQSYNC(mp, flags) \
190 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) 188 (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
191#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
192 (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
193 189
194#ifdef HAVE_PERCPU_SB 190#ifdef HAVE_PERCPU_SB
195 191
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
273 uint m_inobt_mnr[2]; /* min inobt btree records */ 269 uint m_inobt_mnr[2]; /* min inobt btree records */
274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 270 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 271 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 272 uint m_in_maxlevels; /* max inobt btree levels. */
277 struct xfs_perag *m_perag; /* per-ag accounting info */ 273 struct xfs_perag *m_perag; /* per-ag accounting info */
278 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 274 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
279 struct mutex m_growlock; /* growfs mutex */ 275 struct mutex m_growlock; /* growfs mutex */
280 int m_fixedfsid[2]; /* unchanged for life of FS */ 276 int m_fixedfsid[2]; /* unchanged for life of FS */
281 uint m_dmevmask; /* DMI events for this FS */ 277 uint m_dmevmask; /* DMI events for this FS */
282 __uint64_t m_flags; /* global mount flags */ 278 __uint64_t m_flags; /* global mount flags */
283 uint m_attroffset; /* inode attribute offset */
284 uint m_dir_node_ents; /* #entries in a dir danode */ 279 uint m_dir_node_ents; /* #entries in a dir danode */
285 uint m_attr_node_ents; /* #entries in attr danode */ 280 uint m_attr_node_ents; /* #entries in attr danode */
286 int m_ialloc_inos; /* inodes in inode allocation */ 281 int m_ialloc_inos; /* inodes in inode allocation */
287 int m_ialloc_blks; /* blocks in inode allocation */ 282 int m_ialloc_blks; /* blocks in inode allocation */
288 int m_litino; /* size of inode union area */
289 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 283 int m_inoalign_mask;/* mask sb_inoalignmt if used */
290 uint m_qflags; /* quota status flags */ 284 uint m_qflags; /* quota status flags */
291 xfs_trans_reservations_t m_reservations;/* precomputed res values */ 285 xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
293 __uint64_t m_maxioffset; /* maximum inode offset */ 287 __uint64_t m_maxioffset; /* maximum inode offset */
294 __uint64_t m_resblks; /* total reserved blocks */ 288 __uint64_t m_resblks; /* total reserved blocks */
295 __uint64_t m_resblks_avail;/* available reserved blocks */ 289 __uint64_t m_resblks_avail;/* available reserved blocks */
296#if XFS_BIG_INUMS
297 xfs_ino_t m_inoadd; /* add value for ino64_offset */
298#endif
299 int m_dalign; /* stripe unit */ 290 int m_dalign; /* stripe unit */
300 int m_swidth; /* stripe width */ 291 int m_swidth; /* stripe width */
301 int m_sinoalign; /* stripe unit inode alignment */ 292 int m_sinoalign; /* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
337#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 328#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
338 must be synchronous except 329 must be synchronous except
339 for space allocations */ 330 for space allocations */
340#define XFS_MOUNT_INO64 (1ULL << 1)
341#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 331#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
342#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 332#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
343#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 333#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
389 * Synchronous read and write sizes. This should be 379 * Synchronous read and write sizes. This should be
390 * better for NFSv2 wsync filesystems. 380 * better for NFSv2 wsync filesystems.
391 */ 381 */
392#define XFS_WSYNC_READIO_LOG 15 /* 32K */ 382#define XFS_WSYNC_READIO_LOG 15 /* 32k */
393#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ 383#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
394 384
395/* 385/*
396 * Allow large block sizes to be reported to userspace programs if the 386 * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
500 int64_t msb_delta; /* Change to make to specified field */ 490 int64_t msb_delta; /* Change to make to specified field */
501} xfs_mod_sb_t; 491} xfs_mod_sb_t;
502 492
503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
505
506extern int xfs_log_sbcount(xfs_mount_t *, uint); 493extern int xfs_log_sbcount(xfs_mount_t *, uint);
507extern int xfs_mountfs(xfs_mount_t *mp); 494extern int xfs_mountfs(xfs_mount_t *mp);
508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 495extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
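The XFS_WSYNC_READIO_LOG/XFS_WSYNC_WRITEIO_LOG values touched above are base-2 exponents rather than byte counts; a hypothetical one-line helper makes the encoding explicit:

	/* The _LOG macros store log2 of the I/O size in bytes. */
	static inline unsigned int log2_size_to_bytes(unsigned int log)
	{
		return 1U << log;	/* 15 -> 32768 (32k), 14 -> 16384 (16k) */
	}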
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520a..e101790ea8e7 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, 126 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, 127 .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval,
128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr, 128 .xfs_dqsync = (xfs_dqsync_t) fs_noerr,
129 .xfs_quotactl = (xfs_quotactl_t) fs_nosys,
130}; 129};
131 130
132int 131int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa155..f5d1202dde25 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_QUOTA_H__ 18#ifndef __XFS_QUOTA_H__
19#define __XFS_QUOTA_H__ 19#define __XFS_QUOTA_H__
20 20
21struct xfs_trans;
22
21/* 23/*
22 * The ondisk form of a dquot structure. 24 * The ondisk form of a dquot structure.
23 */ 25 */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
185 * to a single function. None of these XFS_QMOPT_* flags are meant to have 187 * to a single function. None of these XFS_QMOPT_* flags are meant to have
186 * persistent values (ie. their values can and will change between versions) 188 * persistent values (ie. their values can and will change between versions)
187 */ 189 */
188#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
189#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ 190#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
190#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ 191#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
191#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ 192#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec05..385f6dceba5d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
2288 return 0; 2288 return 0;
2289} 2289}
2290 2290
2291void
2292xfs_rtunmount_inodes(
2293 struct xfs_mount *mp)
2294{
2295 if (mp->m_rbmip)
2296 IRELE(mp->m_rbmip);
2297 if (mp->m_rsumip)
2298 IRELE(mp->m_rsumip);
2299}
2300
2291/* 2301/*
2292 * Pick an extent for allocation at the start of a new realtime file. 2302 * Pick an extent for allocation at the start of a new realtime file.
2293 * Use the sequence number stored in the atime field of the bitmap inode. 2303 * Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd215716..b2d67adb6a08 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
23 23
24/* Min and max rt extent sizes, specified in bytes */ 24/* Min and max rt extent sizes, specified in bytes */
25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ 25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ 26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ 27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
28 28
29/* 29/*
30 * Constants for bit manipulations. 30 * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
108int /* error */ 108int /* error */
109xfs_rtmount_init( 109xfs_rtmount_init(
110 struct xfs_mount *mp); /* file system mount structure */ 110 struct xfs_mount *mp); /* file system mount structure */
111void
112xfs_rtunmount_inodes(
113 struct xfs_mount *mp);
111 114
112/* 115/*
113 * Get the bitmap and summary inodes into the mount structure 116 * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
146# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
147# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
148# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m)
149#endif /* CONFIG_XFS_RT */ 153#endif /* CONFIG_XFS_RT */
150 154
151#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
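Note the new !CONFIG_XFS_RT stub: defining xfs_rtunmount_inodes(m) to expand to nothing lets callers stay free of #ifdef CONFIG_XFS_RT. A hypothetical caller, compiling either way:

	/* Hypothetical caller, for illustration only. */
	static void example_unmount(struct xfs_mount *mp)
	{
		/* Real call with CONFIG_XFS_RT=y; expands to nothing otherwise. */
		xfs_rtunmount_inodes(mp);
	}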
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79f..775249a54f6f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
292 * In a write transaction we can allocate a maximum of 2 292 * In a write transaction we can allocate a maximum of 2
293 * extents. This gives: 293 * extents. This gives:
294 * the inode getting the new extents: inode size 294 * the inode getting the new extents: inode size
295 * the inode\'s bmap btree: max depth * block size 295 * the inode's bmap btree: max depth * block size
296 * the agfs of the ags from which the extents are allocated: 2 * sector 296 * the agfs of the ags from which the extents are allocated: 2 * sector
297 * the superblock free block counter: sector size 297 * the superblock free block counter: sector size
298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size 298 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
321/* 321/*
322 * In truncating a file we free up to two extents at once. We can modify: 322 * In truncating a file we free up to two extents at once. We can modify:
323 * the inode being truncated: inode size 323 * the inode being truncated: inode size
324 * the inode\'s bmap btree: (max depth + 1) * block size 324 * the inode's bmap btree: (max depth + 1) * block size
325 * And the bmap_finish transaction can free the blocks and bmap blocks: 325 * And the bmap_finish transaction can free the blocks and bmap blocks:
326 * the agf for each of the ags: 4 * sector size 326 * the agf for each of the ags: 4 * sector size
327 * the agfl for each of the ags: 4 * sector size 327 * the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ 343 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
344 (128 * 5) + \ 344 (128 * 5) + \
345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 345 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 346 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
347 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 347 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
348 348
349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 349#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
431 * the new inode: inode size 431 * the new inode: inode size
432 * the inode btree entry: 1 block 432 * the inode btree entry: 1 block
433 * the directory btree: (max depth + v2) * dir block size 433 * the directory btree: (max depth + v2) * dir block size
434 * the directory inode\'s bmap btree: (max depth + v2) * block size 434 * the directory inode's bmap btree: (max depth + v2) * block size
435 * the blocks for the symlink: 1 KB 435 * the blocks for the symlink: 1 kB
436 * Or in the first xact we allocate some inodes giving: 436 * Or in the first xact we allocate some inodes giving:
437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 437 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 438 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ 449 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
450 (2 * (mp)->m_sb.sb_sectsize + \ 450 (2 * (mp)->m_sb.sb_sectsize + \
451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ 451 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
452 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ 452 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 453 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 454 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
455 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 455 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
456 456
457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 457#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
463 * the inode btree entry: block size 463 * the inode btree entry: block size
464 * the superblock for the nlink flag: sector size 464 * the superblock for the nlink flag: sector size
465 * the directory btree: (max depth + v2) * dir block size 465 * the directory btree: (max depth + v2) * dir block size
466 * the directory inode\'s bmap btree: (max depth + v2) * block size 466 * the directory inode's bmap btree: (max depth + v2) * block size
467 * Or in the first xact we allocate some inodes giving: 467 * Or in the first xact we allocate some inodes giving:
468 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 468 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
469 * the superblock for the nlink flag: sector size 469 * the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
481 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ 481 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
482 (3 * (mp)->m_sb.sb_sectsize + \ 482 (3 * (mp)->m_sb.sb_sectsize + \
483 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ 483 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
484 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ 484 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
485 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 485 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
486 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 486 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
487 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) 487 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
488 488
489#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 489#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
513 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ 513 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
514 (128 * 5) + \ 514 (128 * 5) + \
515 XFS_ALLOCFREE_LOG_RES(mp, 1) + \ 515 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
516 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ 516 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
517 XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) 517 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
518 518
519 519
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
637/* 637/*
638 * Removing the attribute fork of a file 638 * Removing the attribute fork of a file
639 * the inode being truncated: inode size 639 * the inode being truncated: inode size
640 * the inode\'s bmap btree: max depth * block size 640 * the inode's bmap btree: max depth * block size
641 * And the bmap_finish transaction can free the blocks and bmap blocks: 641 * And the bmap_finish transaction can free the blocks and bmap blocks:
642 * the agf for each of the ags: 4 * sector size 642 * the agf for each of the ags: 4 * sector size
643 * the agfl for each of the ags: 4 * sector size 643 * the agfl for each of the ags: 4 * sector size
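To make the reservation arithmetic above concrete, here is a worked sketch of the inode-allocation leg under assumed geometry (4k blocks, 512-byte sectors, XFS_IALLOC_BLOCKS == 16, m_in_maxlevels == 3). The numbers are illustrative, and the XFS_ALLOCFREE_LOG_RES/COUNT terms are omitted for brevity:

	/* Worked example, assumed geometry -- not derived from this patch. */
	static unsigned int example_ialloc_log_leg(void)
	{
		unsigned int sectsize = 512, blocksize = 4096;
		unsigned int ialloc_blocks = 16, in_maxlevels = 3;

		return 2 * sectsize			/* agi + agf:        1024 */
		     + ialloc_blocks * blocksize	/* inode chunk:     65536 */
		     + in_maxlevels * blocksize		/* inobt split path: 12288 */
		     + 128 * (2 + ialloc_blocks + in_maxlevels);
							/* log item headers:  2688 */
	}					/* total: 81536 bytes, ~80k */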
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8bed..f31271c30de9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
79 * the push is run asynchronously in a separate thread, so we return the tail 79 * the push is run asynchronously in a separate thread, so we return the tail
80 * of the log right now instead of the tail after the push. This means we will 80 * of the log right now instead of the tail after the push. This means we will
81 * either continue right away, or we will sleep waiting on the async thread to 81 * either continue right away, or we will sleep waiting on the async thread to
82 * do it's work. 82 * do its work.
83 * 83 *
84 * We do this unlocked - we only need to know whether there is anything in the 84 * We do this unlocked - we only need to know whether there is anything in the
85 * AIL at the time we are called. We don't need to access the contents of 85 * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
160/* 160/*
161 * Now that the traversal is complete, we need to remove the cursor 161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded 162 * from the list of traversing cursors. Avoid removing the embedded
163 * push cursor, but use the fact it is alway present to make the 163 * push cursor, but use the fact it is always present to make the
164 * list deletion simple. 164 * list deletion simple.
165 */ 165 */
166void 166void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f4..eb3fc57f9eef 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has it's own ailp */ 25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h" 26#include "xfs_bit.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bdd..7d2c920dfb9c 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
47#define XFS_DIRREMOVE_SPACE_RES(mp) \ 47#define XFS_DIRREMOVE_SPACE_RES(mp) \
48 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) 48 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
49#define XFS_IALLOC_SPACE_RES(mp) \ 49#define XFS_IALLOC_SPACE_RES(mp) \
50 (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) 50 (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
51 51
52/* 52/*
53 * Space reservation values for various transactions. 53 * Space reservation values for various transactions.
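The space-side formula mirrors the log-side one (see the sketch after the xfs_trans.h hunk above): blocks for the inode chunk itself plus worst-case inobt growth. With the same assumed numbers:

	/* Assumed: XFS_IALLOC_BLOCKS == 16, m_in_maxlevels == 3 (illustrative). */
	unsigned int ialloc_space_res = 16 + 3 - 1;	/* == 18 blocks */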
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1b..d725428c9df6 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
21#ifdef __KERNEL__ 21#ifdef __KERNEL__
22 22
23/* 23/*
24 * POSIX Extensions
25 */
26typedef unsigned char uchar_t;
27typedef unsigned short ushort_t;
28typedef unsigned int uint_t;
29typedef unsigned long ulong_t;
30
31/*
32 * Additional type declarations for XFS 24 * Additional type declarations for XFS
33 */ 25 */
34typedef signed char __int8_t; 26typedef signed char __int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03ed..79b9e5ea5359 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
374 374
375 /* 375 /*
376 * Follow the normal truncate locking protocol. Since we 376 * Follow the normal truncate locking protocol. Since we
377 * hold the inode in the transaction, we know that it's number 377 * hold the inode in the transaction, we know that its number
378 * of references will stay constant. 378 * of references will stay constant.
379 */ 379 */
380 xfs_ilock(ip, XFS_ILOCK_EXCL); 380 xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5f..7394c7af5de5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
1136 * If the inode is already free, then there can be nothing 1136 * If the inode is already free, then there can be nothing
1137 * to clean up here. 1137 * to clean up here.
1138 */ 1138 */
1139 if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { 1139 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
1140 ASSERT(ip->i_df.if_real_bytes == 0); 1140 ASSERT(ip->i_df.if_real_bytes == 0);
1141 ASSERT(ip->i_df.if_broot_bytes == 0); 1141 ASSERT(ip->i_df.if_broot_bytes == 0);
1142 return VN_INACTIVE_CACHE; 1142 return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
1387 xfs_inode_t **ipp, 1387 xfs_inode_t **ipp,
1388 cred_t *credp) 1388 cred_t *credp)
1389{ 1389{
1390 xfs_mount_t *mp = dp->i_mount; 1390 int is_dir = S_ISDIR(mode);
1391 xfs_inode_t *ip; 1391 struct xfs_mount *mp = dp->i_mount;
1392 xfs_trans_t *tp; 1392 struct xfs_inode *ip = NULL;
1393 struct xfs_trans *tp = NULL;
1393 int error; 1394 int error;
1394 xfs_bmap_free_t free_list; 1395 xfs_bmap_free_t free_list;
1395 xfs_fsblock_t first_block; 1396 xfs_fsblock_t first_block;
1396 boolean_t unlock_dp_on_error = B_FALSE; 1397 boolean_t unlock_dp_on_error = B_FALSE;
1397 int dm_event_sent = 0;
1398 uint cancel_flags; 1398 uint cancel_flags;
1399 int committed; 1399 int committed;
1400 xfs_prid_t prid; 1400 xfs_prid_t prid;
1401 struct xfs_dquot *udqp, *gdqp; 1401 struct xfs_dquot *udqp = NULL;
1402 struct xfs_dquot *gdqp = NULL;
1402 uint resblks; 1403 uint resblks;
1404 uint log_res;
1405 uint log_count;
1403 1406
1404 ASSERT(!*ipp);
1405 xfs_itrace_entry(dp); 1407 xfs_itrace_entry(dp);
1406 1408
1409 if (XFS_FORCED_SHUTDOWN(mp))
1410 return XFS_ERROR(EIO);
1411
1407 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { 1412 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1408 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, 1413 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1409 dp, DM_RIGHT_NULL, NULL, 1414 dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
1412 1417
1413 if (error) 1418 if (error)
1414 return error; 1419 return error;
1415 dm_event_sent = 1;
1416 } 1420 }
1417 1421
1418 if (XFS_FORCED_SHUTDOWN(mp))
1419 return XFS_ERROR(EIO);
1420
1421 /* Return through std_return after this point. */
1422
1423 udqp = gdqp = NULL;
1424 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1422 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1425 prid = dp->i_d.di_projid; 1423 prid = dp->i_d.di_projid;
1426 else 1424 else
1427 prid = (xfs_prid_t)dfltprid; 1425 prid = dfltprid;
1428 1426
1429 /* 1427 /*
1430 * Make sure that we have allocated dquot(s) on disk. 1428 * Make sure that we have allocated dquot(s) on disk.
1431 */ 1429 */
1432 error = XFS_QM_DQVOPALLOC(mp, dp, 1430 error = XFS_QM_DQVOPALLOC(mp, dp,
1433 current_fsuid(), current_fsgid(), prid, 1431 current_fsuid(), current_fsgid(), prid,
1434 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); 1432 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1435 if (error) 1433 if (error)
1436 goto std_return; 1434 goto std_return;
1437 1435
1438 ip = NULL; 1436 if (is_dir) {
1437 rdev = 0;
1438 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1439 log_res = XFS_MKDIR_LOG_RES(mp);
1440 log_count = XFS_MKDIR_LOG_COUNT;
1441 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1442 } else {
1443 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1444 log_res = XFS_CREATE_LOG_RES(mp);
1445 log_count = XFS_CREATE_LOG_COUNT;
1446 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1447 }
1439 1448
1440 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1441 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 1449 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1442 resblks = XFS_CREATE_SPACE_RES(mp, name->len); 1450
1443 /* 1451 /*
1444 * Initially assume that the file does not exist and 1452 * Initially assume that the file does not exist and
1445 * reserve the resources for that case. If that is not 1453 * reserve the resources for that case. If that is not
1446 * the case we'll drop the one we have and get a more 1454 * the case we'll drop the one we have and get a more
1447 * appropriate transaction later. 1455 * appropriate transaction later.
1448 */ 1456 */
1449 error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, 1457 error = xfs_trans_reserve(tp, resblks, log_res, 0,
1450 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); 1458 XFS_TRANS_PERM_LOG_RES, log_count);
1451 if (error == ENOSPC) { 1459 if (error == ENOSPC) {
1452 resblks = 0; 1460 resblks = 0;
1453 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, 1461 error = xfs_trans_reserve(tp, 0, log_res, 0,
1454 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); 1462 XFS_TRANS_PERM_LOG_RES, log_count);
1455 } 1463 }
1456 if (error) { 1464 if (error) {
1457 cancel_flags = 0; 1465 cancel_flags = 0;
1458 goto error_return; 1466 goto out_trans_cancel;
1459 } 1467 }
1460 1468
1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1469 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1462 unlock_dp_on_error = B_TRUE; 1470 unlock_dp_on_error = B_TRUE;
1463 1471
1464 xfs_bmap_init(&free_list, &first_block); 1472 /*
1473 * Check for directory link count overflow.
1474 */
1475 if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
1476 error = XFS_ERROR(EMLINK);
1477 goto out_trans_cancel;
1478 }
1465 1479
1466 ASSERT(ip == NULL); 1480 xfs_bmap_init(&free_list, &first_block);
1467 1481
1468 /* 1482 /*
1469 * Reserve disk quota and the inode. 1483 * Reserve disk quota and the inode.
1470 */ 1484 */
1471 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); 1485 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1472 if (error) 1486 if (error)
1473 goto error_return; 1487 goto out_trans_cancel;
1474 1488
1475 error = xfs_dir_canenter(tp, dp, name, resblks); 1489 error = xfs_dir_canenter(tp, dp, name, resblks);
1476 if (error) 1490 if (error)
1477 goto error_return; 1491 goto out_trans_cancel;
1478 error = xfs_dir_ialloc(&tp, dp, mode, 1, 1492
1479 rdev, credp, prid, resblks > 0, 1493 /*
1480 &ip, &committed); 1494 * A newly created regular or special file just has one directory
 1495 * entry pointing to it, but a directory also has the "." entry
1496 * pointing to itself.
1497 */
1498 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
1499 prid, resblks > 0, &ip, &committed);
1481 if (error) { 1500 if (error) {
1482 if (error == ENOSPC) 1501 if (error == ENOSPC)
1483 goto error_return; 1502 goto out_trans_cancel;
1484 goto abort_return; 1503 goto out_trans_abort;
1485 } 1504 }
1486 xfs_itrace_ref(ip);
1487 1505
1488 /* 1506 /*
1489 * At this point, we've gotten a newly allocated inode. 1507 * At this point, we've gotten a newly allocated inode.
1490 * It is locked (and joined to the transaction). 1508 * It is locked (and joined to the transaction).
1491 */ 1509 */
1492 1510 xfs_itrace_ref(ip);
1493 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1511 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1494 1512
1495 /* 1513 /*
@@ -1508,19 +1526,28 @@ xfs_create(
1508 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1526 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1509 if (error) { 1527 if (error) {
1510 ASSERT(error != ENOSPC); 1528 ASSERT(error != ENOSPC);
1511 goto abort_return; 1529 goto out_trans_abort;
1512 } 1530 }
1513 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1531 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1514 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1515 1533
1534 if (is_dir) {
1535 error = xfs_dir_init(tp, ip, dp);
1536 if (error)
1537 goto out_bmap_cancel;
1538
1539 error = xfs_bumplink(tp, dp);
1540 if (error)
1541 goto out_bmap_cancel;
1542 }
1543
1516 /* 1544 /*
1517 * If this is a synchronous mount, make sure that the 1545 * If this is a synchronous mount, make sure that the
1518 * create transaction goes to disk before returning to 1546 * create transaction goes to disk before returning to
1519 * the user. 1547 * the user.
1520 */ 1548 */
1521 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 1549 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1522 xfs_trans_set_sync(tp); 1550 xfs_trans_set_sync(tp);
1523 }
1524 1551
1525 /* 1552 /*
1526 * Attach the dquot(s) to the inodes and modify them incore. 1553 * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
1537 IHOLD(ip); 1564 IHOLD(ip);
1538 1565
1539 error = xfs_bmap_finish(&tp, &free_list, &committed); 1566 error = xfs_bmap_finish(&tp, &free_list, &committed);
1540 if (error) { 1567 if (error)
1541 xfs_bmap_cancel(&free_list); 1568 goto out_abort_rele;
1542 goto abort_rele;
1543 }
1544 1569
1545 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1570 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1546 if (error) { 1571 if (error) {
1547 IRELE(ip); 1572 IRELE(ip);
1548 tp = NULL; 1573 goto out_dqrele;
1549 goto error_return;
1550 } 1574 }
1551 1575
1552 XFS_QM_DQRELE(mp, udqp); 1576 XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
1555 *ipp = ip; 1579 *ipp = ip;
1556 1580
1557 /* Fallthrough to std_return with error = 0 */ 1581 /* Fallthrough to std_return with error = 0 */
1558 1582 std_return:
1559std_return: 1583 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1560 if ((*ipp || (error != 0 && dm_event_sent != 0)) && 1584 XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
1561 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { 1585 ip, DM_RIGHT_NULL, name->name, NULL, mode,
1562 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, 1586 error, 0);
1563 dp, DM_RIGHT_NULL,
1564 *ipp ? ip : NULL,
1565 DM_RIGHT_NULL, name->name, NULL,
1566 mode, error, 0);
1567 } 1587 }
1588
1568 return error; 1589 return error;
1569 1590
1570 abort_return: 1591 out_bmap_cancel:
1592 xfs_bmap_cancel(&free_list);
1593 out_trans_abort:
1571 cancel_flags |= XFS_TRANS_ABORT; 1594 cancel_flags |= XFS_TRANS_ABORT;
1572 /* FALLTHROUGH */ 1595 out_trans_cancel:
1573 1596 xfs_trans_cancel(tp, cancel_flags);
1574 error_return: 1597 out_dqrele:
1575 if (tp != NULL)
1576 xfs_trans_cancel(tp, cancel_flags);
1577
1578 XFS_QM_DQRELE(mp, udqp); 1598 XFS_QM_DQRELE(mp, udqp);
1579 XFS_QM_DQRELE(mp, gdqp); 1599 XFS_QM_DQRELE(mp, gdqp);
1580 1600
@@ -1583,20 +1603,18 @@ std_return:
1583 1603
1584 goto std_return; 1604 goto std_return;
1585 1605
1586 abort_rele: 1606 out_abort_rele:
1587 /* 1607 /*
1588 * Wait until after the current transaction is aborted to 1608 * Wait until after the current transaction is aborted to
1589 * release the inode. This prevents recursive transactions 1609 * release the inode. This prevents recursive transactions
1590 * and deadlocks from xfs_inactive. 1610 * and deadlocks from xfs_inactive.
1591 */ 1611 */
1612 xfs_bmap_cancel(&free_list);
1592 cancel_flags |= XFS_TRANS_ABORT; 1613 cancel_flags |= XFS_TRANS_ABORT;
1593 xfs_trans_cancel(tp, cancel_flags); 1614 xfs_trans_cancel(tp, cancel_flags);
1594 IRELE(ip); 1615 IRELE(ip);
1595 1616 unlock_dp_on_error = B_FALSE;
1596 XFS_QM_DQRELE(mp, udqp); 1617 goto out_dqrele;
1597 XFS_QM_DQRELE(mp, gdqp);
1598
1599 goto std_return;
1600} 1618}
1601 1619
1602#ifdef DEBUG 1620#ifdef DEBUG
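The rewritten function is easier to follow as a condensed sketch. This paraphrases the hunk above; declarations, DMAPI events and the error-label unwinding are elided, so it is not a drop-in replacement:

	int is_dir = S_ISDIR(mode);

	/* 1. Pick the reservation profile up front instead of per function. */
	if (is_dir) {
		resblks   = XFS_MKDIR_SPACE_RES(mp, name->len);
		log_res   = XFS_MKDIR_LOG_RES(mp);
		log_count = XFS_MKDIR_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	} else {
		resblks   = XFS_CREATE_SPACE_RES(mp, name->len);
		log_res   = XFS_CREATE_LOG_RES(mp);
		log_count = XFS_CREATE_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	}

	/* 2. A directory is born with nlink 2: "." plus the parent's entry. */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
			       prid, resblks > 0, &ip, &committed);

	/* 3. Only directories need "." / ".." setup and a parent nlink bump. */
	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (!error)
			error = xfs_bumplink(tp, dp);
	}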
@@ -2004,8 +2022,10 @@ xfs_link(
2004 /* Return through std_return after this point. */ 2022 /* Return through std_return after this point. */
2005 2023
2006 error = XFS_QM_DQATTACH(mp, sip, 0); 2024 error = XFS_QM_DQATTACH(mp, sip, 0);
2007 if (!error && sip != tdp) 2025 if (error)
2008 error = XFS_QM_DQATTACH(mp, tdp, 0); 2026 goto std_return;
2027
2028 error = XFS_QM_DQATTACH(mp, tdp, 0);
2009 if (error) 2029 if (error)
2010 goto std_return; 2030 goto std_return;
2011 2031
@@ -2110,209 +2130,6 @@ std_return:
2110 goto std_return; 2130 goto std_return;
2111} 2131}
2112 2132
2113
2114int
2115xfs_mkdir(
2116 xfs_inode_t *dp,
2117 struct xfs_name *dir_name,
2118 mode_t mode,
2119 xfs_inode_t **ipp,
2120 cred_t *credp)
2121{
2122 xfs_mount_t *mp = dp->i_mount;
2123 xfs_inode_t *cdp; /* inode of created dir */
2124 xfs_trans_t *tp;
2125 int cancel_flags;
2126 int error;
2127 int committed;
2128 xfs_bmap_free_t free_list;
2129 xfs_fsblock_t first_block;
2130 boolean_t unlock_dp_on_error = B_FALSE;
2131 boolean_t created = B_FALSE;
2132 int dm_event_sent = 0;
2133 xfs_prid_t prid;
2134 struct xfs_dquot *udqp, *gdqp;
2135 uint resblks;
2136
2137 if (XFS_FORCED_SHUTDOWN(mp))
2138 return XFS_ERROR(EIO);
2139
2140 tp = NULL;
2141
2142 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2143 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2144 dp, DM_RIGHT_NULL, NULL,
2145 DM_RIGHT_NULL, dir_name->name, NULL,
2146 mode, 0, 0);
2147 if (error)
2148 return error;
2149 dm_event_sent = 1;
2150 }
2151
2152 /* Return through std_return after this point. */
2153
2154 xfs_itrace_entry(dp);
2155
2156 mp = dp->i_mount;
2157 udqp = gdqp = NULL;
2158 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2159 prid = dp->i_d.di_projid;
2160 else
2161 prid = (xfs_prid_t)dfltprid;
2162
2163 /*
2164 * Make sure that we have allocated dquot(s) on disk.
2165 */
2166 error = XFS_QM_DQVOPALLOC(mp, dp,
2167 current_fsuid(), current_fsgid(), prid,
2168 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2169 if (error)
2170 goto std_return;
2171
2172 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2173 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2174 resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2175 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2176 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2177 if (error == ENOSPC) {
2178 resblks = 0;
2179 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2180 XFS_TRANS_PERM_LOG_RES,
2181 XFS_MKDIR_LOG_COUNT);
2182 }
2183 if (error) {
2184 cancel_flags = 0;
2185 goto error_return;
2186 }
2187
2188 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2189 unlock_dp_on_error = B_TRUE;
2190
2191 /*
2192 * Check for directory link count overflow.
2193 */
2194 if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2195 error = XFS_ERROR(EMLINK);
2196 goto error_return;
2197 }
2198
2199 /*
2200 * Reserve disk quota and the inode.
2201 */
2202 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2203 if (error)
2204 goto error_return;
2205
2206 error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2207 if (error)
2208 goto error_return;
2209 /*
2210 * create the directory inode.
2211 */
2212 error = xfs_dir_ialloc(&tp, dp, mode, 2,
2213 0, credp, prid, resblks > 0,
2214 &cdp, NULL);
2215 if (error) {
2216 if (error == ENOSPC)
2217 goto error_return;
2218 goto abort_return;
2219 }
2220 xfs_itrace_ref(cdp);
2221
2222 /*
2223 * Now we add the directory inode to the transaction.
2224 * We waited until now since xfs_dir_ialloc might start
2225 * a new transaction. Had we joined the transaction
2226 * earlier, the locks might have gotten released. An error
2227 * from here on will result in the transaction cancel
2228 * unlocking dp so don't do it explicitly in the error path.
2229 */
2230 IHOLD(dp);
2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2232 unlock_dp_on_error = B_FALSE;
2233
2234 xfs_bmap_init(&free_list, &first_block);
2235
2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2237 &first_block, &free_list, resblks ?
2238 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2239 if (error) {
2240 ASSERT(error != ENOSPC);
2241 goto error1;
2242 }
2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2244
2245 error = xfs_dir_init(tp, cdp, dp);
2246 if (error)
2247 goto error2;
2248
2249 error = xfs_bumplink(tp, dp);
2250 if (error)
2251 goto error2;
2252
2253 created = B_TRUE;
2254
2255 *ipp = cdp;
2256 IHOLD(cdp);
2257
2258 /*
2259 * Attach the dquots to the new inode and modify the icount incore.
2260 */
2261 XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2262
2263 /*
2264 * If this is a synchronous mount, make sure that the
2265 * mkdir transaction goes to disk before returning to
2266 * the user.
2267 */
2268 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2269 xfs_trans_set_sync(tp);
2270 }
2271
2272 error = xfs_bmap_finish(&tp, &free_list, &committed);
2273 if (error) {
2274 IRELE(cdp);
2275 goto error2;
2276 }
2277
2278 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2279 XFS_QM_DQRELE(mp, udqp);
2280 XFS_QM_DQRELE(mp, gdqp);
2281 if (error) {
2282 IRELE(cdp);
2283 }
2284
2285 /* Fall through to std_return with error = 0 or errno from
2286 * xfs_trans_commit. */
2287
2288std_return:
2289 if ((created || (error != 0 && dm_event_sent != 0)) &&
2290 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2291 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2292 dp, DM_RIGHT_NULL,
2293 created ? cdp : NULL,
2294 DM_RIGHT_NULL,
2295 dir_name->name, NULL,
2296 mode, error, 0);
2297 }
2298 return error;
2299
2300 error2:
2301 error1:
2302 xfs_bmap_cancel(&free_list);
2303 abort_return:
2304 cancel_flags |= XFS_TRANS_ABORT;
2305 error_return:
2306 xfs_trans_cancel(tp, cancel_flags);
2307 XFS_QM_DQRELE(mp, udqp);
2308 XFS_QM_DQRELE(mp, gdqp);
2309
2310 if (unlock_dp_on_error)
2311 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2312
2313 goto std_return;
2314}
2315
2316int 2133int
2317xfs_symlink( 2134xfs_symlink(
2318 xfs_inode_t *dp, 2135 xfs_inode_t *dp,
@@ -2587,51 +2404,6 @@ std_return:
2587} 2404}
2588 2405
2589int 2406int
2590xfs_inode_flush(
2591 xfs_inode_t *ip,
2592 int flags)
2593{
2594 xfs_mount_t *mp = ip->i_mount;
2595 int error = 0;
2596
2597 if (XFS_FORCED_SHUTDOWN(mp))
2598 return XFS_ERROR(EIO);
2599
2600 /*
2601 * Bypass inodes which have already been cleaned by
2602 * the inode flush clustering code inside xfs_iflush
2603 */
2604 if (xfs_inode_clean(ip))
2605 return 0;
2606
2607 /*
2608 * We make this non-blocking if the inode is contended,
2609 * return EAGAIN to indicate to the caller that they
2610 * did not succeed. This prevents the flush path from
2611 * blocking on inodes inside another operation right
2612 * now, they get caught later by xfs_sync.
2613 */
2614 if (flags & FLUSH_SYNC) {
2615 xfs_ilock(ip, XFS_ILOCK_SHARED);
2616 xfs_iflock(ip);
2617 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
2618 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
2619 xfs_iunlock(ip, XFS_ILOCK_SHARED);
2620 return EAGAIN;
2621 }
2622 } else {
2623 return EAGAIN;
2624 }
2625
2626 error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
2627 : XFS_IFLUSH_ASYNC_NOBLOCK);
2628 xfs_iunlock(ip, XFS_ILOCK_SHARED);
2629
2630 return error;
2631}
2632
2633
2634int
2635xfs_set_dmattrs( 2407xfs_set_dmattrs(
2636 xfs_inode_t *ip, 2408 xfs_inode_t *ip,
2637 u_int evmask, 2409 u_int evmask,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
2676 ASSERT(!VN_MAPPED(VFS_I(ip))); 2448 ASSERT(!VN_MAPPED(VFS_I(ip)));
2677 2449
2678 /* bad inode, get out here ASAP */ 2450 /* bad inode, get out here ASAP */
2679 if (VN_BAD(VFS_I(ip))) { 2451 if (is_bad_inode(VFS_I(ip))) {
2680 xfs_ireclaim(ip); 2452 xfs_ireclaim(ip);
2681 return 0; 2453 return 0;
2682 } 2454 }
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
3090 2862
3091 /* 2863 /*
3092 * Need to zero the stuff we're not freeing, on disk. 2864 * Need to zero the stuff we're not freeing, on disk.
3093 * If its a realtime file & can't use unwritten extents then we 2865 * If it's a realtime file & can't use unwritten extents then we
3094 * actually need to zero the extent edges. Otherwise xfs_bunmapi 2866 * actually need to zero the extent edges. Otherwise xfs_bunmapi
3095 * will take care of it for us. 2867 * will take care of it for us.
3096 */ 2868 */
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b4..04373c6c61ff 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip); 31 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
33 struct xfs_name *target_name); 33 struct xfs_name *target_name);
34int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
35 mode_t mode, struct xfs_inode **ipp, cred_t *credp);
36int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
37 xfs_off_t *offset, filldir_t filldir); 35 xfs_off_t *offset, filldir_t filldir);
38int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
39 const char *target_path, mode_t mode, struct xfs_inode **ipp, 37 const char *target_path, mode_t mode, struct xfs_inode **ipp,
40 cred_t *credp); 38 cred_t *credp);
41int xfs_inode_flush(struct xfs_inode *ip, int flags);
42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
43int xfs_reclaim(struct xfs_inode *ip); 40int xfs_reclaim(struct xfs_inode *ip);
44int xfs_change_file_space(struct xfs_inode *ip, int cmd, 41int xfs_change_file_space(struct xfs_inode *ip, int cmd,