Merge branch 'x86/cpu' into x86/xsave

Conflicts: arch/x86/kernel/cpu/feature_names.c include/asm-x86/cpufeature.h
author: H. Peter Anvin <hpa@zytor.com> 2008-09-04 12:04:45 -0400
committer: H. Peter Anvin <hpa@zytor.com> 2008-09-04 12:04:45 -0400
commit: fe47784ba5cbb6b713c013e046859946789b45e4 (patch)
tree: 6384958d55e29be0d2eb8ae78fa437c10636d8d6 /fs
parent: 83b8e28b14d63db928cb39e5c5ed2a548246bd71 (diff)
parent: af2e1f276ff08f17192411ea3b71c13a758dfe12 (diff)
107 files changed, 1828 insertions, 1991 deletions
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 56372ecf169..dfc0197905c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        /* Stash our initial stack pointer into the mm structure */
        current->mm->start_stack = (unsigned long )sp;
-        
+#ifdef FLAT_PLAT_INIT
+        FLAT_PLAT_INIT(regs);
+#endif
        DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
                (int)regs, (int)start_addr, (int)current->mm->start_stack);
        
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 756205314c2..8d7e88e02e0 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        if (bprm->misc_bang)
                goto _ret;
-        bprm->misc_bang = 1;
        /* to keep locking time low, we copy the interpreter string */
        read_lock(&entries_lock);
        fmt = check_file(bprm);
@@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        if (retval < 0)
                goto _error;
+        bprm->misc_bang = 1;
        retval = search_binary_handler (bprm, regs);
        if (retval < 0)
                goto _error;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8da4ee761b..25ecbd5b040 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -175,6 +175,8 @@ out_no_root:
        if (inode)
                iput(inode);
+        cifs_umount(sb, cifs_sb);
 out_mount_failed:
        if (cifs_sb) {
 #ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 28a22092d45..848286861c3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -649,6 +649,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
                inode->i_fop = &simple_dir_operations;
                inode->i_uid = cifs_sb->mnt_uid;
                inode->i_gid = cifs_sb->mnt_gid;
+        } else if (rc) {
                _FreeXid(xid);
                iget_failed(inode);
                return ERR_PTR(rc);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c3b618c15b..f40423eb1a1 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);
 static int cramfs_iget5_test(struct inode *inode, void *opaque)
 {
        struct cramfs_inode *cramfs_inode = opaque;
+        return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
-        if (inode->i_ino != CRAMINO(cramfs_inode))
-                return 0; /* does not match */
-        if (inode->i_ino != 1)
-                return 1;
-        /* all empty directories, char, block, pipe, and sock, share inode #1 */
-        if ((inode->i_mode != cramfs_inode->mode) ||
-            (inode->i_gid != cramfs_inode->gid) ||
-            (inode->i_uid != cramfs_inode->uid))
-                return 0; /* does not match */
-        if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
-            (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
-                return 0; /* does not match */
-        return 1; /* matches */
 }
 static int cramfs_iget5_set(struct inode *inode, void *opaque)
 {
-        static struct timespec zerotime;
        struct cramfs_inode *cramfs_inode = opaque;
-        inode->i_mode = cramfs_inode->mode;
-        inode->i_uid = cramfs_inode->uid;
-        inode->i_size = cramfs_inode->size;
-        inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-        inode->i_gid = cramfs_inode->gid;
-        /* Struct copy intentional */
-        inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
        inode->i_ino = CRAMINO(cramfs_inode);
-        /* inode->i_nlink is left 1 - arguably wrong for directories,
-           but it's the best we can do without reading the directory
-           contents.  1 yields the right result in GNU find, even
-           without -noleaf option. */
-        if (S_ISREG(inode->i_mode)) {
-                inode->i_fop = &generic_ro_fops;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &cramfs_dir_inode_operations;
-                inode->i_fop = &cramfs_directory_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
-                inode->i_op = &page_symlink_inode_operations;
-                inode->i_data.a_ops = &cramfs_aops;
-        } else {
-                inode->i_size = 0;
-                inode->i_blocks = 0;
-                init_special_inode(inode, inode->i_mode,
-                        old_decode_dev(cramfs_inode->size));
-        }
        return 0;
 }
@@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
        struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
                                            cramfs_iget5_test, cramfs_iget5_set,
                                            cramfs_inode);
+        static struct timespec zerotime;
        if (inode && (inode->i_state & I_NEW)) {
+                inode->i_mode = cramfs_inode->mode;
+                inode->i_uid = cramfs_inode->uid;
+                inode->i_size = cramfs_inode->size;
+                inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+                inode->i_gid = cramfs_inode->gid;
+                /* Struct copy intentional */
+                inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+                /* inode->i_nlink is left 1 - arguably wrong for directories,
+                   but it's the best we can do without reading the directory
+                   contents.  1 yields the right result in GNU find, even
+                   without -noleaf option. */
+                if (S_ISREG(inode->i_mode)) {
+                        inode->i_fop = &generic_ro_fops;
+                        inode->i_data.a_ops = &cramfs_aops;
+                } else if (S_ISDIR(inode->i_mode)) {
+                        inode->i_op = &cramfs_dir_inode_operations;
+                        inode->i_fop = &cramfs_directory_operations;
+                } else if (S_ISLNK(inode->i_mode)) {
+                        inode->i_op = &page_symlink_inode_operations;
+                        inode->i_data.a_ops = &cramfs_aops;
+                } else {
+                        inode->i_size = 0;
+                        inode->i_blocks = 0;
+                        init_special_inode(inode, inode->i_mode,
+                                old_decode_dev(cramfs_inode->size));
+                }
                unlock_new_inode(inode);
        }
        return inode;
 }
+static void cramfs_drop_inode(struct inode *inode)
+{
+        if (inode->i_ino == 1)
+                generic_delete_inode(inode);
+        else
+                generic_drop_inode(inode);
+}
 /*
 * We have our own block cache: don't fill up the buffer cache
 * with the rom-image, because the way the filesystem is set
@@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {
        .put_super      = cramfs_put_super,
        .remount_fs     = cramfs_remount,
        .statfs         = cramfs_statfs,
+        .drop_inode     = cramfs_drop_inode,
 };
 static int cramfs_get_sb(struct file_system_type *fs_type,
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c4e7d721bd8..89d2fb7b991 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -30,16 +30,16 @@
 static struct config_group *space_list;
 static struct config_group *comm_list;
-static struct comm *local_comm;
+static struct dlm_comm *local_comm;
-struct clusters;
+struct dlm_clusters;
-struct cluster;
+struct dlm_cluster;
-struct spaces;
+struct dlm_spaces;
-struct space;
+struct dlm_space;
-struct comms;
+struct dlm_comms;
-struct comm;
+struct dlm_comm;
-struct nodes;
+struct dlm_nodes;
-struct node;
+struct dlm_node;
 static struct config_group *make_cluster(struct config_group *, const char *);
 static void drop_cluster(struct config_group *, struct config_item *);
@@ -68,17 +68,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
                          const char *buf, size_t len);
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
-static ssize_t comm_local_read(struct comm *cm, char *buf);
+                                size_t len);
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
-static ssize_t node_nodeid_read(struct node *nd, char *buf);
+                                size_t len);
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
-static ssize_t node_weight_read(struct node *nd, char *buf);
+                                size_t len);
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
-struct cluster {
+                                size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+                                size_t len);
+struct dlm_cluster {
        struct config_group group;
        unsigned int cl_tcp_port;
        unsigned int cl_buffer_size;
@@ -109,11 +114,11 @@ enum {
 struct cluster_attribute {
        struct configfs_attribute attr;
-        ssize_t (*show)(struct cluster *, char *);
+        ssize_t (*show)(struct dlm_cluster *, char *);
-        ssize_t (*store)(struct cluster *, const char *, size_t);
+        ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
 };
-static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
                           int *info_field, int check_zero,
                           const char *buf, size_t len)
 {
@@ -134,12 +139,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 }
 #define CLUSTER_ATTR(name, check_zero)                                        \
-static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
 {                                                                             \
        return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
                           check_zero, buf, len);                             \
 }                                                                             \
-static ssize_t name##_read(struct cluster *cl, char *buf)                     \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf)                 \
 {                                                                             \
        return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
 }                                                                             \
@@ -181,8 +186,8 @@ enum {
 struct comm_attribute {
        struct configfs_attribute attr;
-        ssize_t (*show)(struct comm *, char *);
+        ssize_t (*show)(struct dlm_comm *, char *);
-        ssize_t (*store)(struct comm *, const char *, size_t);
+        ssize_t (*store)(struct dlm_comm *, const char *, size_t);
 };
 static struct comm_attribute comm_attr_nodeid = {
@@ -222,8 +227,8 @@ enum {
 struct node_attribute {
        struct configfs_attribute attr;
-        ssize_t (*show)(struct node *, char *);
+        ssize_t (*show)(struct dlm_node *, char *);
-        ssize_t (*store)(struct node *, const char *, size_t);
+        ssize_t (*store)(struct dlm_node *, const char *, size_t);
 };
 static struct node_attribute node_attr_nodeid = {
@@ -248,26 +253,26 @@ static struct configfs_attribute *node_attrs[] = {
        NULL,
 };
-struct clusters {
+struct dlm_clusters {
        struct configfs_subsystem subsys;
 };
-struct spaces {
+struct dlm_spaces {
        struct config_group ss_group;
 };
-struct space {
+struct dlm_space {
        struct config_group group;
        struct list_head members;
        struct mutex members_lock;
        int members_count;
 };
-struct comms {
+struct dlm_comms {
        struct config_group cs_group;
 };
-struct comm {
+struct dlm_comm {
        struct config_item item;
        int nodeid;
        int local;
@@ -275,11 +280,11 @@ struct comm {
        struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
-struct nodes {
+struct dlm_nodes {
        struct config_group ns_group;
 };
-struct node {
+struct dlm_node {
        struct config_item item;
        struct list_head list; /* space->members */
        int nodeid;
@@ -372,38 +377,40 @@ static struct config_item_type node_type = {
        .ct_owner = THIS_MODULE,
 };
-static struct cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *to_cluster(struct config_item *i)
 {
-        return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+        return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+                   NULL;
 }
-static struct space *to_space(struct config_item *i)
+static struct dlm_space *to_space(struct config_item *i)
 {
-        return i ? container_of(to_config_group(i), struct space, group) : NULL;
+        return i ? container_of(to_config_group(i), struct dlm_space, group) :
+                   NULL;
 }
-static struct comm *to_comm(struct config_item *i)
+static struct dlm_comm *to_comm(struct config_item *i)
 {
-        return i ? container_of(i, struct comm, item) : NULL;
+        return i ? container_of(i, struct dlm_comm, item) : NULL;
 }
-static struct node *to_node(struct config_item *i)
+static struct dlm_node *to_node(struct config_item *i)
 {
-        return i ? container_of(i, struct node, item) : NULL;
+        return i ? container_of(i, struct dlm_node, item) : NULL;
 }
 static struct config_group *make_cluster(struct config_group *g,
                                         const char *name)
 {
-        struct cluster *cl = NULL;
+        struct dlm_cluster *cl = NULL;
-        struct spaces *sps = NULL;
+        struct dlm_spaces *sps = NULL;
-        struct comms *cms = NULL;
+        struct dlm_comms *cms = NULL;
        void *gps = NULL;
-        cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+        cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
        gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-        sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+        sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
-        cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+        cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
        if (!cl || !gps || !sps || !cms)
                goto fail;
@@ -443,7 +450,7 @@ static struct config_group *make_cluster(struct config_group *g,
 static void drop_cluster(struct config_group *g, struct config_item *i)
 {
-        struct cluster *cl = to_cluster(i);
+        struct dlm_cluster *cl = to_cluster(i);
        struct config_item *tmp;
        int j;
@@ -461,20 +468,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
 static void release_cluster(struct config_item *i)
 {
-        struct cluster *cl = to_cluster(i);
+        struct dlm_cluster *cl = to_cluster(i);
        kfree(cl->group.default_groups);
        kfree(cl);
 }
 static struct config_group *make_space(struct config_group *g, const char *name)
 {
-        struct space *sp = NULL;
+        struct dlm_space *sp = NULL;
-        struct nodes *nds = NULL;
+        struct dlm_nodes *nds = NULL;
        void *gps = NULL;
-        sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+        sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
        gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-        nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+        nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
        if (!sp || !gps || !nds)
                goto fail;
@@ -500,7 +507,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 static void drop_space(struct config_group *g, struct config_item *i)
 {
-        struct space *sp = to_space(i);
+        struct dlm_space *sp = to_space(i);
        struct config_item *tmp;
        int j;
@@ -517,16 +524,16 @@ static void drop_space(struct config_group *g, struct config_item *i)
 static void release_space(struct config_item *i)
 {
-        struct space *sp = to_space(i);
+        struct dlm_space *sp = to_space(i);
        kfree(sp->group.default_groups);
        kfree(sp);
 }
 static struct config_item *make_comm(struct config_group *g, const char *name)
 {
-        struct comm *cm;
+        struct dlm_comm *cm;
-        cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+        cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
        if (!cm)
                return ERR_PTR(-ENOMEM);
@@ -539,7 +546,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 static void drop_comm(struct config_group *g, struct config_item *i)
 {
-        struct comm *cm = to_comm(i);
+        struct dlm_comm *cm = to_comm(i);
        if (local_comm == cm)
                local_comm = NULL;
        dlm_lowcomms_close(cm->nodeid);
@@ -550,16 +557,16 @@ static void drop_comm(struct config_group *g, struct config_item *i)
 static void release_comm(struct config_item *i)
 {
-        struct comm *cm = to_comm(i);
+        struct dlm_comm *cm = to_comm(i);
        kfree(cm);
 }
 static struct config_item *make_node(struct config_group *g, const char *name)
 {
-        struct space *sp = to_space(g->cg_item.ci_parent);
+        struct dlm_space *sp = to_space(g->cg_item.ci_parent);
-        struct node *nd;
+        struct dlm_node *nd;
-        nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+        nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
        if (!nd)
                return ERR_PTR(-ENOMEM);
@@ -578,8 +585,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 static void drop_node(struct config_group *g, struct config_item *i)
 {
-        struct space *sp = to_space(g->cg_item.ci_parent);
+        struct dlm_space *sp = to_space(g->cg_item.ci_parent);
-        struct node *nd = to_node(i);
+        struct dlm_node *nd = to_node(i);
        mutex_lock(&sp->members_lock);
        list_del(&nd->list);
@@ -591,11 +598,11 @@ static void drop_node(struct config_group *g, struct config_item *i)
 static void release_node(struct config_item *i)
 {
-        struct node *nd = to_node(i);
+        struct dlm_node *nd = to_node(i);
        kfree(nd);
 }
-static struct clusters clusters_root = {
+static struct dlm_clusters clusters_root = {
        .subsys = {
                .su_group = {
                        .cg_item = {
@@ -625,7 +632,7 @@ void dlm_config_exit(void)
 static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
                            char *buf)
 {
-        struct cluster *cl = to_cluster(i);
+        struct dlm_cluster *cl = to_cluster(i);
        struct cluster_attribute *cla =
                        container_of(a, struct cluster_attribute, attr);
        return cla->show ? cla->show(cl, buf) : 0;
@@ -635,7 +642,7 @@ static ssize_t store_cluster(struct config_item *i,
                             struct configfs_attribute *a,
                             const char *buf, size_t len)
 {
-        struct cluster *cl = to_cluster(i);
+        struct dlm_cluster *cl = to_cluster(i);
        struct cluster_attribute *cla =
                container_of(a, struct cluster_attribute, attr);
        return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -644,7 +651,7 @@ static ssize_t store_cluster(struct config_item *i,
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
                         char *buf)
 {
-        struct comm *cm = to_comm(i);
+        struct dlm_comm *cm = to_comm(i);
        struct comm_attribute *cma =
                        container_of(a, struct comm_attribute, attr);
        return cma->show ? cma->show(cm, buf) : 0;
@@ -653,29 +660,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
                          const char *buf, size_t len)
 {
-        struct comm *cm = to_comm(i);
+        struct dlm_comm *cm = to_comm(i);
        struct comm_attribute *cma =
                container_of(a, struct comm_attribute, attr);
        return cma->store ? cma->store(cm, buf, len) : -EINVAL;
 }
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
 {
        return sprintf(buf, "%d\n", cm->nodeid);
 }
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+                                 size_t len)
 {
        cm->nodeid = simple_strtol(buf, NULL, 0);
        return len;
 }
-static ssize_t comm_local_read(struct comm *cm, char *buf)
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
 {
        return sprintf(buf, "%d\n", cm->local);
 }
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+                                size_t len)
 {
        cm->local= simple_strtol(buf, NULL, 0);
        if (cm->local && !local_comm)
@@ -683,7 +692,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
        return len;
 }
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 {
        struct sockaddr_storage *addr;
@@ -705,7 +714,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
 static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
                         char *buf)
 {
-        struct node *nd = to_node(i);
+        struct dlm_node *nd = to_node(i);
        struct node_attribute *nda =
                        container_of(a, struct node_attribute, attr);
        return nda->show ? nda->show(nd, buf) : 0;
@@ -714,29 +723,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
                          const char *buf, size_t len)
 {
-        struct node *nd = to_node(i);
+        struct dlm_node *nd = to_node(i);
        struct node_attribute *nda =
                container_of(a, struct node_attribute, attr);
        return nda->store ? nda->store(nd, buf, len) : -EINVAL;
 }
-static ssize_t node_nodeid_read(struct node *nd, char *buf)
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
 {
        return sprintf(buf, "%d\n", nd->nodeid);
 }
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+                                 size_t len)
 {
        nd->nodeid = simple_strtol(buf, NULL, 0);
        return len;
 }
-static ssize_t node_weight_read(struct node *nd, char *buf)
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
 {
        return sprintf(buf, "%d\n", nd->weight);
 }
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+                                 size_t len)
 {
        nd->weight = simple_strtol(buf, NULL, 0);
        return len;
@@ -746,7 +757,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
 * Functions for the dlm to get the info that's been configured
 */
-static struct space *get_space(char *name)
+static struct dlm_space *get_space(char *name)
 {
        struct config_item *i;
@@ -760,15 +771,15 @@ static struct space *get_space(char *name)
        return to_space(i);
 }
-static void put_space(struct space *sp)
+static void put_space(struct dlm_space *sp)
 {
        config_item_put(&sp->group.cg_item);
 }
-static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 {
        struct config_item *i;
-        struct comm *cm = NULL;
+        struct dlm_comm *cm = NULL;
        int found = 0;
        if (!comm_list)
@@ -801,7 +812,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
        return cm;
 }
-static void put_comm(struct comm *cm)
+static void put_comm(struct dlm_comm *cm)
 {
        config_item_put(&cm->item);
 }
@@ -810,8 +821,8 @@ static void put_comm(struct comm *cm)
 int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
                    int **new_out, int *new_count_out)
 {
-        struct space *sp;
+        struct dlm_space *sp;
-        struct node *nd;
+        struct dlm_node *nd;
        int i = 0, rv = 0, ids_count = 0, new_count = 0;
        int *ids, *new;
@@ -874,8 +885,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 int dlm_node_weight(char *lsname, int nodeid)
 {
-        struct space *sp;
+        struct dlm_space *sp;
-        struct node *nd;
+        struct dlm_node *nd;
        int w = -EEXIST;
        sp = get_space(lsname);
@@ -897,7 +908,7 @@ int dlm_node_weight(char *lsname, int nodeid)
 int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 {
-        struct comm *cm = get_comm(nodeid, NULL);
+        struct dlm_comm *cm = get_comm(nodeid, NULL);
        if (!cm)
                return -EEXIST;
        if (!cm->addr_count)
@@ -909,7 +920,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
 {
-        struct comm *cm = get_comm(0, addr);
+        struct dlm_comm *cm = get_comm(0, addr);
        if (!cm)
                return -EEXIST;
        *nodeid = cm->nodeid;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 929e48ae759..34f14a14fb4 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -527,8 +527,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
                k32buf = (struct dlm_write_request32 *)kbuf;
                kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
                               sizeof(struct dlm_write_request32)), GFP_KERNEL);
-                if (!kbuf)
+                if (!kbuf) {
+                        kfree(k32buf);
                        return -ENOMEM;
+                }
                if (proc)
                        set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
@@ -539,8 +541,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
        /* do we really need this? can a write happen after a close? */
        if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
-            (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)))
+            (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
-                return -EINVAL;
+                error = -EINVAL;
+                goto out_free;
+        }
        sigfillset(&allsigs);
        sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ae5004e93f..e9fa960ba6d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1626,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
                free_blocks =
                        percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
 #endif
+        if (free_blocks <= root_blocks)
+                /* we don't have free space */
+                return 0;
        if (free_blocks - root_blocks < nblocks)
                return free_blocks - root_blocks;
        return nblocks;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d3d23d73c08..ec8e33b4521 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent,
                                get_dtype(sb, fname->file_type));
                if (error) {
                        filp->f_pos = curr_pos;
-                        info->extra_fname = fname->next;
+                        info->extra_fname = fname;
                        return error;
                }
                fname = fname->next;
@@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp,
         * If there are any leftover names on the hash collision
         * chain, return them first.
         */
-        if (info->extra_fname &&
+        if (info->extra_fname) {
-            call_filldir(filp, dirent, filldir, info->extra_fname))
+                if (call_filldir(filp, dirent, filldir, info->extra_fname))
-                goto finished;
+                        goto finished;
-        if (!info->curr_node)
+                info->extra_fname = NULL;
+                info->curr_node = rb_next(info->curr_node);
+                if (!info->curr_node) {
+                        if (info->next_hash == ~0) {
+                                filp->f_pos = EXT4_HTREE_EOF;
+                                goto finished;
+                        }
+                        info->curr_hash = info->next_hash;
+                        info->curr_minor_hash = 0;
+                }
+        } else if (!info->curr_node)
                info->curr_node = rb_first(&info->root);
        while (1) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c7924d9e35..295003241d3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1072,6 +1072,8 @@ extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
@@ -1227,6 +1229,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
+                                       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
                        unsigned long max_blocks, struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6c166c0a54b..d33dc56d698 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+                                                   int num,
+                                                   struct ext4_ext_path *path);
 extern int ext4_ext_try_to_merge(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 struct ext4_extent *);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index eb8bc3afe6e..b455c685a98 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This includes the super block, inode block, quota blocks and xattr blocks
+ */
+#define EXT4_META_TRANS_BLOCKS(sb)      (EXT4_XATTR_TRANS_BLOCKS + \
+                                        2*EXT4_QUOTA_TRANS_BLOCKS(sb))
 /* Delete operations potentially hit one directory's namespace plus an
 * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
 * generous.  We can grow the delete transaction later if necessary. */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 612c3d2c382..b24d3c53f20 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1747,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 }
 /*
- * ext4_ext_calc_credits_for_insert:
+ * ext4_ext_calc_credits_for_single_extent:
- * This routine returns max. credits that the extent tree can consume.
+ * This routine returns the max. credits needed to insert an extent
- * It should be OK for low-performance paths like ->writepage()
+ * to the extent tree.
- * To allow many writing processes to fit into a single transaction,
+ * When passing the actual path, the caller should calculate credits
- * the caller should calculate credits under i_data_sem and
+ * under i_data_sem.
+ *
+ * If @path shows free room in the leaf, only the bitmap, group
+ * descriptor and EXT4_META_TRANS_BLOCKS() credits are returned;
+ * otherwise the result of ext4_chunk_trans_blocks() is returned.
- * pass the actual path.
 */
-int ext4_ext_calc_credits_for_insert(struct inode *inode,
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                                                struct ext4_ext_path *path)
 {
-        int depth, needed;
        if (path) {
+                int depth = ext_depth(inode);
+                int ret = 0;
                /* probably there is space in leaf? */
-                depth = ext_depth(inode);
                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
-                                < le16_to_cpu(path[depth].p_hdr->eh_max))
+                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
-                        return 1;
-        }
-        /*
+                        /*
-         * given 32-bit logical block (4294967296 blocks), max. tree
+                         *  There is some space in the leaf, no
-         * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
+                         *  need to account for leaf block credit
-         * Let's also add one more level for imbalance.
+                         *
-         */
+                         *  bitmaps and block group descriptor blocks
-        depth = 5;
+                         *  and other metadata blocks still need to be
+                         *  accounted.
-        /* allocation of new data block(s) */
+                         */
-        needed = 2;
+                        /* 1 bitmap, 1 block group descriptor */
+                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+                        /* leaf has room: hand back the reduced estimate */
+                        return ret;
+                }
+        }
-        /*
+        return ext4_chunk_trans_blocks(inode, nrblocks);
-         * tree can be full, so it would need to grow in depth:
+}
-         * we need one credit to modify old root, credits for
-         * new root will be added in split accounting
-         */
-        needed += 1;
-        /*
+/*
-         * Index split can happen, we would need:
+ * How many index/leaf blocks need to change/allocate to modify nrblocks?
-         *    allocate intermediate indexes (bitmap + group)
+ *
-         *  + change two blocks at each level, but root (already included)
+ * if nrblocks fit in a single extent (chunk flag is 1), then
-         */
+ * in the worst case, each tree level index/leaf need to be changed
-        needed += (depth * 2) + (depth * 2);
+ * if the tree split due to insert a new extent, then the old tree
+ * index/leaf need to be updated too
+ *
+ * If the nrblocks are discontiguous, they could cause
+ * the whole tree split more than once, but this is really rare.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+        int index;
+        int depth = ext_depth(inode);
-        /* any allocation modifies superblock */
+        if (chunk)
-        needed += 1;
+                index = depth * 2;
+        else
+                index = depth * 3;
-        return needed;
+        return index;
 }
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
@@ -1921,9 +1928,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        correct_index = 1;
                        credits += (ext_depth(inode)) + 1;
                }
-#ifdef CONFIG_QUOTA
                credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
                err = ext4_ext_journal_restart(handle, credits);
                if (err)
@@ -2805,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode)
        /*
         * probably first extent we're gonna free will be last in block
         */
-        err = ext4_writepage_trans_blocks(inode) + 3;
+        err = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, err);
        if (IS_ERR(handle))
                return;
@@ -2819,7 +2824,7 @@ void ext4_ext_truncate(struct inode *inode)
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);
-        ext4_mb_discard_inode_preallocations(inode);
+        ext4_discard_reservation(inode);
        /*
         * TODO: optimization is possible here.
@@ -2858,27 +2863,6 @@ out_stop:
        ext4_journal_stop(handle);
 }
-/*
- * ext4_ext_writepage_trans_blocks:
- * calculate max number of blocks we could modify
- * in order to allocate new block for an inode
- */
-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
-{
-        int needed;
-        needed = ext4_ext_calc_credits_for_insert(inode, NULL);
-        /* caller wants to allocate num blocks, but note it includes sb */
-        needed = needed * num - (num - 1);
-#ifdef CONFIG_QUOTA
-        needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-        return needed;
-}
 static void ext4_falloc_update_inode(struct inode *inode,
                                int mode, loff_t new_size, int update_ctime)
 {
@@ -2939,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
                                                        - block;
        /*
-         * credits to insert 1 extent into extent tree + buffers to be able to
+         * credits to insert 1 extent into extent tree
-         * modify 1 super block, 1 block bitmap and 1 group descriptor.
         */
-        credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+        credits = ext4_chunk_trans_blocks(inode, max_blocks);
        mutex_lock(&inode->i_mutex);
 retry:
        while (ret >= 0 && ret < max_blocks) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 655e760212b..f344834bbf5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -351,7 +351,7 @@ find_close_to_parent:
                        goto found_flexbg;
                }
-                if (best_flex < 0 ||
+                if (flex_group[best_flex].free_inodes == 0 ||
                    (flex_group[i].free_blocks >
                     flex_group[best_flex].free_blocks &&
                     flex_group[i].free_inodes))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59fbbe899ac..7e91913e325 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
+#define MPAGE_DA_EXTENT_TAIL 0x01
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
@@ -1005,6 +1007,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
 */
 static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
 {
+        if (!blocks)
+                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
                return ext4_ext_calc_metadata_amount(inode, blocks);
@@ -1041,18 +1046,6 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
-/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
- */
-#define DIO_CREDITS 25
 /*
 * The ext4_get_blocks_wrap() function try to look up the requested blocks,
 * and returns if the blocks are already mapped.
@@ -1164,19 +1157,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
        return retval;
 }
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
 static int ext4_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
 {
        handle_t *handle = ext4_journal_current_handle();
        int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        int dio_credits;
        if (create && !handle) {
                /* Direct IO write... */
                if (max_blocks > DIO_MAX_BLOCKS)
                        max_blocks = DIO_MAX_BLOCKS;
-                handle = ext4_journal_start(inode, DIO_CREDITS +
+                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-                              2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+                handle = ext4_journal_start(inode, dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
@@ -1559,7 +1556,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int total, mdb, mdb_free, release;
+        if (!to_free)
+                return;         /* Nothing to release, exit */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        if (!EXT4_I(inode)->i_reserved_data_blocks) {
+                /*
+                 * if there is no reserved blocks, but we try to free some
+                 * then the counter is messed up somewhere.
+                 * but since this function is called from invalidate
+                 * page, it's harmless to return without any action
+                 */
+                printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+                            "blocks for inode %lu, but there is no reserved "
+                            "data blocks\n", to_free, inode->i_ino);
+                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                return;
+        }
        /* recalculate the number of metablocks still need to be reserved */
        total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
        mdb = ext4_calc_metadata_amount(inode, total);
@@ -1613,11 +1628,13 @@ struct mpage_da_data {
        unsigned long first_page, next_page;    /* extent of pages */
        get_block_t *get_block;
        struct writeback_control *wbc;
+        int io_done;
+        long pages_written;
 };
 /*
 * mpage_da_submit_io - walks through extent of pages and try to write
- * them with __mpage_writepage()
+ * them with writepage() call back
 *
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
@@ -1632,18 +1649,11 @@ struct mpage_da_data {
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
        struct address_space *mapping = mpd->inode->i_mapping;
-        struct mpage_data mpd_pp = {
-                .bio = NULL,
-                .last_block_in_bio = 0,
-                .get_block = mpd->get_block,
-                .use_writepage = 1,
-        };
        int ret = 0, err, nr_pages, i;
        unsigned long index, end;
        struct pagevec pvec;
        BUG_ON(mpd->next_page <= mpd->first_page);
        pagevec_init(&pvec, 0);
        index = mpd->first_page;
        end = mpd->next_page - 1;
@@ -1661,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                                break;
                        index++;
-                        err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+                        err = mapping->a_ops->writepage(page, mpd->wbc);
+                        if (!err)
+                                mpd->pages_written++;
                        /*
                         * In error case, we have to continue because
                         * remaining pages are still locked
@@ -1673,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                }
                pagevec_release(&pvec);
        }
-        if (mpd_pp.bio)
-                mpage_bio_submit(WRITE, mpd_pp.bio);
        return ret;
 }
@@ -1698,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
        int blocks = exbh->b_size >> inode->i_blkbits;
        sector_t pblock = exbh->b_blocknr, cur_logical;
        struct buffer_head *head, *bh;
-        unsigned long index, end;
+        pgoff_t index, end;
        struct pagevec pvec;
        int nr_pages, i;
@@ -1741,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                if (buffer_delay(bh)) {
                                        bh->b_blocknr = pblock;
                                        clear_buffer_delay(bh);
+                                        bh->b_bdev = inode->i_sb->s_bdev;
+                                } else if (buffer_unwritten(bh)) {
+                                        bh->b_blocknr = pblock;
+                                        clear_buffer_unwritten(bh);
+                                        set_buffer_mapped(bh);
+                                        set_buffer_new(bh);
+                                        bh->b_bdev = inode->i_sb->s_bdev;
                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);
@@ -1776,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
 *
 * The function skips space we know is already mapped to disk blocks.
 *
- * The function ignores errors ->get_block() returns, thus real
- * error handling is postponed to __mpage_writepage()
 */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+        int err = 0;
        struct buffer_head *lbh = &mpd->lbh;
-        int err = 0, remain = lbh->b_size;
        sector_t next = lbh->b_blocknr;
        struct buffer_head new;
@@ -1792,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
        if (buffer_mapped(lbh) && !buffer_delay(lbh))
                return;
-        while (remain) {
+        new.b_state = lbh->b_state;
-                new.b_state = lbh->b_state;
+        new.b_blocknr = 0;
-                new.b_blocknr = 0;
+        new.b_size = lbh->b_size;
-                new.b_size = remain;
-                err = mpd->get_block(mpd->inode, next, &new, 1);
-                if (err) {
-                        /*
-                         * Rather than implement own error handling
-                         * here, we just leave remaining blocks
-                         * unallocated and try again with ->writepage()
-                         */
-                        break;
-                }
-                BUG_ON(new.b_size == 0);
-                if (buffer_new(&new))
+        /*
-                        __unmap_underlying_blocks(mpd->inode, &new);
+         * If we didn't accumulate anything
+         * to write simply return
+         */
+        if (!new.b_size)
+                return;
+        err = mpd->get_block(mpd->inode, next, &new, 1);
+        if (err)
+                return;
+        BUG_ON(new.b_size == 0);
-                /*
+        if (buffer_new(&new))
-                 * If blocks are delayed marked, we need to
+                __unmap_underlying_blocks(mpd->inode, &new);
-                 * put actual blocknr and drop delayed bit
-                 */
-                if (buffer_delay(lbh))
-                        mpage_put_bnr_to_bhs(mpd, next, &new);
-                /* go for the remaining blocks */
+        /*
-                next += new.b_size >> mpd->inode->i_blkbits;
+         * If blocks are delayed marked, we need to
-                remain -= new.b_size;
+         * put actual blocknr and drop delayed bit
-        }
+         */
+        if (buffer_delay(lbh) || buffer_unwritten(lbh))
+                mpage_put_bnr_to_bhs(mpd, next, &new);
+        return;
 }
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+                (1 << BH_Delay) | (1 << BH_Unwritten))
 /*
 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1837,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
                                   sector_t logical, struct buffer_head *bh)
 {
-        struct buffer_head *lbh = &mpd->lbh;
        sector_t next;
+        size_t b_size = bh->b_size;
+        struct buffer_head *lbh = &mpd->lbh;
+        int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
-        next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+        /*
+         * Check if the reserved journal credits might overflow.
+         * For non-extent inodes the accumulated extent (lbh) is flushed
+         * once it already holds EXT4_MAX_TRANS_DATA blocks, and the size
+         * taken from bh is clamped so the extent never exceeds that limit.
+         */
+        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+                        /*
+                         * With non-extent format we are limited by the journal
+                         * credit available.  Total credit needed to insert
+                         * nrblocks contiguous blocks is dependent on the
+                         * nrblocks.  So limit nrblocks.
+                         */
+                        goto flush_it;
+                } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
+                                EXT4_MAX_TRANS_DATA) {
+                        /*
+                         * Adding the new buffer_head would make it cross the
+                         * allowed limit for which we have journal credit
+                         * reserved. So limit the new bh->b_size
+                         */
+                        b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
+                                                mpd->inode->i_blkbits;
+                        /* we will do mpage_da_submit_io in the next loop */
+                }
+        }
        /*
         * First block in the extent
         */
        if (lbh->b_size == 0) {
                lbh->b_blocknr = logical;
-                lbh->b_size = bh->b_size;
+                lbh->b_size = b_size;
                lbh->b_state = bh->b_state & BH_FLAGS;
                return;
        }
+        next = lbh->b_blocknr + nrblocks;
        /*
         * Can we merge the block to our big extent?
         */
        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-                lbh->b_size += bh->b_size;
+                lbh->b_size += b_size;
                return;
        }
+flush_it:
        /*
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
        mpage_da_map_blocks(mpd);
+        mpage_da_submit_io(mpd);
-        /*
+        mpd->io_done = 1;
-         * Now start a new extent
+        return;
-         */
-        lbh->b_size = bh->b_size;
-        lbh->b_state = bh->b_state & BH_FLAGS;
-        lbh->b_blocknr = logical;
 }
 /*
@@ -1891,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page,
        struct buffer_head *bh, *head, fake;
        sector_t logical;
+        if (mpd->io_done) {
+                /*
+                 * Rest of the pages in the page_vec:
+                 * redirty them and skip them. We will
+                 * try to write them again after
+                 * starting a new transaction
+                 */
+                redirty_page_for_writepage(wbc, page);
+                unlock_page(page);
+                return MPAGE_DA_EXTENT_TAIL;
+        }
        /*
         * Can we merge this page to current extent?
         */
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                 * and start IO on them using __mpage_writepage()
+                 * and start IO on them using writepage()
                 */
                if (mpd->next_page != mpd->first_page) {
                        mpage_da_map_blocks(mpd);
                        mpage_da_submit_io(mpd);
+                        /*
+                         * skip rest of the page in the page_vec
+                         */
+                        mpd->io_done = 1;
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        return MPAGE_DA_EXTENT_TAIL;
                }
                /*
@@ -1932,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page,
                set_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                mpage_add_bh_to_extent(mpd, logical, bh);
+                if (mpd->io_done)
+                        return MPAGE_DA_EXTENT_TAIL;
        } else {
                /*
                 * Page with regular buffer heads, just add all dirty ones
@@ -1940,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page,
                bh = head;
                do {
                        BUG_ON(buffer_locked(bh));
-                        if (buffer_dirty(bh))
+                        if (buffer_dirty(bh) &&
+                                (!buffer_mapped(bh) || buffer_delay(bh))) {
                                mpage_add_bh_to_extent(mpd, logical, bh);
+                                if (mpd->io_done)
+                                        return MPAGE_DA_EXTENT_TAIL;
+                        }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
        }
@@ -1960,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page,
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
- *
- * In order to avoid duplication of logic that deals with partial pages,
- * multiple bio per page, etc, we find non-allocated blocks, allocate
- * them with minimal calls to ->get_block() and re-use __mpage_writepage()
- *
- * It's important that we call __mpage_writepage() only once for each
- * involved page, otherwise we'd have to implement more complicated logic
- * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
- *
- * See comments to mpage_writepages()
 */
 static int mpage_da_writepages(struct address_space *mapping,
                               struct writeback_control *wbc,
                               get_block_t get_block)
 {
        struct mpage_da_data mpd;
+        long to_write;
        int ret;
        if (!get_block)
@@ -1989,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping,
        mpd.first_page = 0;
        mpd.next_page = 0;
        mpd.get_block = get_block;
+        mpd.io_done = 0;
+        mpd.pages_written = 0;
+        to_write = wbc->nr_to_write;
        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
        /*
         * Handle last extent of pages
         */
-        if (mpd.next_page != mpd.first_page) {
+        if (!mpd.io_done && mpd.next_page != mpd.first_page) {
                mpage_da_map_blocks(&mpd);
                mpage_da_submit_io(&mpd);
        }
+        wbc->nr_to_write = to_write - mpd.pages_written;
        return ret;
 }
@@ -2204,63 +2255,95 @@ static int ext4_da_writepage(struct page *page,
 }
 /*
- * For now just follow the DIO way to estimate the max credits
+ * This is called via ext4_da_writepages() to
- * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * calculate the total number of credits to reserve to fit
- * todo: need to calculate the max credits need for
+ * a single extent allocation into a single transaction,
- * extent based files, currently the DIO credits is based on
+ * ext4_da_writepages() will loop calling this before
- * indirect-blocks mapping way.
+ * the block allocation.
- *
- * Probably should have a generic way to calculate credits
- * for DIO, writepages, and truncate
 */
-#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
-#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+        int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+        /*
+         * Credits are sized for the inode's reserved data blocks.
+         * With non-extent format the journal credit needed to
+         * insert nrblocks contiguous blocks depends on the number of
+         * contiguous blocks, so cap that number at EXT4_MAX_TRANS_DATA.
+         *
+         * EXT4_EXTENTS_FL is tested on EXT4_I(inode)->i_flags, as the
+         * other EXT4_EXTENTS_FL checks in fs/ext4/inode.c do, rather
+         * than on inode->i_flags.
+         */
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+            (max_blocks > EXT4_MAX_TRANS_DATA))
+                max_blocks = EXT4_MAX_TRANS_DATA;
+        return ext4_chunk_trans_blocks(inode, max_blocks);
+}
 static int ext4_da_writepages(struct address_space *mapping,
-                                struct writeback_control *wbc)
+                              struct writeback_control *wbc)
 {
-        struct inode *inode = mapping->host;
        handle_t *handle = NULL;
-        int needed_blocks;
-        int ret = 0;
-        long to_write;
        loff_t range_start = 0;
+        struct inode *inode = mapping->host;
+        int needed_blocks, ret = 0, nr_to_writebump = 0;
+        long to_write, pages_skipped = 0;
+        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
-        if (!mapping->nrpages)
+        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
        /*
-         * Estimate the worse case needed credits to write out
+         * Make sure nr_to_write is >= sbi->s_mb_stream_request
-         * EXT4_MAX_BUF_BLOCKS pages
+         * This make sure small files blocks are allocated in
+         * single attempt. This ensure that small files
+         * get less fragmented.
         */
-        needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+        if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+                wbc->nr_to_write = sbi->s_mb_stream_request;
+        }
-        to_write = wbc->nr_to_write;
+        if (!wbc->range_cyclic)
-        if (!wbc->range_cyclic) {
                /*
                 * If range_cyclic is not set force range_cont
                 * and save the old writeback_index
                 */
                wbc->range_cont = 1;
-                range_start =  wbc->range_start;
-        }
-        while (!ret && to_write) {
+        range_start =  wbc->range_start;
+        pages_skipped = wbc->pages_skipped;
+restart_loop:
+        to_write = wbc->nr_to_write;
+        while (!ret && to_write > 0) {
+                /*
+                 * we  insert one extent at a time. So we need
+                 * credit needed for single extent allocation.
+                 * journalled mode is currently not supported
+                 * by delalloc
+                 */
+                BUG_ON(ext4_should_journal_data(inode));
+                needed_blocks = ext4_da_writepages_trans_blocks(inode);
                /* start a new transaction*/
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
+                        printk(KERN_EMERG "%s: jbd2_start: "
+                               "%ld pages, ino %lu; err %d\n", __func__,
+                                wbc->nr_to_write, inode->i_ino, ret);
+                        dump_stack();
                        goto out_writepages;
                }
                if (ext4_should_order_data(inode)) {
                        /*
                         * With ordered mode we need to add
-                         * the inode to the journal handle
+                         * the inode to the journal handl
                         * when we do block allocation.
                         */
                        ret = ext4_jbd2_file_inode(handle, inode);
@@ -2268,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping,
                                ext4_journal_stop(handle);
                                goto out_writepages;
                        }
                }
-                /*
-                 * set the max dirty pages could be write at a time
-                 * to fit into the reserved transaction credits
-                 */
-                if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-                        wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
                to_write -= wbc->nr_to_write;
                ret = mpage_da_writepages(mapping, wbc,
-                                                ext4_da_get_block_write);
+                                          ext4_da_get_block_write);
                ext4_journal_stop(handle);
-                if (wbc->nr_to_write) {
+                if (ret == MPAGE_DA_EXTENT_TAIL) {
+                        /*
+                         * got one extent now try with
+                         * rest of the pages
+                         */
+                        to_write += wbc->nr_to_write;
+                        ret = 0;
+                } else if (wbc->nr_to_write) {
                        /*
                         * There is no more writeout needed
                         * or we requested for a noblocking writeout
@@ -2293,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->nr_to_write = to_write;
        }
-out_writepages:
+        if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-        wbc->nr_to_write = to_write;
+                /* We skipped pages in this loop */
-        if (range_start)
                wbc->range_start = range_start;
+                wbc->nr_to_write = to_write +
+                                wbc->pages_skipped - pages_skipped;
+                wbc->pages_skipped = pages_skipped;
+                goto restart_loop;
+        }
+out_writepages:
+        wbc->nr_to_write = to_write - nr_to_writebump;
+        wbc->range_start = range_start;
        return ret;
 }
@@ -3486,6 +3577,9 @@ void ext4_truncate(struct inode *inode)
         * modify the block allocation tree.
         */
        down_write(&ei->i_data_sem);
+        ext4_discard_reservation(inode);
        /*
         * The orphan list entry will now protect us from any crash which
         * occurs before the truncate completes, so it is now safe to propagate
@@ -3555,8 +3649,6 @@ do_indirects:
                ;
        }
-        ext4_discard_reservation(inode);
        up_write(&ei->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
@@ -4324,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
+/*
+ * Estimate the number of indirect-mapping blocks that may be modified
+ * when @nrblocks data blocks are mapped in an inode that does not use
+ * extents.  A nonzero @chunk means the data blocks are contiguous, which
+ * permits a much tighter bound than the per-block worst case.
+ */
+static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
+                                      int chunk)
+{
+        int indirects;
+        /* if nrblocks are contiguous */
+        if (chunk) {
+                /*
+                 * N contiguous data blocks need not start on an
+                 * indirect block boundary, so they touch at most
+                 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 2 indirect blocks
+                 * 2 dindirect blocks
+                 * 1 tindirect block
+                 */
+                indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb) + 2;
+                return indirects + 3;
+        }
+        /*
+         * if nrblocks are not contiguous, worst case, each block touches
+         * an indirect block, and each indirect block touches a double
+         * indirect block, plus a triple indirect block
+         */
+        indirects = nrblocks * 2 + 1;
+        return indirects;
+}
+/*
+ * Number of index blocks (indirect blocks or extent tree blocks) that
+ * may be modified when mapping @nrblocks data blocks.  @chunk is nonzero
+ * when the blocks form one contiguous run; it is forwarded so that the
+ * helpers can apply their contiguous estimate instead of always falling
+ * back to the discontiguous worst case.
+ */
+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+                return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+}
 /*
- * How many blocks doth make a writepage()?
+ * Account for index blocks, block groups bitmaps and block group
- *
+ * descriptor blocks when data blocks and index blocks are modified;
- * With N blocks per page, it may be:
+ * worst case, the index blocks are spread over different block groups
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
 *
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
+ * If the data blocks are discontiguous, they may be spread over
+ * different block groups too. If they are contiguous, with flexbg,
+ * they could still cross a block group boundary.
 *
- * With ordered or writeback data it's the same, less the N data blocks.
+ * Also account for superblock, inode, quota and xattr blocks
+ */
+int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        int index_blocks, bitmap_blocks, gdesc_blocks;
+        int total;
+        /*
+         * Index blocks that must be touched to modify nrblocks.  The
+         * "chunk" flag tells whether the nrblocks are physically
+         * contiguous on disk; Direct IO and fallocate call get_block to
+         * allocate one single extent at a time, so they can set it.
+         */
+        index_blocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+        total = index_blocks;
+        /*
+         * Every index block may live in its own group; the data adds one
+         * more group when contiguous, or one group per block otherwise.
+         */
+        bitmap_blocks = index_blocks + (chunk ? 1 : nrblocks);
+        gdesc_blocks = bitmap_blocks;
+        /* never more bitmaps than there are groups ... */
+        if (bitmap_blocks > sbi->s_groups_count)
+                bitmap_blocks = sbi->s_groups_count;
+        /* ... nor more descriptor blocks than s_gdb_count */
+        if (bitmap_blocks > sbi->s_gdb_count)
+                gdesc_blocks = sbi->s_gdb_count;
+        /* bitmaps and block group descriptor blocks */
+        total += bitmap_blocks + gdesc_blocks;
+        /* Blocks for super block, inode, quota and xattr blocks */
+        total += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+        return total;
+}
+/*
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single page into a single transaction,
+ * which may include multiple chunks of block allocations.
 *
- * If the inode's direct blocks can hold an integral number of pages then a
+ * This could be called via ext4_write_begin()
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
 *
- * This still overestimates under most circumstances.  If we were to pass the
+ * We need to consider the worst case, when
- * start and end offsets in here as well we could do block_to_path() on each
+ * one new block per extent.
- * block and work out the exact number of indirects which are touched.  Pah.
 */
 int ext4_writepage_trans_blocks(struct inode *inode)
 {
        int bpp = ext4_journal_blocks_per_page(inode);
-        int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
        int ret;
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+        ret = ext4_meta_trans_blocks(inode, bpp, 0);
-                return ext4_ext_writepage_trans_blocks(inode, bpp);
+        /* Account for data blocks for journalled mode */
        if (ext4_should_journal_data(inode))
-                ret = 3 * (bpp + indirects) + 2;
+                ret += bpp;
-        else
-                ret = 2 * (bpp + indirects) + 2;
-#ifdef CONFIG_QUOTA
-        /* We know that structure was already allocated during DQUOT_INIT so
-         * we will be updating only the data blocks + inodes */
-        ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
        return ret;
 }
 /*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calls
+ * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do not need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+        /* chunk == 1: the nrblocks are treated as one contiguous run */
+        int credits = ext4_meta_trans_blocks(inode, nrblocks, 1);
+        return credits;
+}
+/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Give this, we know that the caller already has write access to iloc->bh.
 */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 865e9ddb44d..e0e3a5eb1dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3282,6 +3282,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 }
 /*
+ * Return the prealloc space that has the minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space with the currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+                        struct ext4_prealloc_space *pa,
+                        struct ext4_prealloc_space *cpa)
+{
+        ext4_fsblk_t cur_distance, new_distance;
+        /* no candidate yet: take a reference on @pa and make it the one */
+        if (cpa == NULL) {
+                atomic_inc(&pa->pa_count);
+                return pa;
+        }
+        /*
+         * Do not use abs() here: it works on int, while the operands are
+         * ext4_fsblk_t block numbers whose difference may not fit and,
+         * being unsigned, wraps instead of going negative.  Subtract the
+         * smaller value from the larger one instead.
+         */
+        cur_distance = goal_block > cpa->pa_pstart ?
+                        goal_block - cpa->pa_pstart :
+                        cpa->pa_pstart - goal_block;
+        new_distance = goal_block > pa->pa_pstart ?
+                        goal_block - pa->pa_pstart :
+                        pa->pa_pstart - goal_block;
+        /* keep the current candidate only when it is strictly closer */
+        if (cur_distance < new_distance)
+                return cpa;
+        /* drop the previous reference */
+        atomic_dec(&cpa->pa_count);
+        atomic_inc(&pa->pa_count);
+        return pa;
+}
+/*
 * search goal blocks in preallocated space
 */
 static noinline_for_stack int
@@ -3290,7 +3319,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
        int order, i;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_locality_group *lg;
-        struct ext4_prealloc_space *pa;
+        struct ext4_prealloc_space *pa, *cpa = NULL;
+        ext4_fsblk_t goal_block;
        /* only data can be preallocated */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3333,6 +3363,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;
+        goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
+                     ac->ac_g_ex.fe_start +
+                     le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+        /*
+         * search for the prealloc space that has the
+         * minimal distance from the goal block.
+         */
        for (i = order; i < PREALLOC_TB_SIZE; i++) {
                rcu_read_lock();
                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
@@ -3340,17 +3377,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
                        spin_lock(&pa->pa_lock);
                        if (pa->pa_deleted == 0 &&
                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
-                                atomic_inc(&pa->pa_count);
-                                ext4_mb_use_group_pa(ac, pa);
+                                cpa = ext4_mb_check_group_pa(goal_block,
-                                spin_unlock(&pa->pa_lock);
+                                                                pa, cpa);
-                                ac->ac_criteria = 20;
-                                rcu_read_unlock();
-                                return 1;
                        }
                        spin_unlock(&pa->pa_lock);
                }
                rcu_read_unlock();
        }
+        if (cpa) {
+                ext4_mb_use_group_pa(ac, cpa);
+                ac->ac_criteria = 20;
+                return 1;
+        }
        return 0;
 }
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e..46fc0b5b12b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
         * credit. But below we try to not accumalate too much
         * of them by restarting the journal.
         */
-        needed = ext4_ext_calc_credits_for_insert(inode, path);
+        needed = ext4_ext_calc_credits_for_single_extent(inode,
+                    lb->last_block - lb->first_block + 1, path);
        /*
         * Make sure the credit we accumalated is not really high
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0a926516426..b3d35604ea1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        if (reserved_gdb || gdb_off == 0) {
                if (!EXT4_HAS_COMPAT_FEATURE(sb,
-                                             EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+                                             EXT4_FEATURE_COMPAT_RESIZE_INODE)
+                    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
                        ext4_warning(sb, __func__,
                                     "No reserved GDT blocks, can't resize");
                        return -EPERM;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d5d77958b86..566344b926b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -568,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #endif
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
+        ei->vfs_inode.i_data.writeback_index = 0;
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6d266d793e2..80ff3381fa2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)
        struct buffer_head *bh;
        struct msdos_dir_entry *raw_entry;
        loff_t i_pos;
-        int err = 0;
+        int err;
 retry:
        i_pos = MSDOS_I(inode)->i_pos;
        if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
                return 0;
-        lock_super(sb);
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
        if (!bh) {
                printk(KERN_ERR "FAT: unable to read inode block "
                       "for updating (i_pos %lld)\n", i_pos);
-                err = -EIO;
+                return -EIO;
-                goto out;
        }
        spin_lock(&sbi->inode_hash_lock);
        if (i_pos != MSDOS_I(inode)->i_pos) {
                spin_unlock(&sbi->inode_hash_lock);
                brelse(bh);
-                unlock_super(sb);
                goto retry;
        }
@@ -607,11 +604,10 @@ retry:
        }
        spin_unlock(&sbi->inode_hash_lock);
        mark_buffer_dirty(bh);
+        err = 0;
        if (wait)
                err = sync_dirty_buffer(bh);
        brelse(bh);
-out:
-        unlock_super(sb);
        return err;
 }
diff --git a/fs/inode.c b/fs/inode.c
index b6726f64453..0487ddba139 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)
                mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
                mapping->assoc_mapping = NULL;
                mapping->backing_dev_info = &default_backing_dev_info;
+                mapping->writeback_index = 0;
                /*
                 * If the block_device provides a backing_dev_info for client
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c4a1c3c65aa..da3cc460d4d 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
                                pgrp = task_pgrp(current);
                        else
                                pgrp = find_vpid(who);
-                        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+                        do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                                ret = set_task_ioprio(p, ioprio);
                                if (ret)
                                        break;
-                        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case IOPRIO_WHO_USER:
                        if (!who)
@@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
                                pgrp = task_pgrp(current);
                        else
                                pgrp = find_vpid(who);
-                        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+                        do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                                tmpio = get_task_ioprio(p);
                                if (tmpio < 0)
                                        continue;
@@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
                                        ret = tmpio;
                                else
                                        ret = ioprio_best(ret, tmpio);
-                        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case IOPRIO_WHO_USER:
                        if (!who)
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdd..4c41db91eaa 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
 #ifndef _JFFS2_FS_I
 #define _JFFS2_FS_I
-#include <linux/version.h>
 #include <linux/rbtree.h>
 #include <linux/posix_acl.h>
 #include <linux/mutex.h>
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index 697663b01ba..e1c0ec0ae98 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -92,7 +92,7 @@ int omfs_allocate_block(struct super_block *sb, u64 block)
        struct buffer_head *bh;
        struct omfs_sb_info *sbi = OMFS_SB(sb);
        int bits_per_entry = 8 * sb->s_blocksize;
-        int map, bit;
+        unsigned int map, bit;
        int ret = 0;
        u64 tmp;
@@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count)
        struct omfs_sb_info *sbi = OMFS_SB(sb);
        int bits_per_entry = 8 * sb->s_blocksize;
        u64 tmp;
-        int map, bit, ret;
+        unsigned int map, bit;
+        int ret;
        tmp = block;
        bit = do_div(tmp, bits_per_entry);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 7e2499053e4..834b2331f6b 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry,
        return err ? -EIO : 0;
 }
+/*
+ * Upper bound on the e_extent_count of an extent table that begins
+ * @offset bytes into a block of sbi->s_sys_blocksize bytes.
+ */
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+        /* bytes remaining in the block after the table header */
+        size_t room = sbi->s_sys_blocksize - offset -
+                      sizeof(struct omfs_extent);
+        /* NOTE(review): the +1 presumably counts the e_entry inside the header */
+        return room / sizeof(struct omfs_extent_entry) + 1;
+}
 void omfs_make_empty_table(struct buffer_head *bh, int offset)
 {
        struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
@@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode)
        struct buffer_head *bh;
        u64 next, last;
        u32 extent_count;
+        u32 max_extents;
        int ret;
        /* traverse extent table, freeing each entry that is greater
@@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode)
                goto out;
        oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+        max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
        for (;;) {
-                if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) {
+                if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
-                        brelse(bh);
+                        goto out_brelse;
-                        goto out;
-                }
                extent_count = be32_to_cpu(oe->e_extent_count);
+                if (extent_count > max_extents)
+                        goto out_brelse;
                last = next;
                next = be64_to_cpu(oe->e_next);
                entry = &oe->e_entry;
@@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode)
                if (!bh)
                        goto out;
                oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+                max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
        }
        ret = 0;
 out:
        return ret;
+out_brelse:
+        brelse(bh);
+        return ret;
 }
 static void omfs_truncate(struct inode *inode)
@@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
                        goto out;
                }
        }
-        max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START -
+        max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
-                sizeof(struct omfs_extent)) /
-                sizeof(struct omfs_extent_entry) + 1;
        /* TODO: add a continuation block here */
        if (be32_to_cpu(oe->e_extent_count) > max_count-1)
@@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
        sector_t next, offset;
        int ret;
        u64 new_block;
+        u32 max_extents;
        int extent_count;
        struct omfs_extent *oe;
        struct omfs_extent_entry *entry;
@@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
                goto out;
        oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+        max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
        next = inode->i_ino;
        for (;;) {
@@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block,
                next = be64_to_cpu(oe->e_next);
                entry = &oe->e_entry;
+                if (extent_count > max_extents)
+                        goto out_brelse;
                offset = find_block(inode, entry, block, extent_count, &remain);
                if (offset > 0) {
                        ret = 0;
@@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
                if (!bh)
                        goto out;
                oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+                max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
        }
        if (create) {
                ret = omfs_grow_extent(inode, oe, &new_block);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index a95fe5984f4..d29047b1b9b 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
                inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
                inode->i_op = &omfs_dir_inops;
                inode->i_fop = &omfs_dir_operations;
-                inode->i_size = be32_to_cpu(oi->i_head.h_body_size) +
+                inode->i_size = sbi->s_sys_blocksize;
-                        sizeof(struct omfs_header);
                inc_nlink(inode);
                break;
        case OMFS_FILE:
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 79ecd281d2c..3f87d263294 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
        }
        seq_printf(m,
-                   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+                   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
                   vma->vm_start,
                   vma->vm_end,
                   flags & VM_READ ? 'r' : '-',
                   flags & VM_WRITE ? 'w' : '-',
                   flags & VM_EXEC ? 'x' : '-',
                   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-                   vma->vm_pgoff << PAGE_SHIFT,
+                   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
                   MAJOR(dev), MINOR(dev), ino, &len);
        if (file) {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7546a918f79..73d1891ee62 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -219,14 +219,14 @@ static int show_map(struct seq_file *m, void *v)
                ino = inode->i_ino;
        }
-        seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+        seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
                        vma->vm_start,
                        vma->vm_end,
                        flags & VM_READ ? 'r' : '-',
                        flags & VM_WRITE ? 'w' : '-',
                        flags & VM_EXEC ? 'x' : '-',
                        flags & VM_MAYSHARE ? 's' : 'p',
-                        vma->vm_pgoff << PAGE_SHIFT,
+                        ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
                        MAJOR(dev), MINOR(dev), ino, &len);
        /*
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index d81fb9ed2b8..15409815747 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
        idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-        /* And make sure we have twice the index size of space reserved */
+        /* And make sure we have thrice the index size of space reserved */
-        idx_size <<= 1;
+        idx_size = idx_size + (idx_size << 1);
        /*
         * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
@@ -388,11 +388,11 @@ static int can_use_rp(struct ubifs_info *c)
 * This function makes sure UBIFS has enough free eraseblocks for index growth
 * and data.
 *
- * When budgeting index space, UBIFS reserves twice as more LEBs as the index
+ * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
 * would take if it was consolidated and written to the flash. This guarantees
 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
 * be able to commit dirty index. So this function basically adds amount of
- * budgeted index space to the size of the current index, multiplies this by 2,
+ * budgeted index space to the size of the current index, multiplies this by 3,
 * and makes sure this does not exceed the amount of free eraseblocks.
 *
 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
@@ -543,8 +543,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
        int err, idx_growth, data_growth, dd_growth;
        struct retries_info ri;
+        ubifs_assert(req->new_page <= 1);
+        ubifs_assert(req->dirtied_page <= 1);
+        ubifs_assert(req->new_dent <= 1);
+        ubifs_assert(req->mod_dent <= 1);
+        ubifs_assert(req->new_ino <= 1);
+        ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
        ubifs_assert(req->dirtied_ino <= 4);
        ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+        ubifs_assert(!(req->new_ino_d & 7));
+        ubifs_assert(!(req->dirtied_ino_d & 7));
        data_growth = calc_data_growth(c, req);
        dd_growth = calc_dd_growth(c, req);
@@ -618,8 +626,16 @@ again:
 */
 void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
+        ubifs_assert(req->new_page <= 1);
+        ubifs_assert(req->dirtied_page <= 1);
+        ubifs_assert(req->new_dent <= 1);
+        ubifs_assert(req->mod_dent <= 1);
+        ubifs_assert(req->new_ino <= 1);
+        ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
        ubifs_assert(req->dirtied_ino <= 4);
        ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+        ubifs_assert(!(req->new_ino_d & 7));
+        ubifs_assert(!(req->dirtied_ino_d & 7));
        if (!req->recalculate) {
                ubifs_assert(req->idx_growth >= 0);
                ubifs_assert(req->data_growth >= 0);
@@ -647,7 +663,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
        ubifs_assert(c->budg_idx_growth >= 0);
        ubifs_assert(c->budg_data_growth >= 0);
+        ubifs_assert(c->budg_dd_growth >= 0);
        ubifs_assert(c->min_idx_lebs < c->main_lebs);
+        ubifs_assert(!(c->budg_idx_growth & 7));
+        ubifs_assert(!(c->budg_data_growth & 7));
+        ubifs_assert(!(c->budg_dd_growth & 7));
        spin_unlock(&c->space_lock);
 }
@@ -686,9 +706,10 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
                                      struct ubifs_inode *ui)
 {
-        struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
+        struct ubifs_budget_req req;
-                                       .dirtied_ino_d = ui->data_len};
+        memset(&req, 0, sizeof(struct ubifs_budget_req));
+        req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
        ubifs_release_budget(c, &req);
 }
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 3b516316c9b..0a6aa2cc78f 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c)
                        goto out_up;
        }
+        c->cmt_no += 1;
        err = ubifs_gc_start_commit(c);
        if (err)
                goto out_up;
@@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c)
                goto out;
        mutex_lock(&c->mst_mutex);
-        c->mst_node->cmt_no      = cpu_to_le64(++c->cmt_no);
+        c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);
        c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
        c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
        c->mst_node->root_offs   = cpu_to_le32(zroot.offs);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4e3aaeba4ec..b9cb7747375 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
 {
        spin_lock(&dbg_lock);
-        printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs  %d\n",
+        printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
-               lst->empty_lebs, lst->idx_lebs);
+               "idx_lebs  %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
        printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
               "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
               lst->total_dirty);
@@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c)
        struct ubifs_gced_idx_leb *idx_gc;
        spin_lock(&dbg_lock);
-        printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
+        printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
-               "budg_dd_growth %lld, budg_idx_growth %lld\n",
+               "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
               c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
        printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
               "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
@@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
        struct ubifs_lprops lp;
        struct ubifs_lp_stats lst;
-        printk(KERN_DEBUG "Dumping LEB properties\n");
+        printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
        ubifs_get_lp_stats(c, &lst);
        dbg_dump_lstats(&lst);
@@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
        if (dbg_failure_mode)
                return;
-        printk(KERN_DEBUG "Dumping LEB %d\n", lnum);
+        printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
        sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
        if (IS_ERR(sleb)) {
@@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 {
        int i;
-        printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
+        printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
-               cat, heap->cnt);
+               current->pid, cat, heap->cnt);
        for (i = 0; i < heap->cnt; i++) {
                struct ubifs_lprops *lprops = heap->arr[i];
@@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 {
        int i;
-        printk(KERN_DEBUG "Dumping pnode:\n");
+        printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
        printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
               (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
        printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
        int level;
        printk(KERN_DEBUG "\n");
-        printk(KERN_DEBUG "Dumping the TNC tree\n");
+        printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
        znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
        level = znode->level;
        printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
                  int offset, int len, int dtype)
 {
-        int err;
+        int err, failing;
        if (in_failure_mode(desc))
                return -EIO;
-        if (do_fail(desc, lnum, 1))
+        failing = do_fail(desc, lnum, 1);
+        if (failing)
                cut_data(buf, len);
        err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
        if (err)
                return err;
-        if (in_failure_mode(desc))
+        if (failing)
                return -EIO;
        return 0;
 }
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 3c4f1e93c9e..50315fc5718 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,7 @@
 #define UBIFS_DBG(op) op
-#define ubifs_assert(expr)  do {                                               \
+#define ubifs_assert(expr) do {                                                \
        if (unlikely(!(expr))) {                                               \
                printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
                       __func__, __LINE__, current->pid);                      \
@@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c,
                         const union ubifs_key *key);
 /*
- * DBGKEY macros require dbg_lock to be held, which it is in the dbg message
+ * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
 * macros.
 */
 #define DBGKEY(key) dbg_key_str0(c, (key))
 #define DBGKEY1(key) dbg_key_str1(c, (key))
 /* General messages */
-#define dbg_gen(fmt, ...)        dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...)   dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
 /* Additional journal messages */
-#define dbg_jnl(fmt, ...)        dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
 /* Additional TNC messages */
-#define dbg_tnc(fmt, ...)        dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
 /* Additional lprops messages */
-#define dbg_lp(fmt, ...)         dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
 /* Additional LEB find messages */
-#define dbg_find(fmt, ...)       dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
 /* Additional mount messages */
-#define dbg_mnt(fmt, ...)        dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
 /* Additional I/O messages */
-#define dbg_io(fmt, ...)         dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
 /* Additional commit messages */
-#define dbg_cmt(fmt, ...)        dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
 /* Additional budgeting messages */
-#define dbg_budg(fmt, ...)       dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
 /* Additional log messages */
-#define dbg_log(fmt, ...)        dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
 /* Additional gc messages */
-#define dbg_gc(fmt, ...)         dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
 /* Additional scan messages */
-#define dbg_scan(fmt, ...)       dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
 /* Additional recovery messages */
-#define dbg_rcvry(fmt, ...)      dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
 /*
 * Debugging message type flags (must match msg_type_names in debug.c).
@@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
                                 struct ubifs_zbranch *zbr, void *priv);
 typedef int (*dbg_znode_callback)(struct ubifs_info *c,
                                  struct ubifs_znode *znode, void *priv);
 int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
                   dbg_znode_callback znode_cb, void *priv);
 /* Checking functions */
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_cats(struct ubifs_info *c);
 int dbg_check_ltab(struct ubifs_info *c);
 int dbg_check_synced_i_size(struct inode *inode);
 int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
 int dbg_check_tnc(struct ubifs_info *c, int extra);
 int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
 int dbg_check_filesystem(struct ubifs_info *c);
 void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
                    int add_pos);
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
                        int row, int col);
@@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #else /* !CONFIG_UBIFS_FS_DEBUG */
 #define UBIFS_DBG(op)
-#define ubifs_assert(expr)                         ({})
-#define ubifs_assert_cmt_locked(c)
+/* Use "if (0)" to make compiler check arguments even if debugging is off */
+#define ubifs_assert(expr)  do {                                               \
+        if (0 && (expr))                                                       \
+                printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+                       __func__, __LINE__, current->pid);                      \
+} while (0)
+#define dbg_err(fmt, ...)   do {                                               \
+        if (0)                                                                 \
+                ubifs_err(fmt, ##__VA_ARGS__);                                 \
+} while (0)
+#define dbg_msg(fmt, ...) do {                                                 \
+        if (0)                                                                 \
+                printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n",         \
+                       current->pid, __func__, ##__VA_ARGS__);                 \
+} while (0)
 #define dbg_dump_stack()
-#define dbg_err(fmt, ...)                          ({})
+#define ubifs_assert_cmt_locked(c)
-#define dbg_msg(fmt, ...)                          ({})
-#define dbg_key(c, key, fmt, ...)                  ({})
-#define dbg_gen(fmt, ...)                          ({})
-#define dbg_jnl(fmt, ...)                          ({})
-#define dbg_tnc(fmt, ...)                          ({})
-#define dbg_lp(fmt, ...)                           ({})
-#define dbg_find(fmt, ...)                         ({})
-#define dbg_mnt(fmt, ...)                          ({})
-#define dbg_io(fmt, ...)                           ({})
-#define dbg_cmt(fmt, ...)                          ({})
-#define dbg_budg(fmt, ...)                         ({})
-#define dbg_log(fmt, ...)                          ({})
-#define dbg_gc(fmt, ...)                           ({})
-#define dbg_scan(fmt, ...)                         ({})
-#define dbg_rcvry(fmt, ...)                        ({})
-#define dbg_ntype(type)                            ""
-#define dbg_cstate(cmt_state)                      ""
-#define dbg_get_key_dump(c, key)                   ({})
-#define dbg_dump_inode(c, inode)                   ({})
-#define dbg_dump_node(c, node)                     ({})
-#define dbg_dump_budget_req(req)                   ({})
-#define dbg_dump_lstats(lst)                       ({})
-#define dbg_dump_budg(c)                           ({})
-#define dbg_dump_lprop(c, lp)                      ({})
-#define dbg_dump_lprops(c)                         ({})
-#define dbg_dump_leb(c, lnum)                      ({})
-#define dbg_dump_znode(c, znode)                   ({})
-#define dbg_dump_heap(c, heap, cat)                ({})
-#define dbg_dump_pnode(c, pnode, parent, iip)      ({})
-#define dbg_dump_tnc(c)                            ({})
-#define dbg_dump_index(c)                          ({})
-#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
+#define dbg_gen(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define DBGKEY(key)  ((char *)(key))
+#define DBGKEY1(key) ((char *)(key))
+#define dbg_ntype(type)                       ""
+#define dbg_cstate(cmt_state)                 ""
+#define dbg_get_key_dump(c, key)              ({})
+#define dbg_dump_inode(c, inode)              ({})
+#define dbg_dump_node(c, node)                ({})
+#define dbg_dump_budget_req(req)              ({})
+#define dbg_dump_lstats(lst)                  ({})
+#define dbg_dump_budg(c)                      ({})
+#define dbg_dump_lprop(c, lp)                 ({})
+#define dbg_dump_lprops(c)                    ({})
+#define dbg_dump_leb(c, lnum)                 ({})
+#define dbg_dump_znode(c, znode)              ({})
+#define dbg_dump_heap(c, heap, cat)           ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c)                       ({})
+#define dbg_dump_index(c)                     ({})
+#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
 #define dbg_check_synced_i_size(inode)             0
 #define dbg_check_dir_size(c, dir)                 0
 #define dbg_check_tnc(c, x)                        0
 #define dbg_check_idx_size(c, idx_size)            0
 #define dbg_check_filesystem(c)                    0
 #define dbg_check_heap(c, heap, cat, add_pos)      ({})
 #define dbg_check_lprops(c)                        0
 #define dbg_check_lpt_nodes(c, cnode, row, col)    0
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
 #define dbg_failure_mode_registration(c)           ({})
 #define dbg_failure_mode_deregistration(c)         ({})
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e90374be7d3..5c96f1fb701 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
        }
        inode->i_ino = ++c->highest_inum;
-        inode->i_generation = ++c->vfs_gen;
        /*
         * The creation sequence number remains with this inode for its
         * lifetime. All nodes for this inode have a greater sequence number,
@@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
        err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
        if (err) {
-                /*
+                if (err == -ENOENT) {
-                 * Do not hash the direntry if parent 'i_nlink' is zero, because
-                 * this has side-effects - '->delete_inode()' call will not be
-                 * called for the parent orphan inode, because 'd_count' of its
-                 * direntry will stay 1 (it'll be negative direntry I guess)
-                 * and prevent 'iput_final()' until the dentry is destroyed due
-                 * to unmount or memory pressure.
-                 */
-                if (err == -ENOENT && dir->i_nlink != 0) {
                        dbg_gen("not found");
                        goto done;
                }
@@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        struct ubifs_inode *dir_ui = ubifs_inode(dir);
        int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
        struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
-                                        .dirtied_ino_d = ui->data_len };
+                                .dirtied_ino_d = ALIGN(ui->data_len, 8) };
        /*
         * Budget request settings: new direntry, changing the target inode,
@@ -727,8 +718,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct ubifs_inode *dir_ui = ubifs_inode(dir);
        struct ubifs_info *c = dir->i_sb->s_fs_info;
        int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
-        struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+        struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
-                                        .dirtied_ino_d = 1 };
        /*
         * Budget request settings: new inode, new direntry and changing parent
@@ -789,7 +779,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
        int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
        int err, devlen = 0;
        struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-                                        .new_ino_d = devlen, .dirtied_ino = 1 };
+                                        .new_ino_d = ALIGN(devlen, 8),
+                                        .dirtied_ino = 1 };
        /*
         * Budget request settings: new inode, new direntry and changing parent
@@ -863,7 +854,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
        int err, len = strlen(symname);
        int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
        struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-                                        .new_ino_d = len, .dirtied_ino = 1 };
+                                        .new_ino_d = ALIGN(len, 8),
+                                        .dirtied_ino = 1 };
        /*
         * Budget request settings: new inode, new direntry and changing parent
@@ -1012,7 +1004,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
                                        .dirtied_ino = 3 };
        struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
-                                .dirtied_ino_d = old_inode_ui->data_len };
+                        .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
        struct timespec time;
        /*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8565e586e53..4071d1cae29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -890,7 +890,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
        loff_t new_size = attr->ia_size;
        struct ubifs_inode *ui = ubifs_inode(inode);
        struct ubifs_budget_req req = { .dirtied_ino = 1,
-                                        .dirtied_ino_d = ui->data_len };
+                                .dirtied_ino_d = ALIGN(ui->data_len, 8) };
        err = ubifs_budget_space(c, &req);
        if (err)
@@ -941,7 +941,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
        struct inode *inode = dentry->d_inode;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
-        dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid);
+        dbg_gen("ino %lu, mode %#x, ia_valid %#x",
+                inode->i_ino, inode->i_mode, attr->ia_valid);
        err = inode_change_ok(inode, attr);
        if (err)
                return err;
@@ -1051,7 +1052,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
        if (mctime_update_needed(inode, &now)) {
                int err, release;
                struct ubifs_budget_req req = { .dirtied_ino = 1,
-                                                .dirtied_ino_d = ui->data_len };
+                                .dirtied_ino_d = ALIGN(ui->data_len, 8) };
                err = ubifs_budget_space(c, &req);
                if (err)
@@ -1270,6 +1271,7 @@ struct file_operations ubifs_file_operations = {
        .fsync          = ubifs_fsync,
        .unlocked_ioctl = ubifs_ioctl,
        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ubifs_compat_ioctl,
 #endif
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 10394c54836..adee7b5ddea 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -290,9 +290,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
                idx_lp = idx_heap->arr[0];
                sum = idx_lp->free + idx_lp->dirty;
                /*
-                 * Since we reserve twice as more space for the index than it
+                 * Since we reserve thrice as much space for the index as it
                 * actually takes, it does not make sense to pick indexing LEBs
-                 * with less than half LEB of dirty space.
+                 * with less than, say, half LEB of dirty space. Maybe half is
+                 * not the optimal boundary - this should be tested and
+                 * checked. This boundary should determine how much we use
+                 * in-the-gaps to consolidate the index compared to how much
+                 * we use the garbage collector to consolidate it. The "half"
+                 * criterion just feels fine.
                 */
                if (sum < min_space || sum < c->half_leb_size)
                        idx_lp = NULL;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3374f91b670..054363f2b20 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -54,6 +54,20 @@
 #include "ubifs.h"
 /**
+ * ubifs_ro_mode - switch UBIFS to read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+        if (!c->ro_media) {
+                c->ro_media = 1;
+                ubifs_warn("switched to read-only mode, error %d", err);
+                dbg_dump_stack();
+        }
+}
+/**
 * ubifs_check_node - check node.
 * @c: UBIFS file-system description object
 * @buf: node to check
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 283155abe5f..22993f867d1 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -447,13 +447,11 @@ static int get_dent_type(int mode)
 * @ino: buffer in which to pack inode node
 * @inode: inode to pack
 * @last: indicates the last node of the group
- * @last_reference: non-zero if this is a deletion inode
 */
 static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
-                       const struct inode *inode, int last,
+                       const struct inode *inode, int last)
-                       int last_reference)
 {
-        int data_len = 0;
+        int data_len = 0, last_reference = !inode->i_nlink;
        struct ubifs_inode *ui = ubifs_inode(inode);
        ino->ch.node_type = UBIFS_INO_NODE;
@@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
        ubifs_prep_grp_node(c, dent, dlen, 0);
        ino = (void *)dent + aligned_dlen;
-        pack_inode(c, ino, inode, 0, last_reference);
+        pack_inode(c, ino, inode, 0);
        ino = (void *)ino + aligned_ilen;
-        pack_inode(c, ino, dir, 1, 0);
+        pack_inode(c, ino, dir, 1);
        if (last_reference) {
                err = ubifs_add_orphan(c, inode->i_ino);
@@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
                        release_head(c, BASEHD);
                        goto out_finish;
                }
+                ui->del_cmtno = c->cmt_no;
        }
        err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -750,30 +749,25 @@ out_free:
 * ubifs_jnl_write_inode - flush inode to the journal.
 * @c: UBIFS file-system description object
 * @inode: inode to flush
- * @deletion: inode has been deleted
 *
 * This function writes inode @inode to the journal. If the inode is
 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
 * success and a negative error code in case of failure.
 */
-int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
-                          int deletion)
 {
-        int err, len, lnum, offs, sync = 0;
+        int err, lnum, offs;
        struct ubifs_ino_node *ino;
        struct ubifs_inode *ui = ubifs_inode(inode);
+        int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
-        dbg_jnl("ino %lu%s", inode->i_ino,
+        dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
-                deletion ? " (last reference)" : "");
-        if (deletion)
-                ubifs_assert(inode->i_nlink == 0);
-        len = UBIFS_INO_NODE_SZ;
        /*
         * If the inode is being deleted, do not write the attached data. No
         * need to synchronize the write-buffer either.
         */
-        if (!deletion) {
+        if (!last_reference) {
                len += ui->data_len;
                sync = IS_SYNC(inode);
        }
@@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
        if (err)
                goto out_free;
-        pack_inode(c, ino, inode, 1, deletion);
+        pack_inode(c, ino, inode, 1);
        err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
        if (err)
                goto out_release;
@@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
                                          inode->i_ino);
        release_head(c, BASEHD);
-        if (deletion) {
+        if (last_reference) {
                err = ubifs_tnc_remove_ino(c, inode->i_ino);
                if (err)
                        goto out_ro;
@@ -828,6 +822,65 @@ out_free:
 }
 /**
+ * ubifs_jnl_delete_inode - delete an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to delete
+ *
+ * This function deletes inode @inode which includes removing it from orphans,
+ * deleting it from TNC and, in some cases, writing a deletion inode to the
+ * journal.
+ *
+ * When regular file inodes are unlinked or a directory inode is removed, the
+ * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
+ * direntry to the media, and adds the inode to orphans. After this, when the
+ * last reference to this inode has been dropped, this function is called. In
+ * general, it has to write one more deletion inode to the media, because if
+ * a commit happened between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
+ * anymore, and in fact it might not be on the flash anymore, because it might
+ * have been garbage-collected already. And for optimization reasons UBIFS does
+ * not read the orphan area if it has been unmounted cleanly, so it would have
+ * no indication in the journal that there is a deleted inode which has to be
+ * removed from TNC.
+ *
+ * However, if there was no commit between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
+ * inode to the media for the second time. And this is quite a typical case.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
+{
+        int err;
+        struct ubifs_inode *ui = ubifs_inode(inode);
+        ubifs_assert(inode->i_nlink == 0);
+        if (ui->del_cmtno != c->cmt_no)
+                /* A commit happened for sure */
+                return ubifs_jnl_write_inode(c, inode);
+        down_read(&c->commit_sem);
+        /*
+         * Check commit number again, because the first test has been done
+         * without @c->commit_sem, so a commit might have happened.
+         */
+        if (ui->del_cmtno != c->cmt_no) {
+                up_read(&c->commit_sem);
+                return ubifs_jnl_write_inode(c, inode);
+        }
+        err = ubifs_tnc_remove_ino(c, inode->i_ino);
+        if (err)
+                ubifs_ro_mode(c, err);
+        else
+                ubifs_delete_orphan(c, inode->i_ino);
+        up_read(&c->commit_sem);
+        return err;
+}
+/**
 * ubifs_jnl_rename - rename a directory entry.
 * @c: UBIFS file-system description object
 * @old_dir: parent inode of directory entry to rename
@@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
        p = (void *)dent2 + aligned_dlen2;
        if (new_inode) {
-                pack_inode(c, p, new_inode, 0, last_reference);
+                pack_inode(c, p, new_inode, 0);
                p += ALIGN(ilen, 8);
        }
        if (!move)
-                pack_inode(c, p, old_dir, 1, 0);
+                pack_inode(c, p, old_dir, 1);
        else {
-                pack_inode(c, p, old_dir, 0, 0);
+                pack_inode(c, p, old_dir, 0);
                p += ALIGN(plen, 8);
-                pack_inode(c, p, new_dir, 1, 0);
+                pack_inode(c, p, new_dir, 1);
        }
        if (last_reference) {
@@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
                        release_head(c, BASEHD);
                        goto out_finish;
                }
+                new_ui->del_cmtno = c->cmt_no;
        }
        err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
        if (err)
                goto out_free;
-        pack_inode(c, ino, inode, 0, 0);
+        pack_inode(c, ino, inode, 0);
        ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
        if (dlen)
                ubifs_prep_grp_node(c, dn, dlen, 1);
@@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
        ubifs_prep_grp_node(c, xent, xlen, 0);
        ino = (void *)xent + aligned_xlen;
-        pack_inode(c, ino, inode, 0, 1);
+        pack_inode(c, ino, inode, 0);
        ino = (void *)ino + UBIFS_INO_NODE_SZ;
-        pack_inode(c, ino, host, 1, 0);
+        pack_inode(c, ino, host, 1);
        err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
        if (!sync && !err)
@@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
                           const struct inode *host)
 {
        int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
-        struct ubifs_inode *host_ui = ubifs_inode(inode);
+        struct ubifs_inode *host_ui = ubifs_inode(host);
        struct ubifs_ino_node *ino;
        union ubifs_key key;
        int sync = IS_DIRSYNC(host);
@@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
        if (err)
                goto out_free;
-        pack_inode(c, ino, host, 0, 0);
+        pack_inode(c, ino, host, 0);
-        pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0);
+        pack_inode(c, (void *)ino + aligned_len1, inode, 1);
        err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
        if (!sync && !err) {
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36857b9ed59..3e0aa736755 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
        return 0;
 out_unlock:
+        if (err != -EAGAIN)
+                ubifs_ro_mode(c, err);
        mutex_unlock(&c->log_mutex);
        kfree(ref);
        kfree(bud);
@@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
                return -ENOMEM;
        cs->ch.node_type = UBIFS_CS_NODE;
-        cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
+        cs->cmt_no = cpu_to_le64(c->cmt_no);
        ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
        /*
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4beccfc256d..87dabf9fe74 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
 }
 /**
- * ubifs_ro_mode - switch UBIFS to read read-only mode.
- * @c: UBIFS file-system description object
- * @err: error code which is the reason of switching to R/O mode
- */
-static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
-{
-        if (!c->ro_media) {
-                c->ro_media = 1;
-                ubifs_warn("switched to read-only mode, error %d", err);
-                dbg_dump_stack();
-        }
-}
-/**
 * ubifs_compr_present - check if compressor was compiled in.
 * @compr_type: compressor type to check
 *
@@ -322,7 +308,7 @@ static inline long long ubifs_reported_space(const struct ubifs_info *c,
 {
        int divisor, factor;
-        divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
+        divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
        factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
        do_div(free, divisor);
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 3afeb9242c6..02d3462f4d3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
        c->cmt_orphans -= cnt;
        spin_unlock(&c->orphan_lock);
        if (c->cmt_orphans)
-                orph->cmt_no = cpu_to_le64(c->cmt_no + 1);
+                orph->cmt_no = cpu_to_le64(c->cmt_no);
        else
                /* Mark the last node of the commit */
-                orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
+                orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
        ubifs_assert(c->ohead_offs + len <= c->leb_size);
        ubifs_assert(c->ohead_lnum >= c->orph_first);
        ubifs_assert(c->ohead_lnum <= c->orph_last);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ca1e2d4e03c..f71e6b8822c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -30,7 +30,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
-#include <linux/random.h>
 #include <linux/kthread.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
@@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
        if (err)
                goto out_invalid;
-        /* Disable readahead */
+        /* Disable read-ahead */
        inode->i_mapping->backing_dev_info = &c->bdi;
        switch (inode->i_mode & S_IFMT) {
@@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode)
 */
 static int ubifs_write_inode(struct inode *inode, int wait)
 {
-        int err;
+        int err = 0;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
        struct ubifs_inode *ui = ubifs_inode(inode);
@@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait)
                return 0;
        }
-        dbg_gen("inode %lu", inode->i_ino);
+        /*
-        err = ubifs_jnl_write_inode(c, inode, 0);
+         * As an optimization, do not write orphan inodes to the media just
-        if (err)
+         * because this is not needed.
-                ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
+         */
+        dbg_gen("inode %lu, mode %#x, nlink %u",
+                inode->i_ino, (int)inode->i_mode, inode->i_nlink);
+        if (inode->i_nlink) {
+                err = ubifs_jnl_write_inode(c, inode);
+                if (err)
+                        ubifs_err("can't write inode %lu, error %d",
+                                  inode->i_ino, err);
+        }
        ui->dirty = 0;
        mutex_unlock(&ui->ui_mutex);
@@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode)
 {
        int err;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
+        struct ubifs_inode *ui = ubifs_inode(inode);
-        if (ubifs_inode(inode)->xattr)
+        if (ui->xattr)
                /*
                 * Extended attribute inode deletions are fully handled in
                 * 'ubifs_removexattr()'. These inodes are special and have
@@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode)
                 */
                goto out;
-        dbg_gen("inode %lu", inode->i_ino);
+        dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
        ubifs_assert(!atomic_read(&inode->i_count));
        ubifs_assert(inode->i_nlink == 0);
@@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode)
        if (is_bad_inode(inode))
                goto out;
-        ubifs_inode(inode)->ui_size = inode->i_size = 0;
+        ui->ui_size = inode->i_size = 0;
-        err = ubifs_jnl_write_inode(c, inode, 1);
+        err = ubifs_jnl_delete_inode(c, inode);
        if (err)
                /*
                 * Worst case we have a lost orphan inode wasting space, so a
-                 * simple error message is ok here.
+                 * simple error message is OK here.
                 */
-                ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
+                ubifs_err("can't delete inode %lu, error %d",
+                          inode->i_ino, err);
 out:
+        if (ui->dirty)
+                ubifs_release_dirty_inode_budget(c, ui);
        clear_inode(inode);
 }
@@ -1122,8 +1134,8 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                goto out_infos;
-        ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num,
+        ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
-                  c->vi.vol_id);
+                  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
        if (mounted_read_only)
                ubifs_msg("mounted read-only");
        x = (long long)c->main_lebs * c->leb_size;
@@ -1469,6 +1481,7 @@ static void ubifs_put_super(struct super_block *sb)
         */
        ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
        ubifs_assert(c->budg_idx_growth == 0);
+        ubifs_assert(c->budg_dd_growth == 0);
        ubifs_assert(c->budg_data_growth == 0);
        /*
@@ -1657,7 +1670,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
        INIT_LIST_HEAD(&c->orph_new);
        c->highest_inum = UBIFS_FIRST_INO;
-        get_random_bytes(&c->vfs_gen, sizeof(int));
        c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
        ubi_get_volume_info(ubi, &c->vi);
@@ -1671,10 +1683,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
        }
        /*
-         * UBIFS provids 'backing_dev_info' in order to disable readahead. For
+         * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
         * UBIFS, I/O is not deferred, it is done immediately in readpage,
         * which means the user would have to wait not just for their own I/O
-         * but the readahead I/O as well i.e. completely pointless.
+         * but the read-ahead I/O as well i.e. completely pointless.
         *
         * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
         */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8117e65ba2e..8ac76b1c2d5 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
                written = layout_leb_in_gaps(c, p);
                if (written < 0) {
                        err = written;
-                        if (err == -ENOSPC) {
+                        if (err != -ENOSPC) {
-                                if (!dbg_force_in_the_gaps_enabled) {
+                                kfree(c->gap_lebs);
-                                        /*
+                                c->gap_lebs = NULL;
-                                         * Do not print scary warnings if the
+                                return err;
-                                         * debugging option which forces
-                                         * in-the-gaps is enabled.
-                                         */
-                                        ubifs_err("out of space");
-                                        spin_lock(&c->space_lock);
-                                        dbg_dump_budg(c);
-                                        spin_unlock(&c->space_lock);
-                                        dbg_dump_lprops(c);
-                                }
-                                /* Try to commit anyway */
-                                err = 0;
-                                break;
                        }
-                        kfree(c->gap_lebs);
+                        if (!dbg_force_in_the_gaps_enabled) {
-                        c->gap_lebs = NULL;
+                                /*
-                        return err;
+                                 * Do not print scary warnings if the debugging
+                                 * option which forces in-the-gaps is enabled.
+                                 */
+                                ubifs_err("out of space");
+                                spin_lock(&c->space_lock);
+                                dbg_dump_budg(c);
+                                spin_unlock(&c->space_lock);
+                                dbg_dump_lprops(c);
+                        }
+                        /* Try to commit anyway */
+                        err = 0;
+                        break;
                }
                p++;
                cnt -= written;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0cc7da9bed4..bd2121f3426 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -228,10 +228,10 @@ enum {
 /* Minimum number of orphan area logical eraseblocks */
 #define UBIFS_MIN_ORPH_LEBS 1
 /*
- * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1
+ * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
 * for GC, 1 for deletions, and at least 1 for committed data).
 */
-#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5)
+#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
 /* Minimum number of logical eraseblocks */
 #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e4f89f27182..d7f706f7a30 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -20,8 +20,6 @@
 *          Adrian Hunter
 */
-/* Implementation version 0.7 */
 #ifndef __UBIFS_H__
 #define __UBIFS_H__
@@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb {
 * struct ubifs_inode - UBIFS in-memory inode description.
 * @vfs_inode: VFS inode description object
 * @creat_sqnum: sequence number at time of creation
+ * @del_cmtno: commit number corresponding to the time the inode was deleted,
+ *             protected by @c->commit_sem;
 * @xattr_size: summarized size of all extended attributes in bytes
 * @xattr_cnt: count of extended attributes this inode has
 * @xattr_names: sum of lengths of all extended attribute names belonging to
@@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb {
 struct ubifs_inode {
        struct inode vfs_inode;
        unsigned long long creat_sqnum;
+        unsigned long long del_cmtno;
        unsigned int xattr_size;
        unsigned int xattr_cnt;
        unsigned int xattr_names;
@@ -779,7 +780,7 @@ struct ubifs_compressor {
 /**
 * struct ubifs_budget_req - budget requirements of an operation.
 *
- * @fast: non-zero if the budgeting should try to aquire budget quickly and
+ * @fast: non-zero if the budgeting should try to acquire budget quickly and
 *        should not try to call write-back
 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
 *               have to be re-calculated
@@ -805,21 +806,31 @@ struct ubifs_compressor {
 * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d
 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
 * dirty by the re-name operation.
+ *
+ * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to
+ * make sure the amount of inode data which contribute to @new_ino_d and
+ * @dirtied_ino_d fields are aligned.
 */
 struct ubifs_budget_req {
        unsigned int fast:1;
        unsigned int recalculate:1;
+#ifndef UBIFS_DEBUG
        unsigned int new_page:1;
        unsigned int dirtied_page:1;
        unsigned int new_dent:1;
        unsigned int mod_dent:1;
        unsigned int new_ino:1;
        unsigned int new_ino_d:13;
-#ifndef UBIFS_DEBUG
        unsigned int dirtied_ino:4;
        unsigned int dirtied_ino_d:15;
 #else
        /* Not bit-fields to check for overflows */
+        unsigned int new_page;
+        unsigned int dirtied_page;
+        unsigned int new_dent;
+        unsigned int mod_dent;
+        unsigned int new_ino;
+        unsigned int new_ino_d;
        unsigned int dirtied_ino;
        unsigned int dirtied_ino_d;
 #endif
@@ -860,13 +871,13 @@ struct ubifs_mount_opts {
 * struct ubifs_info - UBIFS file-system description data structure
 * (per-superblock).
 * @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable readahead
+ * @bdi: backing device info object to make VFS happy and disable read-ahead
 *
 * @highest_inum: highest used inode number
- * @vfs_gen: VFS inode generation counter
 * @max_sqnum: current global sequence number
- * @cmt_no: commit number (last successfully completed commit)
+ * @cmt_no: commit number of the last successfully completed commit, protected
- * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters
+ *          by @commit_sem
+ * @cnt_lock: protects @highest_inum and @max_sqnum counters
 * @fmt_version: UBIFS on-flash format version
 * @uuid: UUID from super block
 *
@@ -1103,7 +1114,6 @@ struct ubifs_info {
        struct backing_dev_info bdi;
        ino_t highest_inum;
-        unsigned int vfs_gen;
        unsigned long long max_sqnum;
        unsigned long long cmt_no;
        spinlock_t cnt_lock;
@@ -1346,6 +1356,7 @@ extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 /* io.c */
+void ubifs_ro_mode(struct ubifs_info *c, int err);
 int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
 int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
                           int dtype);
@@ -1399,8 +1410,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
                     int deletion, int xent);
 int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
                         const union ubifs_key *key, const void *buf, int len);
-int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
-                          int last_reference);
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
 int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
                     const struct dentry *old_dentry,
                     const struct inode *new_dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 1388a078e1a..649bec78b64 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -61,7 +61,7 @@
 /*
 * Limit the number of extended attributes per inode so that the total size
- * (xattr_size) is guaranteeded to fit in an 'unsigned int'.
+ * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.
 */
 #define MAX_XATTRS_PER_INODE 65535
@@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
        struct inode *inode;
        struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
        struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-                                        .new_ino_d = size, .dirtied_ino = 1,
+                                .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
-                                        .dirtied_ino_d = host_ui->data_len};
+                                .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
        if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
                return -ENOSPC;
        /*
         * Linux limits the maximum size of the extended attribute names list
-         * to %XATTR_LIST_MAX. This means we should not allow creating more*
+         * to %XATTR_LIST_MAX. This means we should not allow creating more
         * extended attributes if the name list becomes larger. This limitation
         * is artificial for UBIFS, though.
         */
@@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
                goto out_budg;
        }
-        mutex_lock(&host_ui->ui_mutex);
        /* Re-define all operations to be "nothing" */
        inode->i_mapping->a_ops = &none_address_operations;
        inode->i_op = &none_inode_operations;
@@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
        ui->data = kmalloc(size, GFP_NOFS);
        if (!ui->data) {
                err = -ENOMEM;
-                goto out_unlock;
+                goto out_free;
        }
        memcpy(ui->data, value, size);
+        inode->i_size = ui->ui_size = size;
+        ui->data_len = size;
+        mutex_lock(&host_ui->ui_mutex);
        host->i_ctime = ubifs_current_time(host);
        host_ui->xattr_cnt += 1;
        host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
        host_ui->xattr_size += CALC_XATTR_BYTES(size);
        host_ui->xattr_names += nm->len;
-        /*
-         * We do not use i_size_write() because nobody can race with us as we
-         * are holding host @host->i_mutex - every xattr operation for this
-         * inode is serialized by it.
-         */
-        inode->i_size = ui->ui_size = size;
-        ui->data_len = size;
        err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
        if (err)
                goto out_cancel;
@@ -172,8 +167,8 @@ out_cancel:
        host_ui->xattr_cnt -= 1;
        host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
        host_ui->xattr_size -= CALC_XATTR_BYTES(size);
-out_unlock:
        mutex_unlock(&host_ui->ui_mutex);
+out_free:
        make_bad_inode(inode);
        iput(inode);
 out_budg:
@@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
        struct ubifs_inode *host_ui = ubifs_inode(host);
        struct ubifs_inode *ui = ubifs_inode(inode);
        struct ubifs_budget_req req = { .dirtied_ino = 2,
-                                .dirtied_ino_d = size + host_ui->data_len };
+                .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
        ubifs_assert(ui->data_len == inode->i_size);
        err = ubifs_budget_space(c, &req);
        if (err)
                return err;
-        mutex_lock(&host_ui->ui_mutex);
-        host->i_ctime = ubifs_current_time(host);
-        host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
-        host_ui->xattr_size += CALC_XATTR_BYTES(size);
        kfree(ui->data);
        ui->data = kmalloc(size, GFP_NOFS);
        if (!ui->data) {
                err = -ENOMEM;
-                goto out_unlock;
+                goto out_free;
        }
        memcpy(ui->data, value, size);
        inode->i_size = ui->ui_size = size;
        ui->data_len = size;
+        mutex_lock(&host_ui->ui_mutex);
+        host->i_ctime = ubifs_current_time(host);
+        host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+        host_ui->xattr_size += CALC_XATTR_BYTES(size);
        /*
         * It is important to write the host inode after the xattr inode
         * because if the host inode gets synchronized (via 'fsync()'), then
@@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 out_cancel:
        host_ui->xattr_size -= CALC_XATTR_BYTES(size);
        host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
-        make_bad_inode(inode);
-out_unlock:
        mutex_unlock(&host_ui->ui_mutex);
+        make_bad_inode(inode);
+out_free:
        ubifs_release_budget(c, &req);
        return err;
 }
@@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
        dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
                host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+        ubifs_assert(mutex_is_locked(&host->i_mutex));
        if (size > UBIFS_MAX_INO_DATA)
                return -ERANGE;
@@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
        if (!xent)
                return -ENOMEM;
-        mutex_lock(&host->i_mutex);
        xent_key_init(c, &key, host->i_ino, &nm);
        err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
        if (err) {
@@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 out_iput:
        iput(inode);
 out_unlock:
-        mutex_unlock(&host->i_mutex);
        kfree(xent);
        return err;
 }
@@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
                return -ERANGE;
        lowest_xent_key(c, &key, host->i_ino);
-        mutex_lock(&host->i_mutex);
        while (1) {
                int type;
@@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
                pxent = xent;
                key_read(c, &xent->key, &key);
        }
-        mutex_unlock(&host->i_mutex);
        kfree(pxent);
        if (err != -ENOENT) {
@@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
        int err;
        struct ubifs_inode *host_ui = ubifs_inode(host);
        struct ubifs_inode *ui = ubifs_inode(inode);
-        struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1,
+        struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
-                                        .dirtied_ino_d = host_ui->data_len };
+                                .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
        ubifs_assert(ui->data_len == inode->i_size);
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 3abe7e9ceb3..00000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <linux/semaphore.h>
-#include <asm/atomic.h>
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-typedef struct semaphore sema_t;
-#define initnsema(sp, val, name)        sema_init(sp, val)
-#define psema(sp, b)                    down(sp)
-#define vsema(sp)                       up(sp)
-#define freesema(sema)                  do { } while (0)
-static inline int issemalocked(sema_t *sp)
-{
-        return down_trylock(sp) || (up(sp), 0);
-}
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-static inline int cpsema(sema_t *sp)
-{
-        return down_trylock(sp) ? 0 : 1;
-}
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index fa47e43b8b4..f42f80a3b1f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -73,7 +73,6 @@ xfs_page_trace(
        unsigned long   pgoff)
 {
        xfs_inode_t     *ip;
-        bhv_vnode_t     *vp = vn_from_inode(inode);
        loff_t          isize = i_size_read(inode);
        loff_t          offset = page_offset(page);
        int             delalloc = -1, unmapped = -1, unwritten = -1;
@@ -81,7 +80,7 @@ xfs_page_trace(
        if (page_has_buffers(page))
                xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-        ip = xfs_vtoi(vp);
+        ip = XFS_I(inode);
        if (!ip->i_rwtrace)
                return;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9cc8f021309..986061ae1b9 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -58,7 +58,7 @@ xfs_buf_trace(
                bp, id,
                (void *)(unsigned long)bp->b_flags,
                (void *)(unsigned long)bp->b_hold.counter,
-                (void *)(unsigned long)bp->b_sema.count.counter,
+                (void *)(unsigned long)bp->b_sema.count,
                (void *)current,
                data, ra,
                (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
@@ -253,7 +253,7 @@ _xfs_buf_initialize(
        memset(bp, 0, sizeof(xfs_buf_t));
        atomic_set(&bp->b_hold, 1);
-        init_MUTEX_LOCKED(&bp->b_iodonesema);
+        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
        INIT_LIST_HEAD(&bp->b_hash_list);
        init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
@@ -838,6 +838,7 @@ xfs_buf_rele(
                return;
        }
+        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
@@ -851,11 +852,6 @@ xfs_buf_rele(
                        spin_unlock(&hash->bh_lock);
                        xfs_buf_free(bp);
                }
-        } else {
-                /*
-                 * Catch reference count leaks
-                 */
-                ASSERT(atomic_read(&bp->b_hold) >= 0);
        }
 }
@@ -1037,7 +1033,7 @@ xfs_buf_ioend(
                        xfs_buf_iodone_work(&bp->b_iodone_work);
                }
        } else {
-                up(&bp->b_iodonesema);
+                complete(&bp->b_iowait);
        }
 }
@@ -1275,7 +1271,7 @@ xfs_buf_iowait(
        XB_TRACE(bp, "iowait", 0);
        if (atomic_read(&bp->b_io_remaining))
                blk_run_address_space(bp->b_target->bt_mapping);
-        down(&bp->b_iodonesema);
+        wait_for_completion(&bp->b_iowait);
        XB_TRACE(bp, "iowaited", (long)bp->b_error);
        return bp->b_error;
 }
@@ -1799,7 +1795,7 @@ int __init
 xfs_buf_init(void)
 {
 #ifdef XFS_BUF_TRACE
-        xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
+        xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
 #endif
        xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 29d1d4adc07..fe010995665 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -157,7 +157,7 @@ typedef struct xfs_buf {
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
        xfs_buf_relse_t         b_relse;        /* releasing function */
        xfs_buf_bdstrat_t       b_strat;        /* pre-write function */
-        struct semaphore        b_iodonesema;   /* Semaphore for I/O waiters */
+        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
        void                    *b_fspriv3;
@@ -352,7 +352,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_CPSEMA(bp)      (xfs_buf_cond_lock(bp) == 0)
 #define XFS_BUF_VSEMA(bp)       xfs_buf_unlock(bp)
 #define XFS_BUF_PSEMA(bp,x)     xfs_buf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
+#define XFS_BUF_FINISH_IOWAIT(bp)       complete(&bp->b_iowait);
 #define XFS_BUF_SET_TARGET(bp, target)  ((bp)->b_target = (target))
 #define XFS_BUF_TARGET(bp)              ((bp)->b_target)
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 987fe84f7b1..24fd598af84 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -139,7 +139,7 @@ xfs_nfs_get_inode(
        }
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        return ip->i_vnode;
+        return VFS_I(ip);
 }
 STATIC struct dentry *
@@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
        if (!inode)
                return NULL;
        if (IS_ERR(inode))
-                return ERR_PTR(PTR_ERR(inode));
+                return ERR_CAST(inode);
        result = d_alloc_anon(inode);
        if (!result) {
                iput(inode);
@@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
        if (!inode)
                return NULL;
        if (IS_ERR(inode))
-                return ERR_PTR(PTR_ERR(inode));
+                return ERR_CAST(inode);
        result = d_alloc_anon(inode);
        if (!result) {
                iput(inode);
@@ -219,9 +219,9 @@ xfs_fs_get_parent(
        if (unlikely(error))
                return ERR_PTR(-error);
-        parent = d_alloc_anon(cip->i_vnode);
+        parent = d_alloc_anon(VFS_I(cip));
        if (unlikely(!parent)) {
-                iput(cip->i_vnode);
+                iput(VFS_I(cip));
                return ERR_PTR(-ENOMEM);
        }
        return parent;
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1eefe61f0e1..36caa6d957d 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -31,7 +31,7 @@ xfs_tosspages(
        xfs_off_t       last,
        int             fiopt)
 {
-        struct address_space *mapping = ip->i_vnode->i_mapping;
+        struct address_space *mapping = VFS_I(ip)->i_mapping;
        if (mapping->nrpages)
                truncate_inode_pages(mapping, first);
@@ -44,7 +44,7 @@ xfs_flushinval_pages(
        xfs_off_t       last,
        int             fiopt)
 {
-        struct address_space *mapping = ip->i_vnode->i_mapping;
+        struct address_space *mapping = VFS_I(ip)->i_mapping;
        int             ret = 0;
        if (mapping->nrpages) {
@@ -64,7 +64,7 @@ xfs_flush_pages(
        uint64_t        flags,
        int             fiopt)
 {
-        struct address_space *mapping = ip->i_vnode->i_mapping;
+        struct address_space *mapping = VFS_I(ip)->i_mapping;
        int             ret = 0;
        int             ret2;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index acb978d9d08..48799ba7e3e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq(
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        *inode = XFS_ITOV(ip);
+        *inode = VFS_I(ip);
        return 0;
 }
@@ -927,7 +927,7 @@ STATIC void
 xfs_diflags_to_linux(
        struct xfs_inode        *ip)
 {
-        struct inode            *inode = XFS_ITOV(ip);
+        struct inode            *inode = VFS_I(ip);
        unsigned int            xflags = xfs_ip2xflags(ip);
        if (xflags & XFS_XFLAG_IMMUTABLE)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e88f5102808..91bcd979242 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,7 +62,7 @@ void
 xfs_synchronize_atime(
        xfs_inode_t     *ip)
 {
-        struct inode    *inode = ip->i_vnode;
+        struct inode    *inode = VFS_I(ip);
        if (inode) {
                ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
@@ -79,7 +79,7 @@ void
 xfs_mark_inode_dirty_sync(
        xfs_inode_t     *ip)
 {
-        struct inode    *inode = ip->i_vnode;
+        struct inode    *inode = VFS_I(ip);
        if (inode)
                mark_inode_dirty_sync(inode);
@@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync(
 * Change the requested timestamp in the given inode.
 * We don't lock across timestamp updates, and we don't log them but
 * we do record the fact that there is dirty information in core.
- *
- * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
- *              with XFS_ICHGTIME_ACC to be sure that access time
- *              update will take.  Calling first with XFS_ICHGTIME_ACC
- *              and then XFS_ICHGTIME_MOD may fail to modify the access
- *              timestamp if the filesystem is mounted noacctm.
 */
 void
 xfs_ichgtime(
        xfs_inode_t     *ip,
        int             flags)
 {
-        struct inode    *inode = vn_to_inode(XFS_ITOV(ip));
+        struct inode    *inode = VFS_I(ip);
        timespec_t      tv;
+        int             sync_it = 0;
+        tv = current_fs_time(inode->i_sb);
-        nanotime(&tv);
+        if ((flags & XFS_ICHGTIME_MOD) &&
-        if (flags & XFS_ICHGTIME_MOD) {
+            !timespec_equal(&inode->i_mtime, &tv)) {
                inode->i_mtime = tv;
                ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
                ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+                sync_it = 1;
        }
-        if (flags & XFS_ICHGTIME_ACC) {
+        if ((flags & XFS_ICHGTIME_CHG) &&
-                inode->i_atime = tv;
+            !timespec_equal(&inode->i_ctime, &tv)) {
-                ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
-                ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
-        }
-        if (flags & XFS_ICHGTIME_CHG) {
                inode->i_ctime = tv;
                ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
                ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
+                sync_it = 1;
        }
        /*
@@ -130,55 +125,11 @@ xfs_ichgtime(
         * ensure that the compiler does not reorder the update
         * of i_update_core above the timestamp updates above.
         */
-        SYNCHRONIZE();
+        if (sync_it) {
-        ip->i_update_core = 1;
+                SYNCHRONIZE();
-        if (!(inode->i_state & I_NEW))
+                ip->i_update_core = 1;
                mark_inode_dirty_sync(inode);
-}
-/*
- * Variant on the above which avoids querying the system clock
- * in situations where we know the Linux inode timestamps have
- * just been updated (and so we can update our inode cheaply).
- */
-void
-xfs_ichgtime_fast(
-        xfs_inode_t     *ip,
-        struct inode    *inode,
-        int             flags)
-{
-        timespec_t      *tvp;
-        /*
-         * Atime updates for read() & friends are handled lazily now, and
-         * explicit updates must go through xfs_ichgtime()
-         */
-        ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-        if (flags & XFS_ICHGTIME_MOD) {
-                tvp = &inode->i_mtime;
-                ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
-                ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
        }
-        if (flags & XFS_ICHGTIME_CHG) {
-                tvp = &inode->i_ctime;
-                ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
-                ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
-        }
-        /*
-         * We update the i_update_core field _after_ changing
-         * the timestamps in order to coordinate properly with
-         * xfs_iflush() so that we don't lose timestamp updates.
-         * This keeps us from having to hold the inode lock
-         * while doing this.  We use the SYNCHRONIZE macro to
-         * ensure that the compiler does not reorder the update
-         * of i_update_core above the timestamp updates above.
-         */
-        SYNCHRONIZE();
-        ip->i_update_core = 1;
-        if (!(inode->i_state & I_NEW))
-                mark_inode_dirty_sync(inode);
 }
 /*
@@ -299,7 +250,7 @@ xfs_vn_mknod(
        if (unlikely(error))
                goto out_free_acl;
-        inode = ip->i_vnode;
+        inode = VFS_I(ip);
        error = xfs_init_security(inode, dir);
        if (unlikely(error))
@@ -366,7 +317,7 @@ xfs_vn_lookup(
                return NULL;
        }
-        return d_splice_alias(cip->i_vnode, dentry);
+        return d_splice_alias(VFS_I(cip), dentry);
 }
 STATIC struct dentry *
@@ -399,12 +350,12 @@ xfs_vn_ci_lookup(
        /* if exact match, just splice and exit */
        if (!ci_name.name)
-                return d_splice_alias(ip->i_vnode, dentry);
+                return d_splice_alias(VFS_I(ip), dentry);
        /* else case-insensitive match... */
        dname.name = ci_name.name;
        dname.len = ci_name.len;
-        dentry = d_add_ci(ip->i_vnode, dentry, &dname);
+        dentry = d_add_ci(VFS_I(ip), dentry, &dname);
        kmem_free(ci_name.name);
        return dentry;
 }
@@ -478,7 +429,7 @@ xfs_vn_symlink(
        if (unlikely(error))
                goto out;
-        inode = cip->i_vnode;
+        inode = VFS_I(cip);
        error = xfs_init_security(inode, dir);
        if (unlikely(error))
@@ -710,7 +661,7 @@ out_error:
        return error;
 }
-const struct inode_operations xfs_inode_operations = {
+static const struct inode_operations xfs_inode_operations = {
        .permission             = xfs_vn_permission,
        .truncate               = xfs_vn_truncate,
        .getattr                = xfs_vn_getattr,
@@ -722,7 +673,7 @@ const struct inode_operations xfs_inode_operations = {
        .fallocate              = xfs_vn_fallocate,
 };
-const struct inode_operations xfs_dir_inode_operations = {
+static const struct inode_operations xfs_dir_inode_operations = {
        .create                 = xfs_vn_create,
        .lookup                 = xfs_vn_lookup,
        .link                   = xfs_vn_link,
@@ -747,7 +698,7 @@ const struct inode_operations xfs_dir_inode_operations = {
        .listxattr              = xfs_vn_listxattr,
 };
-const struct inode_operations xfs_dir_ci_inode_operations = {
+static const struct inode_operations xfs_dir_ci_inode_operations = {
        .create                 = xfs_vn_create,
        .lookup                 = xfs_vn_ci_lookup,
        .link                   = xfs_vn_link,
@@ -772,7 +723,7 @@ const struct inode_operations xfs_dir_ci_inode_operations = {
        .listxattr              = xfs_vn_listxattr,
 };
-const struct inode_operations xfs_symlink_inode_operations = {
+static const struct inode_operations xfs_symlink_inode_operations = {
        .readlink               = generic_readlink,
        .follow_link            = xfs_vn_follow_link,
        .put_link               = xfs_vn_put_link,
@@ -784,3 +735,98 @@ const struct inode_operations xfs_symlink_inode_operations = {
        .removexattr            = generic_removexattr,
        .listxattr              = xfs_vn_listxattr,
 };
+/*
+ * Mirror the XFS inode flags that have a Linux VFS counterpart from
+ * ip->i_d.di_flags into inode->i_flags.
+ *
+ * For each of IMMUTABLE, APPEND, SYNC and NOATIME the S_* bit is set
+ * when the matching XFS_DIFLAG_* bit is present and explicitly cleared
+ * when it is not, so a previously set bit does not linger.  No other
+ * bits of inode->i_flags are modified here.
+ */
+STATIC void
+xfs_diflags_to_iflags(
+        struct inode            *inode,
+        struct xfs_inode        *ip)
+{
+        if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+                inode->i_flags |= S_IMMUTABLE;
+        else
+                inode->i_flags &= ~S_IMMUTABLE;
+        if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+                inode->i_flags |= S_APPEND;
+        else
+                inode->i_flags &= ~S_APPEND;
+        if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+                inode->i_flags |= S_SYNC;
+        else
+                inode->i_flags &= ~S_SYNC;
+        if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+                inode->i_flags |= S_NOATIME;
+        else
+                inode->i_flags &= ~S_NOATIME;
+}
+/*
+ * Initialize the Linux inode, set up the operation vectors and
+ * unlock the inode.
+ *
+ * When reading existing inodes from disk this is called directly
+ * from xfs_iget, when creating a new inode it is called from
+ * xfs_ialloc after setting up the inode.
+ *
+ * The Linux inode is taken from ip->i_vnode; every field written below
+ * is copied from the XFS inode core (ip->i_d) or its data fork (ip->i_df).
+ */
+void
+xfs_setup_inode(
+        struct xfs_inode        *ip)
+{
+        struct inode            *inode = ip->i_vnode;
+        /* Type/permission bits, link count and ownership. */
+        inode->i_mode   = ip->i_d.di_mode;
+        inode->i_nlink  = ip->i_d.di_nlink;
+        inode->i_uid    = ip->i_d.di_uid;
+        inode->i_gid    = ip->i_d.di_gid;
+        /*
+         * Only block and character device inodes get a device number.
+         * It is rebuilt from the sysv-style value in the data fork, with
+         * the major masked to 9 bits (0x1ff); everything else gets 0.
+         */
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFBLK:
+        case S_IFCHR:
+                inode->i_rdev =
+                        MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+                              sysv_minor(ip->i_df.if_u2.if_rdev));
+                break;
+        default:
+                inode->i_rdev = 0;
+                break;
+        }
+        inode->i_generation = ip->i_d.di_gen;
+        i_size_write(inode, ip->i_d.di_size);
+        /* Copy access, modification and change times (sec + nsec). */
+        inode->i_atime.tv_sec   = ip->i_d.di_atime.t_sec;
+        inode->i_atime.tv_nsec  = ip->i_d.di_atime.t_nsec;
+        inode->i_mtime.tv_sec   = ip->i_d.di_mtime.t_sec;
+        inode->i_mtime.tv_nsec  = ip->i_d.di_mtime.t_nsec;
+        inode->i_ctime.tv_sec   = ip->i_d.di_ctime.t_sec;
+        inode->i_ctime.tv_nsec  = ip->i_d.di_ctime.t_nsec;
+        xfs_diflags_to_iflags(inode, ip);
+        /*
+         * NOTE(review): XFS_IMODIFIED is dropped here, presumably because
+         * the Linux inode has just been brought in line with the XFS
+         * inode -- confirm against the users of that flag.
+         */
+        xfs_iflags_clear(ip, XFS_IMODIFIED);
+        /*
+         * Select the operation vectors by file type:
+         *  - regular files get inode, file and address-space operations;
+         *  - directories get the case-insensitive inode operations when
+         *    the superblock reports the ASCII-CI feature, the normal
+         *    directory operations otherwise;
+         *  - symlinks get address-space operations only when the link
+         *    is not stored inline (XFS_IFINLINE clear);
+         *  - anything else is handed to init_special_inode().
+         */
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_op = &xfs_inode_operations;
+                inode->i_fop = &xfs_file_operations;
+                inode->i_mapping->a_ops = &xfs_address_space_operations;
+                break;
+        case S_IFDIR:
+                if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+                        inode->i_op = &xfs_dir_ci_inode_operations;
+                else
+                        inode->i_op = &xfs_dir_inode_operations;
+                inode->i_fop = &xfs_dir_file_operations;
+                break;
+        case S_IFLNK:
+                inode->i_op = &xfs_symlink_inode_operations;
+                if (!(ip->i_df.if_flags & XFS_IFINLINE))
+                        inode->i_mapping->a_ops = &xfs_address_space_operations;
+                break;
+        default:
+                inode->i_op = &xfs_inode_operations;
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                break;
+        }
+        /*
+         * XFS_INEW is cleared before the Linux inode is unlocked; the
+         * barrier() sits between the two steps, presumably so the flag
+         * update is not reordered past unlock_new_inode() -- confirm.
+         */
+        xfs_iflags_clear(ip, XFS_INEW);
+        barrier();
+        unlock_new_inode(inode);
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index d97ba934a2a..8b1a1e31dc2 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -18,10 +18,7 @@
 #ifndef __XFS_IOPS_H__
 #define __XFS_IOPS_H__
-extern const struct inode_operations xfs_inode_operations;
+struct xfs_inode;
-extern const struct inode_operations xfs_dir_inode_operations;
-extern const struct inode_operations xfs_dir_ci_inode_operations;
-extern const struct inode_operations xfs_symlink_inode_operations;
 extern const struct file_operations xfs_file_operations;
 extern const struct file_operations xfs_dir_file_operations;
@@ -29,14 +26,6 @@ extern const struct file_operations xfs_invis_file_operations;
 extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-struct xfs_inode;
+extern void xfs_setup_inode(struct xfs_inode *);
-extern void xfs_ichgtime(struct xfs_inode *, int);
-extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int);
-#define xfs_vtoi(vp) \
-        ((struct xfs_inode *)vn_to_inode(vp)->i_private)
-#define XFS_I(inode) \
-        ((struct xfs_inode *)(inode)->i_private)
 #endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 4d45d9351a6..cc0f7b3a979 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -45,13 +45,13 @@
 #include <mrlock.h>
 #include <sv.h>
 #include <mutex.h>
-#include <sema.h>
 #include <time.h>
 #include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/uuid.h>
+#include <linux/semaphore.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
@@ -126,8 +126,6 @@
 #define current_cpu()           (raw_smp_processor_id())
 #define current_pid()           (current->pid)
-#define current_fsuid(cred)     (current->fsuid)
-#define current_fsgid(cred)     (current->fsgid)
 #define current_test_flags(f)   (current->flags & (f))
 #define current_set_flags_nested(sp, f)         \
                (*(sp) = current->flags, current->flags |= (f))
@@ -180,7 +178,7 @@
 #define xfs_sort(a,n,s,fn)      sort(a,n,s,fn,NULL)
 #define xfs_stack_trace()       dump_stack()
 #define xfs_itruncate_data(ip, off)     \
-        (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off)))
+        (-vmtruncate(VFS_I(ip), (off)))
 /* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 82333b3e118..1957e5357d0 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -137,7 +137,7 @@ xfs_iozero(
        struct address_space    *mapping;
        int                     status;
-        mapping = ip->i_vnode->i_mapping;
+        mapping = VFS_I(ip)->i_mapping;
        do {
                unsigned offset, bytes;
                void *fsdata;
@@ -674,9 +674,7 @@ start:
         */
        if (likely(!(ioflags & IO_INVIS) &&
                   !mnt_want_write(file->f_path.mnt))) {
-                file_update_time(file);
+                xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-                xfs_ichgtime_fast(xip, inode,
-                                  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                mnt_drop_write(file->f_path.mnt);
        }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 30ae96397e3..73c65f19e54 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -581,118 +581,6 @@ xfs_max_file_offset(
        return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
-STATIC_INLINE void
-xfs_set_inodeops(
-        struct inode            *inode)
-{
-        switch (inode->i_mode & S_IFMT) {
-        case S_IFREG:
-                inode->i_op = &xfs_inode_operations;
-                inode->i_fop = &xfs_file_operations;
-                inode->i_mapping->a_ops = &xfs_address_space_operations;
-                break;
-        case S_IFDIR:
-                if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
-                        inode->i_op = &xfs_dir_ci_inode_operations;
-                else
-                        inode->i_op = &xfs_dir_inode_operations;
-                inode->i_fop = &xfs_dir_file_operations;
-                break;
-        case S_IFLNK:
-                inode->i_op = &xfs_symlink_inode_operations;
-                if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE))
-                        inode->i_mapping->a_ops = &xfs_address_space_operations;
-                break;
-        default:
-                inode->i_op = &xfs_inode_operations;
-                init_special_inode(inode, inode->i_mode, inode->i_rdev);
-                break;
-        }
-}
-STATIC_INLINE void
-xfs_revalidate_inode(
-        xfs_mount_t             *mp,
-        bhv_vnode_t             *vp,
-        xfs_inode_t             *ip)
-{
-        struct inode            *inode = vn_to_inode(vp);
-        inode->i_mode   = ip->i_d.di_mode;
-        inode->i_nlink  = ip->i_d.di_nlink;
-        inode->i_uid    = ip->i_d.di_uid;
-        inode->i_gid    = ip->i_d.di_gid;
-        switch (inode->i_mode & S_IFMT) {
-        case S_IFBLK:
-        case S_IFCHR:
-                inode->i_rdev =
-                        MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
-                              sysv_minor(ip->i_df.if_u2.if_rdev));
-                break;
-        default:
-                inode->i_rdev = 0;
-                break;
-        }
-        inode->i_generation = ip->i_d.di_gen;
-        i_size_write(inode, ip->i_d.di_size);
-        inode->i_atime.tv_sec   = ip->i_d.di_atime.t_sec;
-        inode->i_atime.tv_nsec  = ip->i_d.di_atime.t_nsec;
-        inode->i_mtime.tv_sec   = ip->i_d.di_mtime.t_sec;
-        inode->i_mtime.tv_nsec  = ip->i_d.di_mtime.t_nsec;
-        inode->i_ctime.tv_sec   = ip->i_d.di_ctime.t_sec;
-        inode->i_ctime.tv_nsec  = ip->i_d.di_ctime.t_nsec;
-        if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
-                inode->i_flags |= S_IMMUTABLE;
-        else
-                inode->i_flags &= ~S_IMMUTABLE;
-        if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
-                inode->i_flags |= S_APPEND;
-        else
-                inode->i_flags &= ~S_APPEND;
-        if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
-                inode->i_flags |= S_SYNC;
-        else
-                inode->i_flags &= ~S_SYNC;
-        if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
-                inode->i_flags |= S_NOATIME;
-        else
-                inode->i_flags &= ~S_NOATIME;
-        xfs_iflags_clear(ip, XFS_IMODIFIED);
-}
-void
-xfs_initialize_vnode(
-        struct xfs_mount        *mp,
-        bhv_vnode_t             *vp,
-        struct xfs_inode        *ip)
-{
-        struct inode            *inode = vn_to_inode(vp);
-        if (!ip->i_vnode) {
-                ip->i_vnode = vp;
-                inode->i_private = ip;
-        }
-        /*
-         * We need to set the ops vectors, and unlock the inode, but if
-         * we have been called during the new inode create process, it is
-         * too early to fill in the Linux inode.  We will get called a
-         * second time once the inode is properly set up, and then we can
-         * finish our work.
-         */
-        if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) {
-                xfs_revalidate_inode(mp, vp, ip);
-                xfs_set_inodeops(inode);
-                xfs_iflags_clear(ip, XFS_INEW);
-                barrier();
-                unlock_new_inode(inode);
-        }
-}
 int
 xfs_blkdev_get(
        xfs_mount_t             *mp,
@@ -982,26 +870,21 @@ STATIC struct inode *
 xfs_fs_alloc_inode(
        struct super_block      *sb)
 {
-        bhv_vnode_t             *vp;
+        return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
-        vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
-        if (unlikely(!vp))
-                return NULL;
-        return vn_to_inode(vp);
 }
 STATIC void
 xfs_fs_destroy_inode(
        struct inode            *inode)
 {
-        kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode));
+        kmem_zone_free(xfs_vnode_zone, inode);
 }
 STATIC void
 xfs_fs_inode_init_once(
        void                    *vnode)
 {
-        inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
+        inode_init_once((struct inode *)vnode);
 }
 /*
@@ -1106,7 +989,7 @@ void
 xfs_flush_inode(
        xfs_inode_t     *ip)
 {
-        struct inode    *inode = ip->i_vnode;
+        struct inode    *inode = VFS_I(ip);
        igrab(inode);
        xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
@@ -1131,7 +1014,7 @@ void
 xfs_flush_device(
        xfs_inode_t     *ip)
 {
-        struct inode    *inode = vn_to_inode(XFS_ITOV(ip));
+        struct inode    *inode = VFS_I(ip);
        igrab(inode);
        xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
@@ -1201,6 +1084,15 @@ xfssyncd(
 }
 STATIC void
+xfs_free_fsname(
+        struct xfs_mount        *mp)
+{
+        /*
+         * Release the three name strings attached to the mount: the
+         * filesystem name, the realtime device name and the log device
+         * name.  xfs_start_flags() only allocates m_rtname and m_logname
+         * when the corresponding mount argument is non-empty, so this
+         * relies on kfree() accepting NULL for names that were never set
+         * (assumes mp starts out zeroed -- TODO confirm).
+         *
+         * NOTE(review): the error path of xfs_start_flags() also kfree()s
+         * m_rtname and m_fsname without clearing the pointers, and
+         * xfs_fs_fill_super() then jumps to out_free_fsname, which calls
+         * this function.  That looks like a double free when one of the
+         * later kstrdup() calls fails -- confirm and clear the pointers
+         * in one of the two places.
+         */
+        kfree(mp->m_fsname);
+        kfree(mp->m_rtname);
+        kfree(mp->m_logname);
+}
+STATIC void
 xfs_fs_put_super(
        struct super_block      *sb)
 {
@@ -1239,8 +1131,6 @@ xfs_fs_put_super(
        error = xfs_unmount_flush(mp, 0);
        WARN_ON(error);
-        IRELE(rip);
        /*
         * If we're forcing a shutdown, typically because of a media error,
         * we want to make sure we invalidate dirty pages that belong to
@@ -1257,10 +1147,12 @@ xfs_fs_put_super(
        }
        xfs_unmountfs(mp);
+        xfs_freesb(mp);
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
        xfs_qmops_put(mp);
        xfs_dmops_put(mp);
+        xfs_free_fsname(mp);
        kfree(mp);
 }
@@ -1517,6 +1409,8 @@ xfs_start_flags(
        struct xfs_mount_args   *ap,
        struct xfs_mount        *mp)
 {
+        int                     error;
        /* Values are in BBs */
        if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
                /*
@@ -1549,17 +1443,27 @@ xfs_start_flags(
                        ap->logbufsize);
                return XFS_ERROR(EINVAL);
        }
+        error = ENOMEM;
        mp->m_logbsize = ap->logbufsize;
        mp->m_fsname_len = strlen(ap->fsname) + 1;
-        mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
-        strcpy(mp->m_fsname, ap->fsname);
+        mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
+        if (!mp->m_fsname)
+                goto out;
        if (ap->rtname[0]) {
-                mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP);
+                mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
-                strcpy(mp->m_rtname, ap->rtname);
+                if (!mp->m_rtname)
+                        goto out_free_fsname;
        }
        if (ap->logname[0]) {
-                mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP);
+                mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
-                strcpy(mp->m_logname, ap->logname);
+                if (!mp->m_logname)
+                        goto out_free_rtname;
        }
        if (ap->flags & XFSMNT_WSYNC)
@@ -1632,6 +1536,14 @@ xfs_start_flags(
        if (ap->flags & XFSMNT_DMAPI)
                mp->m_flags |= XFS_MOUNT_DMAPI;
        return 0;
+ out_free_rtname:
+        kfree(mp->m_rtname);
+ out_free_fsname:
+        kfree(mp->m_fsname);
+ out:
+        return error;
 }
 /*
@@ -1792,10 +1704,10 @@ xfs_fs_fill_super(
         */
        error = xfs_start_flags(args, mp);
        if (error)
-                goto out_destroy_counters;
+                goto out_free_fsname;
        error = xfs_readsb(mp, flags);
        if (error)
-                goto out_destroy_counters;
+                goto out_free_fsname;
        error = xfs_finish_flags(args, mp);
        if (error)
                goto out_free_sb;
@@ -1811,7 +1723,7 @@ xfs_fs_fill_super(
        if (error)
                goto out_free_sb;
-        error = xfs_mountfs(mp, flags);
+        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
@@ -1825,7 +1737,7 @@ xfs_fs_fill_super(
        sb->s_time_gran = 1;
        set_posix_acl_flag(sb);
-        root = igrab(mp->m_rootip->i_vnode);
+        root = igrab(VFS_I(mp->m_rootip));
        if (!root) {
                error = ENOENT;
                goto fail_unmount;
@@ -1857,7 +1769,8 @@ xfs_fs_fill_super(
        xfs_filestream_unmount(mp);
 out_free_sb:
        xfs_freesb(mp);
- out_destroy_counters:
+ out_free_fsname:
+        xfs_free_fsname(mp);
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
 out_put_qmops:
@@ -1890,10 +1803,8 @@ xfs_fs_fill_super(
        error = xfs_unmount_flush(mp, 0);
        WARN_ON(error);
-        IRELE(mp->m_rootip);
        xfs_unmountfs(mp);
-        goto out_destroy_counters;
+        goto out_free_sb;
 }
 STATIC int
@@ -2014,7 +1925,7 @@ xfs_free_trace_bufs(void)
 STATIC int __init
 xfs_init_zones(void)
 {
-        xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
+        xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
                                        KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
                                        KM_ZONE_SPREAD,
                                        xfs_fs_inode_init_once);
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index b7d13da01bd..fe2ef4e6a0f 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -101,9 +101,6 @@ struct block_device;
 extern __uint64_t xfs_max_file_offset(unsigned int);
-extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp,
-                struct xfs_inode *ip);
 extern void xfs_flush_inode(struct xfs_inode *);
 extern void xfs_flush_device(struct xfs_inode *);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 25488b6d988..b52528bbbff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -33,7 +33,7 @@
 /*
- * Dedicated vnode inactive/reclaim sync semaphores.
+ * Dedicated vnode inactive/reclaim sync wait queues.
 * Prime number of hash buckets since address is used as the key.
 */
 #define NVSYNC                  37
@@ -82,24 +82,6 @@ vn_ioerror(
                xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
 }
-/*
- * Add a reference to a referenced vnode.
- */
-bhv_vnode_t *
-vn_hold(
-        bhv_vnode_t     *vp)
-{
-        struct inode    *inode;
-        XFS_STATS_INC(vn_hold);
-        inode = igrab(vn_to_inode(vp));
-        ASSERT(inode);
-        return vp;
-}
 #ifdef  XFS_INODE_TRACE
 /*
@@ -108,7 +90,7 @@ vn_hold(
 */
 static inline int xfs_icount(struct xfs_inode *ip)
 {
-        bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
+        struct inode *vp = VFS_I(ip);
        if (vp)
                return vn_count(vp);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 41ca2cec5d3..683ce16210f 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -22,20 +22,6 @@ struct file;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
-typedef struct inode    bhv_vnode_t;
-/*
- * Vnode to Linux inode mapping.
- */
-static inline bhv_vnode_t *vn_from_inode(struct inode *inode)
-{
-        return inode;
-}
-static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
-{
-        return vnode;
-}
 /*
 * Return values for xfs_inactive.  A return value of
 * VN_INACTIVE_NOCACHE implies that the file system behavior
@@ -76,57 +62,52 @@ extern void	vn_iowait(struct xfs_inode *ip);
 extern void     vn_iowake(struct xfs_inode *ip);
 extern void     vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
-static inline int vn_count(bhv_vnode_t *vp)
+static inline int vn_count(struct inode *vp)
 {
-        return atomic_read(&vn_to_inode(vp)->i_count);
+        return atomic_read(&vp->i_count);
 }
-/*
+#define IHOLD(ip) \
- * Vnode reference counting functions (and macros for compatibility).
+do { \
- */
+        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-extern bhv_vnode_t      *vn_hold(bhv_vnode_t *);
+        atomic_inc(&(VFS_I(ip)->i_count)); \
+        xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+} while (0)
-#if defined(XFS_INODE_TRACE)
+#define IRELE(ip) \
-#define VN_HOLD(vp)             \
+do { \
-        ((void)vn_hold(vp),     \
+        xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
-          xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address))
+        iput(VFS_I(ip)); \
-#define VN_RELE(vp)             \
+} while (0)
-          (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \
-           iput(vn_to_inode(vp)))
-#else
-#define VN_HOLD(vp)             ((void)vn_hold(vp))
-#define VN_RELE(vp)             (iput(vn_to_inode(vp)))
-#endif
-static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
+static inline struct inode *vn_grab(struct inode *vp)
 {
-        struct inode *inode = igrab(vn_to_inode(vp));
+        return igrab(vp);
-        return inode ? vn_from_inode(inode) : NULL;
 }
 /*
 * Dealing with bad inodes
 */
-static inline int VN_BAD(bhv_vnode_t *vp)
+static inline int VN_BAD(struct inode *vp)
 {
-        return is_bad_inode(vn_to_inode(vp));
+        return is_bad_inode(vp);
 }
 /*
 * Extracting atime values in various formats
 */
-static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
 {
        bs_atime->tv_sec = vp->i_atime.tv_sec;
        bs_atime->tv_nsec = vp->i_atime.tv_nsec;
 }
-static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
 {
        *ts = vp->i_atime;
 }
-static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
+static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
 {
        *tt = vp->i_atime.tv_sec;
 }
@@ -134,9 +115,9 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 /*
 * Some useful predicates.
 */
+/* (vp) is parenthesised so any struct inode * expression can be passed. */
-#define VN_MAPPED(vp)   mapping_mapped(vn_to_inode(vp)->i_mapping)
+#define VN_MAPPED(vp)   mapping_mapped((vp)->i_mapping)
-#define VN_CACHED(vp)   (vn_to_inode(vp)->i_mapping->nrpages)
+#define VN_CACHED(vp)   ((vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)    mapping_tagged(vn_to_inode(vp)->i_mapping, \
+#define VN_DIRTY(vp)    mapping_tagged((vp)->i_mapping, \
                                        PAGECACHE_TAG_DIRTY)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index fc9f3fb39b7..f2705f2fd43 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,11 +101,18 @@ xfs_qm_dqinit(
        if (brandnewdquot) {
                dqp->dq_flnext = dqp->dq_flprev = dqp;
                mutex_init(&dqp->q_qlock);
-                initnsema(&dqp->q_flock, 1, "fdq");
                sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+                /*
+                 * Because we want to use a counting completion, complete
+                 * the flush completion once to allow a single access to
+                 * the flush completion without blocking.
+                 */
+                init_completion(&dqp->q_flush);
+                complete(&dqp->q_flush);
 #ifdef XFS_DQUOT_TRACE
-                dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
+                dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
                xfs_dqtrace_entry(dqp, "DQINIT");
 #endif
        } else {
@@ -150,7 +157,6 @@ xfs_qm_dqdestroy(
        ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
        mutex_destroy(&dqp->q_qlock);
-        freesema(&dqp->q_flock);
        sv_destroy(&dqp->q_pinwait);
 #ifdef XFS_DQUOT_TRACE
@@ -431,7 +437,7 @@ xfs_qm_dqalloc(
         * when it unlocks the inode. Since we want to keep the quota
         * inode around, we bump the vnode ref count now.
         */
-        VN_HOLD(XFS_ITOV(quotip));
+        IHOLD(quotip);
        xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
        nmaps = 1;
@@ -1211,7 +1217,7 @@ xfs_qm_dqflush(
        int                     error;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-        ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+        ASSERT(!completion_done(&dqp->q_flush));
        xfs_dqtrace_entry(dqp, "DQFLUSH");
        /*
@@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done(
        xfs_dqfunlock(dqp);
 }
-int
-xfs_qm_dqflock_nowait(
-        xfs_dquot_t *dqp)
-{
-        int locked;
-        locked = cpsema(&((dqp)->q_flock));
-        /* XXX ifdef these out */
-        if (locked)
-                (dqp)->dq_flags |= XFS_DQ_FLOCKED;
-        return (locked);
-}
 int
 xfs_qm_dqlock_nowait(
        xfs_dquot_t *dqp)
 {
-        return (mutex_trylock(&((dqp)->q_qlock)));
+        return mutex_trylock(&dqp->q_qlock);
 }
 void
 xfs_dqlock(
        xfs_dquot_t *dqp)
 {
-        mutex_lock(&(dqp->q_qlock));
+        mutex_lock(&dqp->q_qlock);
 }
 void
@@ -1468,7 +1458,7 @@ xfs_qm_dqpurge(
         * if we're turning off quotas. Basically, we need this flush
         * lock, and are willing to block on it.
         */
-        if (! xfs_qm_dqflock_nowait(dqp)) {
+        if (!xfs_dqflock_nowait(dqp)) {
                /*
                 * Block on the flush lock after nudging dquot buffer,
                 * if it is incore.
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index f7393bba4e9..8958d0faf8d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -82,7 +82,7 @@ typedef struct xfs_dquot {
        xfs_qcnt_t       q_res_icount;  /* total inos allocd+reserved */
        xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
        mutex_t          q_qlock;       /* quota lock */
-        sema_t           q_flock;       /* flush lock */
+        struct completion q_flush;      /* flush completion queue */
        uint             q_pincount;    /* pin count for this dquot */
        sv_t             q_pinwait;     /* sync var for pinning */
 #ifdef XFS_DQUOT_TRACE
@@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 /*
- * The following three routines simply manage the q_flock
+ * Manage the q_flush completion queue embedded in the dquot.  This completion
- * semaphore embedded in the dquot.  This semaphore synchronizes
+ * queue synchronizes processes attempting to flush the in-core dquot back to
- * processes attempting to flush the in-core dquot back to disk.
+ * disk.
 */
-#define xfs_dqflock(dqp)         { psema(&((dqp)->q_flock), PINOD | PRECALC);\
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
-                                   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
+{
-#define xfs_dqfunlock(dqp)       { ASSERT(issemalocked(&((dqp)->q_flock))); \
+        wait_for_completion(&dqp->q_flush);
-                                   vsema(&((dqp)->q_flock)); \
+}
-                                   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+        return try_wait_for_completion(&dqp->q_flush);
+}
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+        complete(&dqp->q_flush);
+}
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)    ((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)       ((dqp)->dq_flags & XFS_DQ_USER)
@@ -167,7 +175,6 @@ extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
 extern int              xfs_qm_dqpurge(xfs_dquot_t *);
 extern void             xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern int              xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern int              xfs_qm_dqflock_nowait(xfs_dquot_t *);
 extern void             xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
 extern void             xfs_qm_adjust_dqtimers(xfs_mount_t *,
                                        xfs_disk_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 08d2fc89e6a..f028644caa5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push(
        dqp = logitem->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-        ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+        ASSERT(!completion_done(&dqp->q_flush));
        /*
         * Since we were able to lock the dquot's flush lock and
@@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf(
         * inode flush completed and the inode was taken off the AIL.
         * So, just get out.
         */
-        if (!issemalocked(&(dqp->q_flock))  ||
+        if (completion_done(&dqp->q_flush)  ||
            ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
                qip->qli_pushbuf_flag = 0;
                xfs_dqunlock(dqp);
@@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf(
        if (bp != NULL) {
                if (XFS_BUF_ISDELAYWRITE(bp)) {
                        dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-                                  issemalocked(&(dqp->q_flock)));
+                                  !completion_done(&dqp->q_flush));
                        qip->qli_pushbuf_flag = 0;
                        xfs_dqunlock(dqp);
@@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock(
                return (XFS_ITEM_LOCKED);
        retval = XFS_ITEM_SUCCESS;
-        if (! xfs_qm_dqflock_nowait(dqp)) {
+        if (!xfs_dqflock_nowait(dqp)) {
                /*
                 * The dquot is already being flushed.  It may have been
                 * flushed delayed write, however, and we don't want to
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 021934a3d45..df0ffef9775 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy(
 */
 void
 xfs_qm_mount_quotas(
-        xfs_mount_t     *mp,
+        xfs_mount_t     *mp)
-        int             mfsi_flags)
 {
        int             error = 0;
        uint            sbf;
@@ -346,8 +345,7 @@ xfs_qm_mount_quotas(
        /*
         * If any of the quotas are not consistent, do a quotacheck.
         */
-        if (XFS_QM_NEED_QUOTACHECK(mp) &&
+        if (XFS_QM_NEED_QUOTACHECK(mp)) {
-            !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
                error = xfs_qm_quotacheck(mp);
                if (error) {
                        /* Quotacheck failed and disabled quotas. */
@@ -484,7 +482,7 @@ again:
                xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
                /* XXX a sentinel would be better */
                recl = XFS_QI_MPLRECLAIMS(mp);
-                if (! xfs_qm_dqflock_nowait(dqp)) {
+                if (!xfs_dqflock_nowait(dqp)) {
                        /*
                         * If we can't grab the flush lock then check
                         * to see if the dquot has been flushed delayed
@@ -1062,7 +1060,7 @@ xfs_qm_sync(
                /* XXX a sentinel would be better */
                recl = XFS_QI_MPLRECLAIMS(mp);
-                if (! xfs_qm_dqflock_nowait(dqp)) {
+                if (!xfs_dqflock_nowait(dqp)) {
                        if (nowait) {
                                xfs_dqunlock(dqp);
                                continue;
@@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist(
                 * Try to grab the flush lock. If this dquot is in the process of
                 * getting flushed to disk, we don't want to reclaim it.
                 */
-                if (! xfs_qm_dqflock_nowait(dqp)) {
+                if (!xfs_dqflock_nowait(dqp)) {
                        xfs_dqunlock(dqp);
                        dqp = dqp->dq_flnext;
                        continue;
@@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void)
                 * Try to grab the flush lock. If this dquot is in the process of
                 * getting flushed to disk, we don't want to reclaim it.
                 */
-                if (! xfs_qm_dqflock_nowait(dqp)) {
+                if (!xfs_dqflock_nowait(dqp)) {
                        xfs_dqunlock(dqp);
                        continue;
                }
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index cd2300e374a..44f25349e47 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RELE(xqm)        ((xqm)->qm_nrefs--)
 extern void             xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern void             xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void             xfs_qm_mount_quotas(xfs_mount_t *);
 extern int              xfs_qm_quotacheck(xfs_mount_t *);
 extern void             xfs_qm_unmount_quotadestroy(xfs_mount_t *);
 extern int              xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f4f6c4c861d..eea2e60b456 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -162,7 +162,7 @@ xfs_qm_newmount(
                         * mounting, and get on with the boring life
                         * without disk quotas.
                         */
-                        xfs_qm_mount_quotas(mp, 0);
+                        xfs_qm_mount_quotas(mp);
                } else {
                        /*
                         * Clear the quota flags, but remember them. This
@@ -184,13 +184,12 @@ STATIC int
 xfs_qm_endmount(
        xfs_mount_t     *mp,
        uint            needquotamount,
-        uint            quotaflags,
+        uint            quotaflags)
-        int             mfsi_flags)
 {
        if (needquotamount) {
                ASSERT(mp->m_qflags == 0);
                mp->m_qflags = quotaflags;
-                xfs_qm_mount_quotas(mp, mfsi_flags);
+                xfs_qm_mount_quotas(mp);
        }
 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index adfb8723f65..1a3b803dfa5 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes(
 {
        xfs_inode_t     *ip, *topino;
        uint            ireclaims;
-        bhv_vnode_t     *vp;
+        struct inode    *vp;
        boolean_t       vnode_refd;
        ASSERT(mp->m_quotainfo);
@@ -1059,7 +1059,7 @@ again:
                        ip = ip->i_mnext;
                        continue;
                }
-                vp = XFS_ITOV_NULL(ip);
+                vp = VFS_I(ip);
                if (!vp) {
                        ASSERT(ip->i_udquot == NULL);
                        ASSERT(ip->i_gdquot == NULL);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 3e4648ad9cf..b2f639a1416 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,15 +37,15 @@
 #include <linux/capability.h>
 #include <linux/posix_acl_xattr.h>
-STATIC int      xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
+STATIC int      xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
 STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
 STATIC void     xfs_acl_get_endian(xfs_acl_t *);
 STATIC int      xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
 STATIC int      xfs_acl_invalid(xfs_acl_t *);
 STATIC void     xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void     xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void     xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
-STATIC void     xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
+STATIC void     xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
-STATIC int      xfs_acl_allow_set(bhv_vnode_t *, int);
+STATIC int      xfs_acl_allow_set(struct inode *, int);
 kmem_zone_t *xfs_acl_zone;
@@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
 */
 int
 xfs_acl_vhasacl_access(
-        bhv_vnode_t     *vp)
+        struct inode    *vp)
 {
        int             error;
@@ -68,7 +68,7 @@ xfs_acl_vhasacl_access(
 */
 int
 xfs_acl_vhasacl_default(
-        bhv_vnode_t     *vp)
+        struct inode    *vp)
 {
        int             error;
@@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr(
 int
 xfs_acl_vget(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        void            *acl,
        size_t          size,
        int             kind)
@@ -217,7 +217,6 @@ xfs_acl_vget(
        posix_acl_xattr_header  *ext_acl = acl;
        int                     flags = 0;
-        VN_HOLD(vp);
        if(size) {
                if (!(_ACL_ALLOC(xfs_acl))) {
                        error = ENOMEM;
@@ -239,11 +238,10 @@ xfs_acl_vget(
                        goto out;
                }
                if (kind == _ACL_TYPE_ACCESS)
-                        xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl);
+                        xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
                error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
        }
 out:
-        VN_RELE(vp);
        if(xfs_acl)
                _ACL_FREE(xfs_acl);
        return -error;
@@ -251,28 +249,26 @@ out:
 int
 xfs_acl_vremove(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        int             kind)
 {
        int             error;
-        VN_HOLD(vp);
        error = xfs_acl_allow_set(vp, kind);
        if (!error) {
-                error = xfs_attr_remove(xfs_vtoi(vp),
+                error = xfs_attr_remove(XFS_I(vp),
                                                kind == _ACL_TYPE_DEFAULT?
                                                SGI_ACL_DEFAULT: SGI_ACL_FILE,
                                                ATTR_ROOT);
                if (error == ENOATTR)
                        error = 0;      /* 'scool */
        }
-        VN_RELE(vp);
        return -error;
 }
 int
 xfs_acl_vset(
-        bhv_vnode_t             *vp,
+        struct inode            *vp,
        void                    *acl,
        size_t                  size,
        int                     kind)
@@ -298,7 +294,6 @@ xfs_acl_vset(
                return 0;
        }
-        VN_HOLD(vp);
        error = xfs_acl_allow_set(vp, kind);
        /* Incoming ACL exists, set file mode based on its value */
@@ -321,7 +316,6 @@ xfs_acl_vset(
        }
 out:
-        VN_RELE(vp);
        _ACL_FREE(xfs_acl);
        return -error;
 }
@@ -363,7 +357,7 @@ xfs_acl_iaccess(
 STATIC int
 xfs_acl_allow_set(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        int             kind)
 {
        if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
@@ -372,7 +366,7 @@ xfs_acl_allow_set(
                return ENOTDIR;
        if (vp->i_sb->s_flags & MS_RDONLY)
                return EROFS;
-        if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
+        if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
                return EPERM;
        return 0;
 }
@@ -566,7 +560,7 @@ xfs_acl_get_endian(
 */
 STATIC void
 xfs_acl_get_attr(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        xfs_acl_t       *aclp,
        int             kind,
        int             flags,
@@ -576,7 +570,7 @@ xfs_acl_get_attr(
        ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
        flags |= ATTR_ROOT;
-        *error = xfs_attr_get(xfs_vtoi(vp),
+        *error = xfs_attr_get(XFS_I(vp),
                                        kind == _ACL_TYPE_ACCESS ?
                                        SGI_ACL_FILE : SGI_ACL_DEFAULT,
                                        (char *)aclp, &len, flags);
@@ -590,7 +584,7 @@ xfs_acl_get_attr(
 */
 STATIC void
 xfs_acl_set_attr(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        xfs_acl_t       *aclp,
        int             kind,
        int             *error)
@@ -615,7 +609,7 @@ xfs_acl_set_attr(
                INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
        }
        INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-        *error = xfs_attr_set(xfs_vtoi(vp),
+        *error = xfs_attr_set(XFS_I(vp),
                                kind == _ACL_TYPE_ACCESS ?
                                SGI_ACL_FILE: SGI_ACL_DEFAULT,
                                (char *)newacl, len, ATTR_ROOT);
@@ -624,7 +618,7 @@ xfs_acl_set_attr(
 int
 xfs_acl_vtoacl(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        xfs_acl_t       *access_acl,
        xfs_acl_t       *default_acl)
 {
@@ -639,7 +633,7 @@ xfs_acl_vtoacl(
                if (error)
                        access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
                else /* We have a good ACL and the file mode, synchronize. */
-                        xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl);
+                        xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
        }
        if (default_acl) {
@@ -656,7 +650,7 @@ xfs_acl_vtoacl(
 */
 int
 xfs_acl_inherit(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        mode_t          mode,
        xfs_acl_t       *pdaclp)
 {
@@ -715,7 +709,7 @@ out_error:
 */
 STATIC int
 xfs_acl_setmode(
-        bhv_vnode_t     *vp,
+        struct inode    *vp,
        xfs_acl_t       *acl,
        int             *basicperms)
 {
@@ -734,7 +728,7 @@ xfs_acl_setmode(
         * mode.  The m:: bits take precedence over the g:: bits.
         */
        iattr.ia_valid = ATTR_MODE;
-        iattr.ia_mode = xfs_vtoi(vp)->i_d.di_mode;
+        iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
        iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
        ap = acl->acl_entry;
        for (i = 0; i < acl->acl_cnt; ++i) {
@@ -764,7 +758,7 @@ xfs_acl_setmode(
        if (gap && nomask)
                iattr.ia_mode |= gap->ae_perm << 3;
-        return xfs_setattr(xfs_vtoi(vp), &iattr, 0, sys_cred);
+        return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
 }
 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 323ee94cf83..a4e293b93ef 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -59,14 +59,14 @@ extern struct kmem_zone *xfs_acl_zone;
                (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)      kmem_zone_destroy(zone)
-extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *);
+extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(bhv_vnode_t *);
+extern int xfs_acl_vhasacl_access(struct inode *);
-extern int xfs_acl_vhasacl_default(bhv_vnode_t *);
+extern int xfs_acl_vhasacl_default(struct inode *);
-extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int);
+extern int xfs_acl_vset(struct inode *, void *, size_t, int);
-extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int);
+extern int xfs_acl_vget(struct inode *, void *, size_t, int);
-extern int xfs_acl_vremove(bhv_vnode_t *, int);
+extern int xfs_acl_vremove(struct inode *, int);
 #define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index f9472a2076d..0b3b5efe848 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -92,16 +92,6 @@
        ((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
    }
-/* define generic INT_ macros */
-#define INT_GET(reference,arch) \
-    (((arch) == ARCH_NOCONVERT) \
-        ? \
-            (reference) \
-        : \
-            INT_SWAP((reference),(reference)) \
-    )
 /* does not return a value */
 #define INT_SET(reference,arch,valueref) \
    (__builtin_constant_p(valueref) ? \
@@ -112,64 +102,6 @@
        ) \
    )
-/* does not return a value */
-#define INT_MOD_EXPR(reference,arch,code) \
-    (((arch) == ARCH_NOCONVERT) \
-        ? \
-            (void)((reference) code) \
-        : \
-            (void)( \
-                (reference) = INT_GET((reference),arch) , \
-                ((reference) code), \
-                INT_SET(reference, arch, reference) \
-            ) \
-    )
-/* does not return a value */
-#define INT_MOD(reference,arch,delta) \
-    (void)( \
-        INT_MOD_EXPR(reference,arch,+=(delta)) \
-    )
-/*
- * INT_COPY - copy a value between two locations with the
- *            _same architecture_ but _potentially different sizes_
- *
- *          if the types of the two parameters are equal or they are
- *              in native architecture, a simple copy is done
- *
- *          otherwise, architecture conversions are done
- *
- */
-/* does not return a value */
-#define INT_COPY(dst,src,arch) \
-    ( \
-        ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
-            ? \
-                (void)((dst) = (src)) \
-            : \
-                INT_SET(dst, arch, INT_GET(src, arch)) \
-    )
-/*
- * INT_XLATE - copy a value in either direction between two locations
- *             with different architectures
- *
- *                  dir < 0     - copy from memory to buffer (native to arch)
- *                  dir > 0     - copy from buffer to memory (arch to native)
- */
-/* does not return a value */
-#define INT_XLATE(buf,mem,dir,arch) {\
-    ASSERT(dir); \
-    if (dir>0) { \
-        (mem)=INT_GET(buf, arch); \
-    } else { \
-        INT_SET(buf, arch, mem); \
-    } \
-}
 /*
 * In directories inode numbers are stored as unaligned arrays of unsigned
 * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 78de80e3caa..f7cdc28aff4 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -194,6 +194,46 @@ xfs_attr_get(
        return(error);
 }
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+int
+xfs_attr_calc_size(
+        struct xfs_inode        *ip,
+        int                     namelen,
+        int                     valuelen,
+        int                     *local)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        int                     size;
+        int                     nblks;
+        /*
+         * Determine space new attribute will use, and if it would be
+         * "local" or "remote" (note: local != inline).
+         */
+        size = xfs_attr_leaf_newentsize(namelen, valuelen,
+                                        mp->m_sb.sb_blocksize, local);
+        nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+        if (*local) {
+                if (size > (mp->m_sb.sb_blocksize >> 1)) {
+                        /* Double split possible */
+                        nblks *= 2;
+                }
+        } else {
+                /*
+                 * Out of line attribute, cannot double split, but
+                 * make room for the attribute value itself.
+                 */
+                uint    dblocks = XFS_B_TO_FSB(mp, valuelen);
+                nblks += dblocks;
+                nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+        }
+        return nblks;
+}
 STATIC int
 xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
                char *value, int valuelen, int flags)
@@ -202,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
        xfs_fsblock_t   firstblock;
        xfs_bmap_free_t flist;
        int             error, err2, committed;
-        int             local, size;
-        uint            nblks;
        xfs_mount_t     *mp = dp->i_mount;
        int             rsvd = (flags & ATTR_ROOT) != 0;
+        int             local;
        /*
         * Attach the dquots to the inode.
@@ -241,30 +280,8 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
        args.whichfork = XFS_ATTR_FORK;
        args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-        /*
-         * Determine space new attribute will use, and if it would be
-         * "local" or "remote" (note: local != inline).
-         */
-        size = xfs_attr_leaf_newentsize(name->len, valuelen,
-                                        mp->m_sb.sb_blocksize, &local);
-        nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-        if (local) {
-                if (size > (mp->m_sb.sb_blocksize >> 1)) {
-                        /* Double split possible */
-                        nblks <<= 1;
-                }
-        } else {
-                uint    dblocks = XFS_B_TO_FSB(mp, valuelen);
-                /* Out of line attribute, cannot double split, but make
-                 * room for the attribute value itself.
-                 */
-                nblks += dblocks;
-                nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
-        }
        /* Size is now blocks for attribute data */
-        args.total = nblks;
+        args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
        /*
         * Start our first transaction of the day.
@@ -286,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
        if (rsvd)
                args.trans->t_flags |= XFS_TRANS_RESERVE;
-        if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
+        if ((error = xfs_trans_reserve(args.trans, args.total,
-                                      XFS_ATTRSET_LOG_RES(mp, nblks),
+                        XFS_ATTRSET_LOG_RES(mp, args.total), 0,
-                                      0, XFS_TRANS_PERM_LOG_RES,
+                        XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
-                                      XFS_ATTRSET_LOG_COUNT))) {
                xfs_trans_cancel(args.trans, 0);
                return(error);
        }
        xfs_ilock(dp, XFS_ILOCK_EXCL);
-        error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
+        error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
-                         rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-                                XFS_QMOPT_RES_REGBLKS);
+                                       XFS_QMOPT_RES_REGBLKS);
        if (error) {
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
                xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
@@ -384,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
                 * Commit the leaf transformation.  We'll need another (linked)
                 * transaction to add the new attribute to the leaf.
                 */
-                if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+                error = xfs_trans_roll(&args.trans, dp);
+                if (error)
                        goto out;
        }
@@ -964,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * Commit the current trans (including the inode) and start
                 * a new one.
                 */
-                if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
                        return (error);
                /*
@@ -978,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
         * Commit the transaction that added the attr name so that
         * later routines can manage their own transactions.
         */
-        if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+        error = xfs_trans_roll(&args->trans, dp);
+        if (error)
                return (error);
        /*
@@ -1067,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                /*
                 * Commit the remove and start the next trans in series.
                 */
-                error = xfs_attr_rolltrans(&args->trans, dp);
+                error = xfs_trans_roll(&args->trans, dp);
        } else if (args->rmtblkno > 0) {
                /*
@@ -1298,7 +1318,8 @@ restart:
                         * Commit the node conversion and start the next
                         * trans in the chain.
                         */
-                        if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+                        error = xfs_trans_roll(&args->trans, dp);
+                        if (error)
                                goto out;
                        goto restart;
@@ -1349,7 +1370,8 @@ restart:
         * Commit the leaf addition or btree split and start the next
         * trans in the chain.
         */
-        if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+        error = xfs_trans_roll(&args->trans, dp);
+        if (error)
                goto out;
        /*
@@ -1449,7 +1471,8 @@ restart:
                /*
                 * Commit and start the next trans in the chain.
                 */
-                if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
                        goto out;
        } else if (args->rmtblkno > 0) {
@@ -1581,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                /*
                 * Commit the Btree join operation and start a new trans.
                 */
-                if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
                        goto out;
        }
@@ -2082,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                /*
                 * Start the next trans in the chain.
                 */
-                if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
                        return (error);
        }
@@ -2232,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                /*
                 * Close out trans and start the next one in the chain.
                 */
-                if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
+                error = xfs_trans_roll(&args->trans, args->dp);
+                if (error)
                        return (error);
        }
        return(0);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 8b2d31c19e4..fb3b2a68b9b 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -129,6 +129,7 @@ typedef struct xfs_attr_list_context {
 /*
 * Overall external interface routines.
 */
+int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
 int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 23ef5d7c87e..79da6b2ea99 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2498,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
        /*
         * Commit the flag value change and start the next trans in series.
         */
-        error = xfs_attr_rolltrans(&args->trans, args->dp);
+        return xfs_trans_roll(&args->trans, args->dp);
-        return(error);
 }
 /*
@@ -2547,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
        /*
         * Commit the flag value change and start the next trans in series.
         */
-        error = xfs_attr_rolltrans(&args->trans, args->dp);
+        return xfs_trans_roll(&args->trans, args->dp);
-        return(error);
 }
 /*
@@ -2665,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
        /*
         * Commit the flag value change and start the next trans in series.
         */
-        error = xfs_attr_rolltrans(&args->trans, args->dp);
+        error = xfs_trans_roll(&args->trans, args->dp);
        return(error);
 }
@@ -2723,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
        /*
         * Commit the invalidate and start the next transaction.
         */
-        error = xfs_attr_rolltrans(trans, dp);
+        error = xfs_trans_roll(trans, dp);
        return (error);
 }
@@ -2825,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
                /*
                 * Atomically commit the whole invalidate stuff.
                 */
-                if ((error = xfs_attr_rolltrans(trans, dp)))
+                error = xfs_trans_roll(trans, dp);
+                if (error)
                        return (error);
        }
@@ -2964,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
                        /*
                         * Roll to next transaction.
                         */
-                        if ((error = xfs_attr_rolltrans(trans, dp)))
+                        error = xfs_trans_roll(trans, dp);
+                        if (error)
                                return (error);
                }
@@ -2974,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
        return(0);
 }
-/*
- * Roll from one trans in the sequence of PERMANENT transactions to the next.
- */
-int
-xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
-{
-        xfs_trans_t *trans;
-        unsigned int logres, count;
-        int     error;
-        /*
-         * Ensure that the inode is always logged.
-         */
-        trans = *transp;
-        xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-        /*
-         * Copy the critical parameters from one trans to the next.
-         */
-        logres = trans->t_log_res;
-        count = trans->t_log_count;
-        *transp = xfs_trans_dup(trans);
-        /*
-         * Commit the current transaction.
-         * If this commit failed, then it'd just unlock those items that
-         * are not marked ihold. That also means that a filesystem shutdown
-         * is in progress. The caller takes the responsibility to cancel
-         * the duplicate transaction that gets returned.
-         */
-        if ((error = xfs_trans_commit(trans, 0)))
-                return (error);
-        trans = *transp;
-        /*
-         * Reserve space in the log for th next transaction.
-         * This also pushes items in the "AIL", the list of logged items,
-         * out to disk if they are taking up space at the tail of the log
-         * that we want to use.  This requires that either nothing be locked
-         * across this call, or that anything that is locked be logged in
-         * the prior and the next transactions.
-         */
-        error = xfs_trans_reserve(trans, 0, logres, 0,
-                                  XFS_TRANS_PERM_LOG_RES, count);
-        /*
-         *  Ensure that the inode is in the new transaction and locked.
-         */
-        if (!error) {
-                xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-                xfs_trans_ihold(trans, dp);
-        }
-        return (error);
-}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 5ecf437b782..83e9af417ca 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -274,6 +274,4 @@ int	xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
                                   struct xfs_dabuf *leaf2_bp);
 int     xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
                                        int *local);
-int     xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
 #endif  /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index fab0b6d5a41..48228848f5a 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,109 +25,6 @@
 * XFS bit manipulation routines, used in non-realtime code.
 */
-#ifndef HAVE_ARCH_HIGHBIT
-/*
- * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
- */
-static const char xfs_highbit[256] = {
-       -1, 0, 1, 1, 2, 2, 2, 2,                 /* 00 .. 07 */
-        3, 3, 3, 3, 3, 3, 3, 3,                 /* 08 .. 0f */
-        4, 4, 4, 4, 4, 4, 4, 4,                 /* 10 .. 17 */
-        4, 4, 4, 4, 4, 4, 4, 4,                 /* 18 .. 1f */
-        5, 5, 5, 5, 5, 5, 5, 5,                 /* 20 .. 27 */
-        5, 5, 5, 5, 5, 5, 5, 5,                 /* 28 .. 2f */
-        5, 5, 5, 5, 5, 5, 5, 5,                 /* 30 .. 37 */
-        5, 5, 5, 5, 5, 5, 5, 5,                 /* 38 .. 3f */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 40 .. 47 */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 48 .. 4f */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 50 .. 57 */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 58 .. 5f */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 60 .. 67 */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 68 .. 6f */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 70 .. 77 */
-        6, 6, 6, 6, 6, 6, 6, 6,                 /* 78 .. 7f */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* 80 .. 87 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* 88 .. 8f */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* 90 .. 97 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* 98 .. 9f */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* a0 .. a7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* a8 .. af */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* b0 .. b7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* b8 .. bf */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* c0 .. c7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* c8 .. cf */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* d0 .. d7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* d8 .. df */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* e0 .. e7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* e8 .. ef */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* f0 .. f7 */
-        7, 7, 7, 7, 7, 7, 7, 7,                 /* f8 .. ff */
-};
-#endif
-/*
- * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
- */
-inline int
-xfs_highbit32(
-        __uint32_t      v)
-{
-#ifdef HAVE_ARCH_HIGHBIT
-        return highbit32(v);
-#else
-        int             i;
-        if (v & 0xffff0000)
-                if (v & 0xff000000)
-                        i = 24;
-                else
-                        i = 16;
-        else if (v & 0x0000ffff)
-                if (v & 0x0000ff00)
-                        i = 8;
-                else
-                        i = 0;
-        else
-                return -1;
-        return i + xfs_highbit[(v >> i) & 0xff];
-#endif
-}
-/*
- * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_lowbit64(
-        __uint64_t      v)
-{
-        __uint32_t      w = (__uint32_t)v;
-        int             n = 0;
-        if (w) {        /* lower bits */
-                n = ffs(w);
-        } else {        /* upper bits */
-                w = (__uint32_t)(v >> 32);
-                if (w && (n = ffs(w)))
-                        n += 32;
-        }
-        return n - 1;
-}
-/*
- * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_highbit64(
-        __uint64_t      v)
-{
-        __uint32_t      h = (__uint32_t)(v >> 32);
-        if (h)
-                return xfs_highbit32(h) + 32;
-        return xfs_highbit32((__uint32_t)v);
-}
 /*
 * Return whether bitmap is empty.
 * Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 082641a9782..8e0e463dae2 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n)
 }
 /* Get high bit set out of 32-bit argument, -1 if none set */
-extern int xfs_highbit32(__uint32_t v);
+static inline int xfs_highbit32(__uint32_t v)
+{
+        return fls(v) - 1;
+}
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+        return fls64(v) - 1;
+}
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+        unsigned long   t = v;
+        return (v) ? find_first_bit(&t, 32) : -1;
+}
 /* Get low bit set out of 64-bit argument, -1 if none set */
-extern int xfs_lowbit64(__uint64_t v);
+static inline int xfs_lowbit64(__uint64_t v)
+{
+        __uint32_t      w = (__uint32_t)v;
+        int             n = 0;
-/* Get high bit set out of 64-bit argument, -1 if none set */
+        if (w) {        /* lower bits */
-extern int xfs_highbit64(__uint64_t);
+                n = ffs(w);
+        } else {        /* upper bits */
+                w = (__uint32_t)(v >> 32);
+                if (w && (n = ffs(w)))
+                n += 32;
+        }
+        return n - 1;
+}
 /* Return whether bitmap is empty (1 == empty) */
 extern int xfs_bitmap_empty(uint *map, uint size);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3c4beb3a432..a1aab9275d5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -384,14 +384,14 @@ xfs_bmap_count_tree(
        int             levelin,
        int             *count);
-STATIC int
+STATIC void
 xfs_bmap_count_leaves(
        xfs_ifork_t             *ifp,
        xfs_extnum_t            idx,
        int                     numrecs,
        int                     *count);
-STATIC int
+STATIC void
 xfs_bmap_disk_count_leaves(
        xfs_extnum_t            idx,
        xfs_bmbt_block_t        *block,
@@ -4000,7 +4000,7 @@ xfs_bmap_add_attrfork(
                ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        }
        ASSERT(ip->i_d.di_anextents == 0);
-        VN_HOLD(XFS_ITOV(ip));
+        IHOLD(ip);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        switch (ip->i_d.di_format) {
@@ -6096,7 +6096,7 @@ xfs_bmap_get_bp(
                tp = cur->bc_tp;
                licp = &tp->t_items;
                while (!bp && licp != NULL) {
-                        if (XFS_LIC_ARE_ALL_FREE(licp)) {
+                        if (xfs_lic_are_all_free(licp)) {
                                licp = licp->lic_next;
                                continue;
                        }
@@ -6106,11 +6106,11 @@ xfs_bmap_get_bp(
                                xfs_buf_log_item_t      *bip;
                                xfs_buf_t               *lbp;
-                                if (XFS_LIC_ISFREE(licp, i)) {
+                                if (xfs_lic_isfree(licp, i)) {
                                        continue;
                                }
-                                lidp = XFS_LIC_SLOT(licp, i);
+                                lidp = xfs_lic_slot(licp, i);
                                lip = lidp->lid_item;
                                if (lip->li_type != XFS_LI_BUF)
                                        continue;
@@ -6367,13 +6367,9 @@ xfs_bmap_count_blocks(
        mp = ip->i_mount;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-                if (unlikely(xfs_bmap_count_leaves(ifp, 0,
+                xfs_bmap_count_leaves(ifp, 0,
                        ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-                        count) < 0)) {
+                        count);
-                        XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
-                                         XFS_ERRLEVEL_LOW, mp);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
                return 0;
        }
@@ -6454,13 +6450,7 @@ xfs_bmap_count_tree(
                for (;;) {
                        nextbno = be64_to_cpu(block->bb_rightsib);
                        numrecs = be16_to_cpu(block->bb_numrecs);
-                        if (unlikely(xfs_bmap_disk_count_leaves(0,
+                        xfs_bmap_disk_count_leaves(0, block, numrecs, count);
-                                        block, numrecs, count) < 0)) {
-                                xfs_trans_brelse(tp, bp);
-                                XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
-                                                 XFS_ERRLEVEL_LOW, mp);
-                                return XFS_ERROR(EFSCORRUPTED);
-                        }
                        xfs_trans_brelse(tp, bp);
                        if (nextbno == NULLFSBLOCK)
                                break;
@@ -6478,7 +6468,7 @@ xfs_bmap_count_tree(
 /*
 * Count leaf blocks given a range of extent records.
 */
-STATIC int
+STATIC void
 xfs_bmap_count_leaves(
        xfs_ifork_t             *ifp,
        xfs_extnum_t            idx,
@@ -6491,14 +6481,13 @@ xfs_bmap_count_leaves(
                xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
                *count += xfs_bmbt_get_blockcount(frp);
        }
-        return 0;
 }
 /*
 * Count leaf blocks given a range of extent records originally
 * in btree format.
 */
-STATIC int
+STATIC void
 xfs_bmap_disk_count_leaves(
        xfs_extnum_t            idx,
        xfs_bmbt_block_t        *block,
@@ -6512,5 +6501,4 @@ xfs_bmap_disk_count_leaves(
                frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
                *count += xfs_bmbt_disk_get_blockcount(frp);
        }
-        return 0;
 }
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index aeb87ca69fc..cc593a84c34 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -46,38 +46,11 @@ kmem_zone_t	*xfs_btree_cur_zone;
 /*
 * Btree magic numbers.
 */
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
-{
        XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 /*
- * Prototypes for internal routines.
- */
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int                              /* number of records fitting in block */
-xfs_btree_maxrecs(
-        xfs_btree_cur_t         *cur,   /* btree cursor */
-        xfs_btree_block_t       *block);/* generic btree block pointer */
-/*
- * Internal routines.
- */
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
- */
-STATIC xfs_btree_block_t *                      /* generic btree block pointer */
-xfs_btree_get_block(
-        xfs_btree_cur_t         *cur,   /* btree cursor */
-        int                     level,  /* level in btree */
-        struct xfs_buf          **bpp); /* buffer containing the block */
-/*
 * Checking routine: return maxrecs for the block.
 */
 STATIC int                              /* number of records fitting in block */
@@ -457,35 +430,6 @@ xfs_btree_dup_cursor(
 }
 /*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
- */
-int                                     /* success=1, failure=0 */
-xfs_btree_firstrec(
-        xfs_btree_cur_t         *cur,   /* btree cursor */
-        int                     level)  /* level to change */
-{
-        xfs_btree_block_t       *block; /* generic btree block pointer */
-        xfs_buf_t               *bp;    /* buffer containing block */
-        /*
-         * Get the block pointer for this level.
-         */
-        block = xfs_btree_get_block(cur, level, &bp);
-        xfs_btree_check_block(cur, block, level, bp);
-        /*
-         * It's empty, there is no such record.
-         */
-        if (!block->bb_h.bb_numrecs)
-                return 0;
-        /*
-         * Set the ptr value to 1, that's the first record/key.
-         */
-        cur->bc_ptrs[level] = 1;
-        return 1;
-}
-/*
 * Retrieve the block pointer from the cursor at the given level.
 * This may be a bmap btree root or from a buffer.
 */
@@ -626,6 +570,13 @@ xfs_btree_init_cursor(
                cur->bc_private.a.agbp = agbp;
                cur->bc_private.a.agno = agno;
                break;
+        case XFS_BTNUM_INO:
+                /*
+                 * Inode allocation btree fields.
+                 */
+                cur->bc_private.a.agbp = agbp;
+                cur->bc_private.a.agno = agno;
+                break;
        case XFS_BTNUM_BMAP:
                /*
                 * Bmap btree fields.
@@ -638,13 +589,6 @@ xfs_btree_init_cursor(
                cur->bc_private.b.flags = 0;
                cur->bc_private.b.whichfork = whichfork;
                break;
-        case XFS_BTNUM_INO:
-                /*
-                 * Inode allocation btree fields.
-                 */
-                cur->bc_private.i.agbp = agbp;
-                cur->bc_private.i.agno = agno;
-                break;
        default:
                ASSERT(0);
        }
@@ -671,6 +615,35 @@ xfs_btree_islastblock(
 }
 /*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+int                                     /* success=1, failure=0 */
+xfs_btree_firstrec(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     level)  /* level to change */
+{
+        xfs_btree_block_t       *block; /* generic btree block pointer */
+        xfs_buf_t               *bp;    /* buffer containing block */
+        /*
+         * Get the block pointer for this level.
+         */
+        block = xfs_btree_get_block(cur, level, &bp);
+        xfs_btree_check_block(cur, block, level, bp);
+        /*
+         * It's empty, there is no such record.
+         */
+        if (!block->bb_h.bb_numrecs)
+                return 0;
+        /*
+         * Set the ptr value to 1, that's the first record/key.
+         */
+        cur->bc_ptrs[level] = 1;
+        return 1;
+}
+/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
@@ -890,12 +863,12 @@ xfs_btree_readahead_core(
        case XFS_BTNUM_INO:
                i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
                if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-                        xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+                        xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
                                be32_to_cpu(i->bb_leftsib), 1);
                        rval++;
                }
                if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-                        xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+                        xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
                                be32_to_cpu(i->bb_rightsib), 1);
                        rval++;
                }
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7440b78f9ce..1f528a2a375 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -158,8 +158,8 @@ typedef struct xfs_btree_cur
        __uint8_t       bc_blocklog;    /* log2(blocksize) of btree blocks */
        xfs_btnum_t     bc_btnum;       /* identifies which btree type */
        union {
-                struct {                        /* needed for BNO, CNT */
+                struct {                        /* needed for BNO, CNT, INO */
-                        struct xfs_buf  *agbp;  /* agf buffer pointer */
+                        struct xfs_buf  *agbp;  /* agf/agi buffer pointer */
                        xfs_agnumber_t  agno;   /* ag number */
                } a;
                struct {                        /* needed for BMAP */
@@ -172,10 +172,6 @@ typedef struct xfs_btree_cur
                        char            flags;          /* flags */
 #define XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
                } b;
-                struct {                        /* needed for INO */
-                        struct xfs_buf  *agbp;  /* agi buffer pointer */
-                        xfs_agnumber_t  agno;   /* ag number */
-                } i;
        }               bc_private;     /* per-btree type data */
 } xfs_btree_cur_t;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d86ca2c03a7..608c30c3f76 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -737,7 +737,7 @@ xfs_buf_item_init(
        bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
        bip->bli_format.blf_map_size = map_size;
 #ifdef XFS_BLI_TRACE
-        bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
+        bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_TRANS_DEBUG
@@ -1056,7 +1056,7 @@ xfs_buf_iodone_callbacks(
                           anyway. */
                        XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
                        XFS_BUF_DONE(bp);
-                        XFS_BUF_V_IODONESEMA(bp);
+                        XFS_BUF_FINISH_IOWAIT(bp);
                }
                return;
        }
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 2211e885ef2..760f4c5b516 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -128,10 +128,8 @@ xfs_swap_extents(
        xfs_swapext_t   *sxp)
 {
        xfs_mount_t     *mp;
-        xfs_inode_t     *ips[2];
        xfs_trans_t     *tp;
        xfs_bstat_t     *sbp = &sxp->sx_stat;
-        bhv_vnode_t     *vp, *tvp;
        xfs_ifork_t     *tempifp, *ifp, *tifp;
        int             ilf_fields, tilf_fields;
        static uint     lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -150,19 +148,8 @@ xfs_swap_extents(
        }
        sbp = &sxp->sx_stat;
-        vp = XFS_ITOV(ip);
-        tvp = XFS_ITOV(tip);
-        /* Lock in i_ino order */
-        if (ip->i_ino < tip->i_ino) {
-                ips[0] = ip;
-                ips[1] = tip;
-        } else {
-                ips[0] = tip;
-                ips[1] = ip;
-        }
-        xfs_lock_inodes(ips, 2, lock_flags);
+        xfs_lock_two_inodes(ip, tip, lock_flags);
        locked = 1;
        /* Verify that both files have the same format */
@@ -184,7 +171,7 @@ xfs_swap_extents(
                goto error0;
        }
-        if (VN_CACHED(tvp) != 0) {
+        if (VN_CACHED(VFS_I(tip)) != 0) {
                xfs_inval_cached_trace(tip, 0, -1, 0, -1);
                error = xfs_flushinval_pages(tip, 0, -1,
                                FI_REMAPF_LOCKED);
@@ -193,7 +180,7 @@ xfs_swap_extents(
        }
        /* Verify O_DIRECT for ftmp */
-        if (VN_CACHED(tvp) != 0) {
+        if (VN_CACHED(VFS_I(tip)) != 0) {
                error = XFS_ERROR(EINVAL);
                goto error0;
        }
@@ -237,7 +224,7 @@ xfs_swap_extents(
         * vop_read (or write in the case of autogrow) they block on the iolock
         * until we have switched the extents.
         */
-        if (VN_MAPPED(vp)) {
+        if (VN_MAPPED(VFS_I(ip))) {
                error = XFS_ERROR(EBUSY);
                goto error0;
        }
@@ -265,7 +252,7 @@ xfs_swap_extents(
                locked = 0;
                goto error0;
        }
-        xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
+        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
        /*
         * Count the number of extended attribute blocks
@@ -350,15 +337,11 @@ xfs_swap_extents(
                break;
        }
-        /*
-         * Increment vnode ref counts since xfs_trans_commit &
-         * xfs_trans_cancel will both unlock the inodes and
-         * decrement the associated ref counts.
-         */
-        VN_HOLD(vp);
-        VN_HOLD(tvp);
+        IHOLD(ip);
        xfs_trans_ijoin(tp, ip, lock_flags);
+        IHOLD(tip);
        xfs_trans_ijoin(tp, tip, lock_flags);
        xfs_trans_log_inode(tp, ip,  ilf_fields);
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index cdc2d3464a1..2813cdd7237 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_DMAPI_H__
 #define __XFS_DMAPI_H__
-#include <linux/version.h>
 /*      Values used to define the on-disk version of dm_attrname_t. All
 *      on-disk attribute names start with the 8-byte string "SGI_DMI_".
 *
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f66756cfb5e..f227ecd1a29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,9 +58,6 @@ xfs_error_trap(int e)
        }
        return e;
 }
-#endif
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
 int     xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
@@ -154,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
        return 0;
 }
-#endif /* DEBUG || INDUCE_IO_ERROR */
+#endif /* DEBUG */
 static void
 xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index d8559d132ef..11543f10b0c 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -125,22 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
 #define XFS_RANDOM_DIOWRITE_IOERR                       (XFS_RANDOM_DEFAULT/10)
 #define XFS_RANDOM_BMAPIFORMAT                          XFS_RANDOM_DEFAULT
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+#ifdef DEBUG
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 #define XFS_NUM_INJECT_ERROR                            10
-#ifdef __ANSI_CPP__
-#define XFS_TEST_ERROR(expr, mp, tag, rf)               \
-        ((expr) || \
-         xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
-                         (rf)))
-#else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)               \
        ((expr) || \
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
                        (rf)))
-#endif /* __ANSI_CPP__ */
 extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
 extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
@@ -148,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 #define XFS_TEST_ERROR(expr, mp, tag, rf)       (expr)
 #define xfs_errortag_add(tag, mp)               (ENOSYS)
 #define xfs_errortag_clearall(mp, loud)         (ENOSYS)
-#endif /* (DEBUG || INDUCE_IO_ERROR) */
+#endif /* DEBUG */
 /*
 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c38fd14fca2..f3bb75da384 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -400,7 +400,7 @@ xfs_filestream_init(void)
        if (!item_zone)
                return -ENOMEM;
 #ifdef XFS_FILESTREAMS_TRACE
-        xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP);
+        xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
 #endif
        return 0;
 }
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index e5310c90e50..83502f3edef 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -181,7 +181,7 @@ xfs_inobt_delrec(
                 * then we can get rid of this level.
                 */
                if (numrecs == 1 && level > 0) {
-                        agbp = cur->bc_private.i.agbp;
+                        agbp = cur->bc_private.a.agbp;
                        agi = XFS_BUF_TO_AGI(agbp);
                        /*
                         * pp is still set to the first pointer in the block.
@@ -194,7 +194,7 @@ xfs_inobt_delrec(
                         * Free the block.
                         */
                        if ((error = xfs_free_extent(cur->bc_tp,
-                                XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
+                                XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
                                return error;
                        xfs_trans_binval(cur->bc_tp, bp);
                        xfs_ialloc_log_agi(cur->bc_tp, agbp,
@@ -379,7 +379,7 @@ xfs_inobt_delrec(
                rrecs = be16_to_cpu(right->bb_numrecs);
                rbp = bp;
                if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                                cur->bc_private.i.agno, lbno, 0, &lbp,
+                                cur->bc_private.a.agno, lbno, 0, &lbp,
                                XFS_INO_BTREE_REF)))
                        return error;
                left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -401,7 +401,7 @@ xfs_inobt_delrec(
                lrecs = be16_to_cpu(left->bb_numrecs);
                lbp = bp;
                if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                                cur->bc_private.i.agno, rbno, 0, &rbp,
+                                cur->bc_private.a.agno, rbno, 0, &rbp,
                                XFS_INO_BTREE_REF)))
                        return error;
                right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -484,7 +484,7 @@ xfs_inobt_delrec(
                xfs_buf_t               *rrbp;
                if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                                cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0,
+                                cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
                                &rrbp, XFS_INO_BTREE_REF)))
                        return error;
                rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
@@ -497,7 +497,7 @@ xfs_inobt_delrec(
         * Free the deleting block.
         */
        if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
-                                     cur->bc_private.i.agno, rbno), 1)))
+                                     cur->bc_private.a.agno, rbno), 1)))
                return error;
        xfs_trans_binval(cur->bc_tp, rbp);
        /*
@@ -854,7 +854,7 @@ xfs_inobt_lookup(
        {
                xfs_agi_t       *agi;   /* a.g. inode header */
-                agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+                agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
                agno = be32_to_cpu(agi->agi_seqno);
                agbno = be32_to_cpu(agi->agi_root);
        }
@@ -1089,7 +1089,7 @@ xfs_inobt_lshift(
         * Set up the left neighbor as "left".
         */
        if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                        cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib),
+                        cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
                        0, &lbp, XFS_INO_BTREE_REF)))
                return error;
        left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -1207,10 +1207,10 @@ xfs_inobt_newroot(
        /*
         * Get a block & a buffer.
         */
-        agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+        agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
        args.tp = cur->bc_tp;
        args.mp = cur->bc_mp;
-        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
+        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
                be32_to_cpu(agi->agi_root));
        args.mod = args.minleft = args.alignment = args.total = args.wasdel =
                args.isfl = args.userdata = args.minalignslop = 0;
@@ -1233,7 +1233,7 @@ xfs_inobt_newroot(
         */
        agi->agi_root = cpu_to_be32(args.agbno);
        be32_add_cpu(&agi->agi_level, 1);
-        xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
+        xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
                XFS_AGI_ROOT | XFS_AGI_LEVEL);
        /*
         * At the previous root level there are now two blocks: the old
@@ -1376,7 +1376,7 @@ xfs_inobt_rshift(
         * Set up the right neighbor as "right".
         */
        if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                        cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib),
+                        cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
                        0, &rbp, XFS_INO_BTREE_REF)))
                return error;
        right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -1492,7 +1492,7 @@ xfs_inobt_split(
         * Allocate the new block.
         * If we can't do it, we're toast.  Give up.
         */
-        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
+        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
        args.mod = args.minleft = args.alignment = args.total = args.wasdel =
                args.isfl = args.userdata = args.minalignslop = 0;
        args.minlen = args.maxlen = args.prod = 1;
@@ -1725,7 +1725,7 @@ xfs_inobt_decrement(
                agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
                if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                                cur->bc_private.i.agno, agbno, 0, &bp,
+                                cur->bc_private.a.agno, agbno, 0, &bp,
                                XFS_INO_BTREE_REF)))
                        return error;
                lev--;
@@ -1897,7 +1897,7 @@ xfs_inobt_increment(
                agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
                if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                                cur->bc_private.i.agno, agbno, 0, &bp,
+                                cur->bc_private.a.agno, agbno, 0, &bp,
                                XFS_INO_BTREE_REF)))
                        return error;
                lev--;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b07604b94d9..e229e9e001c 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -216,7 +216,14 @@ finish_inode:
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
        init_waitqueue_head(&ip->i_ipin_wait);
        atomic_set(&ip->i_pincount, 0);
-        initnsema(&ip->i_flock, 1, "xfsfino");
+        /*
+         * Because we want to use a counting completion, complete
+         * the flush completion once to allow a single access to
+         * the flush completion without blocking.
+         */
+        init_completion(&ip->i_flush);
+        complete(&ip->i_flush);
        if (lock_flags)
                xfs_ilock(ip, lock_flags);
@@ -288,10 +295,17 @@ finish_inode:
        *ipp = ip;
        /*
+         * Set up the Linux with the Linux inode.
+         */
+        ip->i_vnode = inode;
+        inode->i_private = ip;
+        /*
         * If we have a real type for an on-disk inode, we can set ops(&unlock)
         * now.  If it's a new inode being created, xfs_ialloc will handle it.
         */
-        xfs_initialize_vnode(mp, inode, ip);
+        if (ip->i_d.di_mode != 0)
+                xfs_setup_inode(ip);
        return 0;
 }
@@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t	*ip,
 * Special iput for brand-new inodes that are still locked
 */
 void
-xfs_iput_new(xfs_inode_t        *ip,
+xfs_iput_new(
-             uint               lock_flags)
+        xfs_inode_t     *ip,
+        uint            lock_flags)
 {
-        struct inode    *inode = ip->i_vnode;
+        struct inode    *inode = VFS_I(ip);
        xfs_itrace_entry(ip);
@@ -775,26 +790,3 @@ xfs_isilocked(
 }
 #endif
-/*
- * The following three routines simply manage the i_flock
- * semaphore embedded in the inode.  This semaphore synchronizes
- * processes attempting to flush the in-core inode back to disk.
- */
-void
-xfs_iflock(xfs_inode_t *ip)
-{
-        psema(&(ip->i_flock), PINOD|PLTWAIT);
-}
-int
-xfs_iflock_nowait(xfs_inode_t *ip)
-{
-        return (cpsema(&(ip->i_flock)));
-}
-void
-xfs_ifunlock(xfs_inode_t *ip)
-{
-        ASSERT(issemalocked(&(ip->i_flock)));
-        vsema(&(ip->i_flock));
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bedc6616317..00e80df9dd9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -580,8 +580,8 @@ xfs_iformat_extents(
                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
                for (i = 0; i < nex; i++, dp++) {
                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-                        ep->l0 = be64_to_cpu(get_unaligned(&dp->l0));
+                        ep->l0 = get_unaligned_be64(&dp->l0);
-                        ep->l1 = be64_to_cpu(get_unaligned(&dp->l1));
+                        ep->l1 = get_unaligned_be64(&dp->l1);
                }
                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
                if (whichfork != XFS_DATA_FORK ||
@@ -835,22 +835,22 @@ xfs_iread(
         * Do this before xfs_iformat in case it adds entries.
         */
 #ifdef  XFS_INODE_TRACE
-        ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP);
+        ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_BMAP_TRACE
-        ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
+        ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_BMBT_TRACE
-        ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
+        ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_RW_TRACE
-        ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
+        ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_ILOCK_TRACE
-        ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
+        ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_DIR2_TRACE
-        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
+        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
        /*
@@ -1046,9 +1046,9 @@ xfs_ialloc(
 {
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
-        bhv_vnode_t     *vp;
        uint            flags;
        int             error;
+        timespec_t      tv;
        /*
         * Call the space management code to pick
@@ -1077,13 +1077,12 @@ xfs_ialloc(
        }
        ASSERT(ip != NULL);
-        vp = XFS_ITOV(ip);
        ip->i_d.di_mode = (__uint16_t)mode;
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
        ASSERT(ip->i_d.di_nlink == nlink);
-        ip->i_d.di_uid = current_fsuid(cr);
+        ip->i_d.di_uid = current_fsuid();
-        ip->i_d.di_gid = current_fsgid(cr);
+        ip->i_d.di_gid = current_fsgid();
        ip->i_d.di_projid = prid;
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
@@ -1130,7 +1129,13 @@ xfs_ialloc(
        ip->i_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);
-        xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+        nanotime(&tv);
+        ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+        ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+        ip->i_d.di_atime = ip->i_d.di_mtime;
+        ip->i_d.di_ctime = ip->i_d.di_mtime;
        /*
         * di_gen will have been taken care of in xfs_iread.
         */
@@ -1220,7 +1225,7 @@ xfs_ialloc(
        xfs_trans_log_inode(tp, ip, flags);
        /* now that we have an i_mode we can setup inode ops and unlock */
-        xfs_initialize_vnode(tp->t_mountp, vp, ip);
+        xfs_setup_inode(ip);
        *ipp = ip;
        return 0;
@@ -1399,7 +1404,6 @@ xfs_itruncate_start(
        xfs_fsize_t     last_byte;
        xfs_off_t       toss_start;
        xfs_mount_t     *mp;
-        bhv_vnode_t     *vp;
        int             error = 0;
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1408,7 +1412,6 @@ xfs_itruncate_start(
               (flags == XFS_ITRUNC_MAYBE));
        mp = ip->i_mount;
-        vp = XFS_ITOV(ip);
        /* wait for the completion of any pending DIOs */
        if (new_size < ip->i_size)
@@ -1457,7 +1460,7 @@ xfs_itruncate_start(
 #ifdef DEBUG
        if (new_size == 0) {
-                ASSERT(VN_CACHED(vp) == 0);
+                ASSERT(VN_CACHED(VFS_I(ip)) == 0);
        }
 #endif
        return error;
@@ -2630,7 +2633,6 @@ xfs_idestroy(
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        mrfree(&ip->i_lock);
        mrfree(&ip->i_iolock);
-        freesema(&ip->i_flock);
 #ifdef XFS_INODE_TRACE
        ktrace_free(ip->i_trace);
@@ -3048,10 +3050,10 @@ cluster_corrupt_out:
 /*
 * xfs_iflush() will write a modified inode's changes out to the
 * inode's on disk home.  The caller must have the inode lock held
- * in at least shared mode and the inode flush semaphore must be
+ * in at least shared mode and the inode flush completion must be
- * held as well.  The inode lock will still be held upon return from
+ * active as well.  The inode lock will still be held upon return from
 * the call and the caller is free to unlock it.
- * The inode flush lock will be unlocked when the inode reaches the disk.
+ * The inode flush will be completed when the inode reaches the disk.
 * The flags indicate how the inode's buffer should be written out.
 */
 int
@@ -3070,7 +3072,7 @@ xfs_iflush(
        XFS_STATS_INC(xs_iflush_count);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-        ASSERT(issemalocked(&(ip->i_flock)));
+        ASSERT(!completion_done(&ip->i_flush));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3233,7 +3235,7 @@ xfs_iflush_int(
 #endif
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-        ASSERT(issemalocked(&(ip->i_flock)));
+        ASSERT(!completion_done(&ip->i_flush));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3465,7 +3467,6 @@ xfs_iflush_all(
        xfs_mount_t     *mp)
 {
        xfs_inode_t     *ip;
-        bhv_vnode_t     *vp;
 again:
        XFS_MOUNT_ILOCK(mp);
@@ -3480,14 +3481,13 @@ xfs_iflush_all(
                        continue;
                }
-                vp = XFS_ITOV_NULL(ip);
+                if (!VFS_I(ip)) {
-                if (!vp) {
                        XFS_MOUNT_IUNLOCK(mp);
                        xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
                        goto again;
                }
-                ASSERT(vn_count(vp) == 0);
+                ASSERT(vn_count(VFS_I(ip)) == 0);
                ip = ip->i_mnext;
        } while (ip != mp->m_inodes);
@@ -3707,7 +3707,7 @@ xfs_iext_add_indirect_multi(
         * (all extents past */
        if (nex2) {
                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP);
+                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
                erp->er_extcount -= nex2;
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
@@ -4007,8 +4007,7 @@ xfs_iext_realloc_direct(
                        ifp->if_u1.if_extents =
                                kmem_realloc(ifp->if_u1.if_extents,
                                                rnew_size,
-                                                ifp->if_real_bytes,
+                                                ifp->if_real_bytes, KM_NOFS);
-                                                KM_SLEEP);
                }
                if (rnew_size > ifp->if_real_bytes) {
                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -4067,7 +4066,7 @@ xfs_iext_inline_to_direct(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             new_size)       /* number of extents in file */
 {
-        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP);
+        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
        memset(ifp->if_u1.if_extents, 0, new_size);
        if (ifp->if_bytes) {
                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
@@ -4099,7 +4098,7 @@ xfs_iext_realloc_indirect(
        } else {
                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
                        kmem_realloc(ifp->if_u1.if_ext_irec,
-                                new_size, size, KM_SLEEP);
+                                new_size, size, KM_NOFS);
        }
 }
@@ -4341,11 +4340,10 @@ xfs_iext_irec_init(
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        ASSERT(nextents <= XFS_LINEAR_EXTS);
-        erp = (xfs_ext_irec_t *)
+        erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-                kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);
        if (nextents == 0) {
-                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
        } else if (!ifp->if_real_bytes) {
                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
@@ -4393,7 +4391,7 @@ xfs_iext_irec_new(
        /* Initialize new extent record */
        erp = ifp->if_u1.if_ext_irec;
-        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
        erp[erp_idx].er_extcount = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 17a04b6321e..1420c49674d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -87,8 +87,7 @@ typedef struct xfs_ifork {
 * Flags for xfs_ichgtime().
 */
 #define XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define XFS_ICHGTIME_ACC        0x2     /* data fork access timestamp */
+#define XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-#define XFS_ICHGTIME_CHG        0x4     /* inode field change timestamp */
 /*
 * Per-fork incore inode flags.
@@ -204,7 +203,7 @@ typedef struct xfs_inode {
        struct xfs_inode        *i_mprev;       /* ptr to prev inode */
        struct xfs_mount        *i_mount;       /* fs mount struct ptr */
        struct list_head        i_reclaim;      /* reclaim list */
-        bhv_vnode_t             *i_vnode;       /* vnode backpointer */
+        struct inode            *i_vnode;       /* vnode backpointer */
        struct xfs_dquot        *i_udquot;      /* user dquot */
        struct xfs_dquot        *i_gdquot;      /* group dquot */
@@ -223,7 +222,7 @@ typedef struct xfs_inode {
        struct xfs_inode_log_item *i_itemp;     /* logging information */
        mrlock_t                i_lock;         /* inode lock */
        mrlock_t                i_iolock;       /* inode IO lock */
-        sema_t                  i_flock;        /* inode flush lock */
+        struct completion       i_flush;        /* inode flush completion q */
        atomic_t                i_pincount;     /* inode pin count */
        wait_queue_head_t       i_ipin_wait;    /* inode pinning wait queue */
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
@@ -263,6 +262,18 @@ typedef struct xfs_inode {
 #define XFS_ISIZE(ip)   (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
                                (ip)->i_size : (ip)->i_d.di_size;
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+        return (struct xfs_inode *)inode->i_private;
+}
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+        return (struct inode *)ip->i_vnode;
+}
 /*
 * i_flags helper functions
 */
@@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define XFS_ITRUNC_DEFINITE     0x1
 #define XFS_ITRUNC_MAYBE        0x2
-#define XFS_ITOV(ip)            ((ip)->i_vnode)
-#define XFS_ITOV_NULL(ip)       ((ip)->i_vnode)
 /*
 * For multiple groups support: if S_ISGID bit is set in the parent
 * directory, group of new file is set to that of the parent, and
@@ -473,11 +481,8 @@ int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void            xfs_iunlock(xfs_inode_t *, uint);
 void            xfs_ilock_demote(xfs_inode_t *, uint);
 int             xfs_isilocked(xfs_inode_t *, uint);
-void            xfs_iflock(xfs_inode_t *);
-int             xfs_iflock_nowait(xfs_inode_t *);
 uint            xfs_ilock_map_shared(xfs_inode_t *);
 void            xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void            xfs_ifunlock(xfs_inode_t *);
 void            xfs_ireclaim(xfs_inode_t *);
 int             xfs_finish_reclaim(xfs_inode_t *, int, int);
 int             xfs_finish_reclaim_all(struct xfs_mount *, int);
@@ -522,6 +527,7 @@ void		xfs_iflush_all(struct xfs_mount *);
 void            xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t     xfs_file_last_byte(xfs_inode_t *);
 void            xfs_lock_inodes(xfs_inode_t **, int, uint);
+void            xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 void            xfs_synchronize_atime(xfs_inode_t *);
 void            xfs_mark_inode_dirty_sync(xfs_inode_t *);
@@ -570,6 +576,26 @@ extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone *xfs_inode_zone;
 extern struct kmem_zone *xfs_ili_zone;
+/*
+ * Manage the i_flush queue embedded in the inode.  This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+        wait_for_completion(&ip->i_flush);
+}
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+        return try_wait_for_completion(&ip->i_flush);
+}
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+        complete(&ip->i_flush);
+}
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0eee08a32c2..97c7452e262 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -779,11 +779,10 @@ xfs_inode_item_pushbuf(
        ASSERT(iip->ili_push_owner == current_pid());
        /*
-         * If flushlock isn't locked anymore, chances are that the
+         * If a flush is not in progress anymore, chances are that the
-         * inode flush completed and the inode was taken off the AIL.
+         * inode was taken off the AIL. So, just get out.
-         * So, just get out.
         */
-        if (!issemalocked(&(ip->i_flock)) ||
+        if (completion_done(&ip->i_flush) ||
            ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
                iip->ili_pushbuf_flag = 0;
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -805,7 +804,7 @@ xfs_inode_item_pushbuf(
                         * If not, we can flush it async.
                         */
                        dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-                                  issemalocked(&(ip->i_flock)));
+                                  !completion_done(&ip->i_flush));
                        iip->ili_pushbuf_flag = 0;
                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
                        xfs_buftrace("INODE ITEM PUSH", bp);
@@ -858,7 +857,7 @@ xfs_inode_item_push(
        ip = iip->ili_inode;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-        ASSERT(issemalocked(&(ip->i_flock)));
+        ASSERT(!completion_done(&ip->i_flush));
        /*
         * Since we were able to lock the inode's flush lock and
         * we found it on the AIL, the inode must be dirty.  This
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9a3ef9dcaeb..cf6754a3c5b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,7 +59,6 @@ xfs_bulkstat_one_iget(
 {
        xfs_icdinode_t  *dic;   /* dinode core info pointer */
        xfs_inode_t     *ip;            /* incore inode pointer */
-        bhv_vnode_t     *vp;
        int             error;
        error = xfs_iget(mp, NULL, ino,
@@ -72,7 +71,6 @@ xfs_bulkstat_one_iget(
        ASSERT(ip != NULL);
        ASSERT(ip->i_blkno != (xfs_daddr_t)0);
-        vp = XFS_ITOV(ip);
        dic = &ip->i_d;
        /* xfs_iget returns the following without needing
@@ -85,7 +83,7 @@ xfs_bulkstat_one_iget(
        buf->bs_uid = dic->di_uid;
        buf->bs_gid = dic->di_gid;
        buf->bs_size = dic->di_size;
-        vn_atime_to_bstime(vp, &buf->bs_atime);
+        vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
        buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
        buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
        buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 91b00a5686c..ccba14eb9db 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -160,7 +160,7 @@ void
 xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 {
        if (!iclog->ic_trace)
-                iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
+                iclog->ic_trace = ktrace_alloc(256, KM_NOFS);
        ktrace_enter(iclog->ic_trace,
                     (void *)((unsigned long)state),
                     (void *)((unsigned long)current_pid()),
@@ -336,15 +336,12 @@ xfs_log_done(xfs_mount_t	*mp,
        } else {
                xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
                xlog_regrant_reserve_log_space(log, ticket);
-        }
+                /* If this ticket was a permanent reservation and we aren't
+                 * trying to release it, reset the inited flags; so next time
-        /* If this ticket was a permanent reservation and we aren't
+                 * we write, a start record will be written out.
-         * trying to release it, reset the inited flags; so next time
+                 */
-         * we write, a start record will be written out.
-         */
-        if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
-            (flags & XFS_LOG_REL_PERM_RESERV) == 0)
                ticket->t_flags |= XLOG_TIC_INITED;
+        }
        return lsn;
 }       /* xfs_log_done */
@@ -357,11 +354,11 @@ xfs_log_done(xfs_mount_t	*mp,
 * Asynchronous forces are implemented by setting the WANT_SYNC
 * bit in the appropriate in-core log and then returning.
 *
- * Synchronous forces are implemented with a semaphore.  All callers
+ * Synchronous forces are implemented with a signal variable. All callers
- * to force a given lsn to disk will wait on a semaphore attached to the
+ * to force a given lsn to disk will wait on the sv attached to the
 * specific in-core log.  When given in-core log finally completes its
 * write to disk, that thread will wake up all threads waiting on the
- * semaphore.
+ * sv.
 */
 int
 _xfs_log_force(
@@ -588,12 +585,12 @@ error:
 * mp           - ubiquitous xfs mount point structure
 */
 int
-xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
+xfs_log_mount_finish(xfs_mount_t *mp)
 {
        int     error;
        if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
-                error = xlog_recover_finish(mp->m_log, mfsi_flags);
+                error = xlog_recover_finish(mp->m_log);
        else {
                error = 0;
                ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -707,7 +704,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
                      iclog->ic_state == XLOG_STATE_DIRTY)) {
                        if (!XLOG_FORCED_SHUTDOWN(log)) {
-                                sv_wait(&iclog->ic_forcesema, PMEM,
+                                sv_wait(&iclog->ic_force_wait, PMEM,
                                        &log->l_icloglock, s);
                        } else {
                                spin_unlock(&log->l_icloglock);
@@ -748,7 +745,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                        || iclog->ic_state == XLOG_STATE_DIRTY
                        || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-                                sv_wait(&iclog->ic_forcesema, PMEM,
+                                sv_wait(&iclog->ic_force_wait, PMEM,
                                        &log->l_icloglock, s);
                } else {
                        spin_unlock(&log->l_icloglock);
@@ -838,7 +835,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
                                break;
                        tail_lsn = 0;
                        free_bytes -= tic->t_unit_res;
-                        sv_signal(&tic->t_sema);
+                        sv_signal(&tic->t_wait);
                        tic = tic->t_next;
                } while (tic != log->l_write_headq);
        }
@@ -859,7 +856,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
                                break;
                        tail_lsn = 0;
                        free_bytes -= need_bytes;
-                        sv_signal(&tic->t_sema);
+                        sv_signal(&tic->t_wait);
                        tic = tic->t_next;
                } while (tic != log->l_reserve_headq);
        }
@@ -1285,8 +1282,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
                ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
                ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
-                sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
+                sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
-                sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
+                sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
                iclogp = &iclog->ic_next;
        }
@@ -1565,8 +1562,8 @@ xlog_dealloc_log(xlog_t *log)
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
-                sv_destroy(&iclog->ic_forcesema);
+                sv_destroy(&iclog->ic_force_wait);
-                sv_destroy(&iclog->ic_writesema);
+                sv_destroy(&iclog->ic_write_wait);
                xfs_buf_free(iclog->ic_bp);
 #ifdef XFS_LOG_TRACE
                if (iclog->ic_trace != NULL) {
@@ -1976,7 +1973,7 @@ xlog_write(xfs_mount_t *	mp,
 /* Clean iclogs starting from the head.  This ordering must be
 * maintained, so an iclog doesn't become ACTIVE beyond one that
 * is SYNCING.  This is also required to maintain the notion that we use
- * a counting semaphore to hold off would be writers to the log when every
+ * an ordered wait queue to hold off would-be writers to the log when every
 * iclog is trying to sync to disk.
 *
 * State Change: DIRTY -> ACTIVE
@@ -2240,7 +2237,7 @@ xlog_state_do_callback(
                        xlog_state_clean_log(log);
                        /* wake up threads waiting in xfs_log_force() */
-                        sv_broadcast(&iclog->ic_forcesema);
+                        sv_broadcast(&iclog->ic_force_wait);
                        iclog = iclog->ic_next;
                } while (first_iclog != iclog);
@@ -2302,8 +2299,7 @@ xlog_state_do_callback(
 * the second completion goes through.
 *
 * Callbacks could take time, so they are done outside the scope of the
- * global state machine log lock.  Assume that the calls to cvsema won't
+ * global state machine log lock.
- * take a long time.  At least we know it won't sleep.
 */
 STATIC void
 xlog_state_done_syncing(
@@ -2339,7 +2335,7 @@ xlog_state_done_syncing(
         * iclog buffer, we wake them all, one will get to do the
         * I/O, the others get to wait for the result.
         */
-        sv_broadcast(&iclog->ic_writesema);
+        sv_broadcast(&iclog->ic_write_wait);
        spin_unlock(&log->l_icloglock);
        xlog_state_do_callback(log, aborted, iclog);    /* also cleans log */
 }       /* xlog_state_done_syncing */
@@ -2347,11 +2343,9 @@ xlog_state_done_syncing(
 /*
 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
- * sleep.  The flush semaphore is set to the number of in-core buffers and
+ * sleep.  We wait on the flush queue on the head iclog as that should be
- * decremented around disk syncing.  Therefore, if all buffers are syncing,
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
- * this semaphore will cause new writes to sleep until a sync completes.
+ * we will wait here and all new writes will sleep until a sync completes.
- * Otherwise, this code just does p() followed by v().  This approximates
- * a sleep/wakeup except we can't race.
 *
 * The in-core logs are used in a circular fashion. They are not used
 * out-of-order even when an iclog past the head is free.
@@ -2508,7 +2502,7 @@ xlog_grant_log_space(xlog_t	   *log,
                        goto error_return;
                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
                /*
                 * If we got an error, and the filesystem is shutting down,
                 * we'll catch it down below. So just continue...
@@ -2534,7 +2528,7 @@ redo:
                xlog_trace_loggrant(log, tic,
                                    "xlog_grant_log_space: sleep 2");
                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
                if (XLOG_FORCED_SHUTDOWN(log)) {
                        spin_lock(&log->l_grant_lock);
@@ -2633,7 +2627,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
                        if (free_bytes < ntic->t_unit_res)
                                break;
                        free_bytes -= ntic->t_unit_res;
-                        sv_signal(&ntic->t_sema);
+                        sv_signal(&ntic->t_wait);
                        ntic = ntic->t_next;
                } while (ntic != log->l_write_headq);
@@ -2644,7 +2638,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
                        xlog_trace_loggrant(log, tic,
                                    "xlog_regrant_write_log_space: sleep 1");
                        XFS_STATS_INC(xs_sleep_logspace);
-                        sv_wait(&tic->t_sema, PINOD|PLTWAIT,
+                        sv_wait(&tic->t_wait, PINOD|PLTWAIT,
                                &log->l_grant_lock, s);
                        /* If we're shutting down, this tic is already
@@ -2673,7 +2667,7 @@ redo:
                if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
                        xlog_ins_ticketq(&log->l_write_headq, tic);
                XFS_STATS_INC(xs_sleep_logspace);
-                sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+                sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
                /* If we're shutting down, this tic is already off the queue */
                if (XLOG_FORCED_SHUTDOWN(log)) {
@@ -2916,7 +2910,7 @@ xlog_state_switch_iclogs(xlog_t		*log,
 *      2. the current iclog is drity, and the previous iclog is in the
 *              active or dirty state.
 *
- * We may sleep (call psema) if:
+ * We may sleep if:
 *
 *      1. the current iclog is not in the active nor dirty state.
 *      2. the current iclog dirty, and the previous iclog is not in the
@@ -3013,7 +3007,7 @@ maybe_sleep:
                        return XFS_ERROR(EIO);
                }
                XFS_STATS_INC(xs_log_force_sleep);
-                sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
+                sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
                /*
                 * No need to grab the log lock here since we're
                 * only deciding whether or not to return EIO
@@ -3096,7 +3090,7 @@ try_again:
                                                 XLOG_STATE_SYNCING))) {
                        ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
                        XFS_STATS_INC(xs_log_force_sleep);
-                        sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
+                        sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
                                &log->l_icloglock, s);
                        *log_flushed = 1;
                        already_slept = 1;
@@ -3116,7 +3110,7 @@ try_again:
            !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
                /*
-                 * Don't wait on the forcesema if we know that we've
+                 * Don't wait on completion if we know that we've
                 * gotten a log write error.
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR) {
@@ -3124,7 +3118,7 @@ try_again:
                        return XFS_ERROR(EIO);
                }
                XFS_STATS_INC(xs_log_force_sleep);
-                sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
+                sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
                /*
                 * No need to grab the log lock here since we're
                 * only deciding whether or not to return EIO
@@ -3180,7 +3174,7 @@ STATIC void
 xlog_ticket_put(xlog_t          *log,
                xlog_ticket_t   *ticket)
 {
-        sv_destroy(&ticket->t_sema);
+        sv_destroy(&ticket->t_wait);
        kmem_zone_free(xfs_log_ticket_zone, ticket);
 }       /* xlog_ticket_put */
@@ -3270,7 +3264,7 @@ xlog_ticket_get(xlog_t		*log,
        tic->t_trans_type       = 0;
        if (xflags & XFS_LOG_PERM_RESERV)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
-        sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+        sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
        xlog_tic_reset_res(tic);
@@ -3557,14 +3551,14 @@ xfs_log_force_umount(
         */
        if ((tic = log->l_reserve_headq)) {
                do {
-                        sv_signal(&tic->t_sema);
+                        sv_signal(&tic->t_wait);
                        tic = tic->t_next;
                } while (tic != log->l_reserve_headq);
        }
        if ((tic = log->l_write_headq)) {
                do {
-                        sv_signal(&tic->t_sema);
+                        sv_signal(&tic->t_wait);
                        tic = tic->t_next;
                } while (tic != log->l_write_headq);
        }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d1d678ecb63..d47b91f1082 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -149,7 +149,7 @@ int	  xfs_log_mount(struct xfs_mount	*mp,
                        struct xfs_buftarg      *log_target,
                        xfs_daddr_t             start_block,
                        int                     num_bblocks);
-int       xfs_log_mount_finish(struct xfs_mount *mp, int);
+int       xfs_log_mount_finish(struct xfs_mount *mp);
 void      xfs_log_move_tail(struct xfs_mount    *mp,
                            xfs_lsn_t           tail_lsn);
 int       xfs_log_notify(struct xfs_mount       *mp,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 6245913196b..c8a5b22ee3e 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -241,7 +241,7 @@ typedef struct xlog_res {
 } xlog_res_t;
 typedef struct xlog_ticket {
-        sv_t               t_sema;       /* sleep on this semaphore      : 20 */
+        sv_t               t_wait;       /* ticket wait queue            : 20 */
        struct xlog_ticket *t_next;      /*                              :4|8 */
        struct xlog_ticket *t_prev;      /*                              :4|8 */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
@@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header {
 *      xlog_rec_header_t into the reserved space.
 * - ic_data follows, so a write to disk can start at the beginning of
 *      the iclog.
- * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
+ * - ic_force_wait is used to implement synchronous forcing of the iclog to disk.
 * - ic_next is the pointer to the next iclog in the ring.
 * - ic_bp is a pointer to the buffer used to write this incore log to disk.
 * - ic_log is a pointer back to the global log structure.
@@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header {
 * and move everything else out to subsequent cachelines.
 */
 typedef struct xlog_iclog_fields {
-        sv_t                    ic_forcesema;
+        sv_t                    ic_force_wait;
-        sv_t                    ic_writesema;
+        sv_t                    ic_write_wait;
        struct xlog_in_core     *ic_next;
        struct xlog_in_core     *ic_prev;
        struct xfs_buf          *ic_bp;
@@ -377,8 +377,8 @@ typedef struct xlog_in_core {
 /*
 * Defines to save our code from this glop.
 */
-#define ic_forcesema    hic_fields.ic_forcesema
+#define ic_force_wait   hic_fields.ic_force_wait
-#define ic_writesema    hic_fields.ic_writesema
+#define ic_write_wait   hic_fields.ic_write_wait
 #define ic_next         hic_fields.ic_next
 #define ic_prev         hic_fields.ic_prev
 #define ic_bp           hic_fields.ic_bp
@@ -468,7 +468,7 @@ extern int	 xlog_find_tail(xlog_t	*log,
                                xfs_daddr_t *head_blk,
                                xfs_daddr_t *tail_blk);
 extern int       xlog_recover(xlog_t *log);
-extern int       xlog_recover_finish(xlog_t *log, int mfsi_flags);
+extern int       xlog_recover_finish(xlog_t *log);
 extern void      xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
 extern void      xlog_recover_process_iunlinks(xlog_t *log);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9eb722ec744..82d46ce69d5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3940,8 +3940,7 @@ xlog_recover(
 */
 int
 xlog_recover_finish(
-        xlog_t          *log,
+        xlog_t          *log)
-        int             mfsi_flags)
 {
        /*
         * Now we're ready to do the transactions needed for the
@@ -3969,9 +3968,7 @@ xlog_recover_finish(
                xfs_log_force(log->l_mp, (xfs_lsn_t)0,
                              (XFS_LOG_FORCE | XFS_LOG_SYNC));
-                if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) {
+                xlog_recover_process_iunlinks(log);
-                        xlog_recover_process_iunlinks(log);
-                }
                xlog_recover_check_summary(log);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6c5d1325e7f..a4503f5e949 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -128,7 +128,7 @@ static const struct {
 * initialized.
 */
 STATIC void
-xfs_mount_free(
+xfs_free_perag(
        xfs_mount_t     *mp)
 {
        if (mp->m_perag) {
@@ -139,20 +139,6 @@ xfs_mount_free(
                                kmem_free(mp->m_perag[agno].pagb_list);
                kmem_free(mp->m_perag);
        }
-        spinlock_destroy(&mp->m_ail_lock);
-        spinlock_destroy(&mp->m_sb_lock);
-        mutex_destroy(&mp->m_ilock);
-        mutex_destroy(&mp->m_growlock);
-        if (mp->m_quotainfo)
-                XFS_QM_DONE(mp);
-        if (mp->m_fsname != NULL)
-                kmem_free(mp->m_fsname);
-        if (mp->m_rtname != NULL)
-                kmem_free(mp->m_rtname);
-        if (mp->m_logname != NULL)
-                kmem_free(mp->m_logname);
 }
 /*
@@ -704,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 * Update alignment values based on mount options and sb values
 */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
 {
        xfs_sb_t        *sbp = &(mp->m_sb);
-        if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
+        if (mp->m_dalign) {
                /*
                 * If stripe unit and stripe width are not multiples
                 * of the fs blocksize turn off alignment.
@@ -864,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
 * Check that the data (and log if separate) are an ok size.
 */
 STATIC int
-xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
+xfs_check_sizes(xfs_mount_t *mp)
 {
        xfs_buf_t       *bp;
        xfs_daddr_t     d;
@@ -887,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
                return error;
        }
-        if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
+        if (mp->m_logdev_targp != mp->m_ddev_targp) {
-            mp->m_logdev_targp != mp->m_ddev_targp) {
                d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
                if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
                        cmn_err(CE_WARN, "XFS: size check 3 failed");
@@ -923,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
 */
 int
 xfs_mountfs(
-        xfs_mount_t     *mp,
+        xfs_mount_t     *mp)
-        int             mfsi_flags)
 {
        xfs_sb_t        *sbp = &(mp->m_sb);
        xfs_inode_t     *rip;
        __uint64_t      resblks;
        __int64_t       update_flags = 0LL;
        uint            quotamount, quotaflags;
-        int             agno;
        int             uuid_mounted = 0;
        int             error = 0;
@@ -985,7 +968,7 @@ xfs_mountfs(
         * allocator alignment is within an ag, therefore ag has
         * to be aligned at stripe boundary.
         */
-        error = xfs_update_alignment(mp, mfsi_flags, &update_flags);
+        error = xfs_update_alignment(mp, &update_flags);
        if (error)
                goto error1;
@@ -1004,8 +987,7 @@ xfs_mountfs(
         * since a single partition filesystem is identical to a single
         * partition volume/filesystem.
         */
-        if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
+        if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
-            (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
                if (xfs_uuid_mount(mp)) {
                        error = XFS_ERROR(EINVAL);
                        goto error1;
@@ -1033,7 +1015,7 @@ xfs_mountfs(
        /*
         * Check that the data (and log if separate) are an ok size.
         */
-        error = xfs_check_sizes(mp, mfsi_flags);
+        error = xfs_check_sizes(mp);
        if (error)
                goto error1;
@@ -1047,13 +1029,6 @@ xfs_mountfs(
        }
        /*
-         * For client case we are done now
-         */
-        if (mfsi_flags & XFS_MFSI_CLIENT) {
-                return 0;
-        }
-        /*
         *  Copies the low order bits of the timestamp and the randomly
         *  set "sequence" number out of a UUID.
         */
@@ -1077,8 +1052,10 @@ xfs_mountfs(
         * Allocate and initialize the per-ag data.
         */
        init_rwsem(&mp->m_peraglock);
-        mp->m_perag =
+        mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-                kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
+                                  KM_MAYFAIL);
+        if (!mp->m_perag)
+                goto error1;
        mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
@@ -1190,7 +1167,7 @@ xfs_mountfs(
         * delayed until after the root and real-time bitmap inodes
         * were consistently read in.
         */
-        error = xfs_log_mount_finish(mp, mfsi_flags);
+        error = xfs_log_mount_finish(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: log mount finish failed");
                goto error4;
@@ -1199,7 +1176,7 @@ xfs_mountfs(
        /*
         * Complete the quota initialisation, post-log-replay component.
         */
-        error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags);
+        error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
        if (error)
                goto error4;
@@ -1233,12 +1210,7 @@ xfs_mountfs(
 error3:
        xfs_log_unmount_dealloc(mp);
 error2:
-        for (agno = 0; agno < sbp->sb_agcount; agno++)
+        xfs_free_perag(mp);
-                if (mp->m_perag[agno].pagb_list)
-                        kmem_free(mp->m_perag[agno].pagb_list);
-        kmem_free(mp->m_perag);
-        mp->m_perag = NULL;
-        /* FALLTHROUGH */
 error1:
        if (uuid_mounted)
                uuid_table_remove(&mp->m_sb.sb_uuid);
@@ -1246,16 +1218,17 @@ xfs_mountfs(
 }
 /*
- * xfs_unmountfs
- *
 * This flushes out the inodes,dquots and the superblock, unmounts the
 * log and makes sure that incore structures are freed.
 */
-int
+void
-xfs_unmountfs(xfs_mount_t *mp)
+xfs_unmountfs(
+        struct xfs_mount        *mp)
 {
-        __uint64_t      resblks;
+        __uint64_t              resblks;
-        int             error = 0;
+        int                     error;
+        IRELE(mp->m_rootip);
        /*
         * We can potentially deadlock here if we have an inode cluster
@@ -1312,8 +1285,6 @@ xfs_unmountfs(xfs_mount_t *mp)
        xfs_unmountfs_wait(mp);                 /* wait for async bufs */
        xfs_log_unmount(mp);                    /* Done! No more fs ops. */
-        xfs_freesb(mp);
        /*
         * All inodes from this mount point should be freed.
         */
@@ -1322,11 +1293,12 @@ xfs_unmountfs(xfs_mount_t *mp)
        if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
                uuid_table_remove(&mp->m_sb.sb_uuid);
-#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+#if defined(DEBUG)
        xfs_errortag_clearall(mp, 0);
 #endif
-        xfs_mount_free(mp);
+        xfs_free_perag(mp);
-        return 0;
+        if (mp->m_quotainfo)
+                XFS_QM_DONE(mp);
 }
 STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5269bd6e3df..f3c1024b124 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -114,7 +114,7 @@ struct xfs_dqtrxops;
 struct xfs_quotainfo;
 typedef int     (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int     (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int);
+typedef int     (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
 typedef int     (*xfs_qmunmount_t)(struct xfs_mount *);
 typedef void    (*xfs_qmdone_t)(struct xfs_mount *);
 typedef void    (*xfs_dqrele_t)(struct xfs_dquot *);
@@ -158,8 +158,8 @@ typedef struct xfs_qmops {
 #define XFS_QM_INIT(mp, mnt, fl) \
        (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \
+#define XFS_QM_MOUNT(mp, mnt, fl) \
-        (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags)
+        (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
 #define XFS_QM_UNMOUNT(mp) \
        (*(mp)->m_qm_ops->xfs_qmunmount)(mp)
 #define XFS_QM_DONE(mp) \
@@ -442,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 /*
 * Flags for xfs_mountfs
 */
-#define XFS_MFSI_SECOND         0x01    /* Secondary mount -- skip stuff */
-#define XFS_MFSI_CLIENT         0x02    /* Is a client -- skip lots of stuff */
-/*      XFS_MFSI_RRINODES       */
-#define XFS_MFSI_NOUNLINK       0x08    /* Skip unlinked inode processing in */
-                                        /* log recovery */
-#define XFS_MFSI_NO_QUOTACHECK  0x10    /* Skip quotacheck processing */
-/*      XFS_MFSI_CONVERT_SUNIT  */
 #define XFS_MFSI_QUIET          0x40    /* Be silent if mount errors found */
 #define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
@@ -517,10 +510,10 @@ typedef struct xfs_mod_sb {
 extern void     xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern int      xfs_log_sbcount(xfs_mount_t *, uint);
-extern int      xfs_mountfs(xfs_mount_t *mp, int);
+extern int      xfs_mountfs(xfs_mount_t *mp);
 extern void     xfs_mountfs_check_barriers(xfs_mount_t *mp);
-extern int      xfs_unmountfs(xfs_mount_t *);
+extern void     xfs_unmountfs(xfs_mount_t *);
 extern int      xfs_unmountfs_writesb(xfs_mount_t *);
 extern int      xfs_unmount_flush(xfs_mount_t *, int);
 extern int      xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index bf87a591350..e2f68de1615 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
 */
 /*
- * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
- */
-STATIC int
-xfs_lowbit32(
-        __uint32_t      v)
-{
-        if (v)
-                return ffs(v) - 1;
-        return -1;
-}
-/*
 * Allocate space to the bitmap or summary file, and zero it, for growfs.
 */
 STATIC int                              /* error */
@@ -450,6 +438,7 @@ xfs_rtallocate_extent_near(
        }
        bbno = XFS_BITTOBLOCK(mp, bno);
        i = 0;
+        ASSERT(minlen != 0);
        log2len = xfs_highbit32(minlen);
        /*
         * Loop over all bitmap blocks (bbno + i is current block).
@@ -618,6 +607,8 @@ xfs_rtallocate_extent_size(
        xfs_suminfo_t   sum;            /* summary information for extents */
        ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+        ASSERT(maxlen != 0);
        /*
         * Loop over all the levels starting with maxlen.
         * At each level, look at all the bitmap blocks, to see if there
@@ -675,6 +666,9 @@ xfs_rtallocate_extent_size(
                *rtblock = NULLRTBLOCK;
                return 0;
        }
+        ASSERT(minlen != 0);
+        ASSERT(maxlen != 0);
        /*
         * Loop over sizes, from maxlen down to minlen.
         * This time, when we do the allocations, allow smaller ones
@@ -1961,6 +1955,7 @@ xfs_growfs_rt(
                                  nsbp->sb_blocksize * nsbp->sb_rextsize);
                nsbp->sb_rextents = nsbp->sb_rblocks;
                do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+                ASSERT(nsbp->sb_rextents != 0);
                nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
                nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
                nrsumsize =
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index b0f31c09a76..3a82576dde9 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -314,7 +314,7 @@ xfs_bioerror_relse(
                 * ASYNC buffers.
                 */
                XFS_BUF_ERROR(bp, EIO);
-                XFS_BUF_V_IODONESEMA(bp);
+                XFS_BUF_FINISH_IOWAIT(bp);
        } else {
                xfs_buf_relse(bp);
        }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index e4ebddd3c50..4e1c22a23be 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -43,6 +43,7 @@
 #include "xfs_quota.h"
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
 STATIC void     xfs_trans_apply_sb_deltas(xfs_trans_t *);
@@ -253,7 +254,7 @@ _xfs_trans_alloc(
        tp->t_mountp = mp;
        tp->t_items_free = XFS_LIC_NUM_SLOTS;
        tp->t_busy_free = XFS_LBC_NUM_SLOTS;
-        XFS_LIC_INIT(&(tp->t_items));
+        xfs_lic_init(&(tp->t_items));
        XFS_LBC_INIT(&(tp->t_busy));
        return tp;
 }
@@ -282,7 +283,7 @@ xfs_trans_dup(
        ntp->t_mountp = tp->t_mountp;
        ntp->t_items_free = XFS_LIC_NUM_SLOTS;
        ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
-        XFS_LIC_INIT(&(ntp->t_items));
+        xfs_lic_init(&(ntp->t_items));
        XFS_LBC_INIT(&(ntp->t_busy));
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1169,7 +1170,7 @@ xfs_trans_cancel(
                while (licp != NULL) {
                        lidp = licp->lic_descs;
                        for (i = 0; i < licp->lic_unused; i++, lidp++) {
-                                if (XFS_LIC_ISFREE(licp, i)) {
+                                if (xfs_lic_isfree(licp, i)) {
                                        continue;
                                }
@@ -1216,6 +1217,68 @@ xfs_trans_free(
        kmem_zone_free(xfs_trans_zone, tp);
 }
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+        struct xfs_trans        **tpp,
+        struct xfs_inode        *dp)
+{
+        struct xfs_trans        *trans;
+        unsigned int            logres, count;
+        int                     error;
+        /*
+         * Ensure that the inode is always logged.
+         */
+        trans = *tpp;
+        xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+        /*
+         * Copy the critical parameters from one trans to the next.
+         */
+        logres = trans->t_log_res;
+        count = trans->t_log_count;
+        *tpp = xfs_trans_dup(trans);
+        /*
+         * Commit the current transaction.
+         * If this commit failed, then it'd just unlock those items that
+         * are not marked ihold. That also means that a filesystem shutdown
+         * is in progress. The caller takes the responsibility to cancel
+         * the duplicate transaction that gets returned.
+         */
+        error = xfs_trans_commit(trans, 0);
+        if (error)
+                return (error);
+        trans = *tpp;
+        /*
+         * Reserve space in the log for the next transaction.
+         * This also pushes items in the "AIL", the list of logged items,
+         * out to disk if they are taking up space at the tail of the log
+         * that we want to use.  This requires that either nothing be locked
+         * across this call, or that anything that is locked be logged in
+         * the prior and the next transactions.
+         */
+        error = xfs_trans_reserve(trans, 0, logres, 0,
+                                  XFS_TRANS_PERM_LOG_RES, count);
+        /*
+         *  Ensure that the inode is in the new transaction and locked.
+         */
+        if (error)
+                return error;
+        xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+        xfs_trans_ihold(trans, dp);
+        return 0;
+}
 /*
 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
@@ -1253,7 +1316,7 @@ xfs_trans_committed(
         * Special case the chunk embedded in the transaction.
         */
        licp = &(tp->t_items);
-        if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
+        if (!(xfs_lic_are_all_free(licp))) {
                xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
        }
@@ -1262,7 +1325,7 @@ xfs_trans_committed(
         */
        licp = licp->lic_next;
        while (licp != NULL) {
-                ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+                ASSERT(!xfs_lic_are_all_free(licp));
                xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
                next_licp = licp->lic_next;
                kmem_free(licp);
@@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed(
        lidp = licp->lic_descs;
        for (i = 0; i < licp->lic_unused; i++, lidp++) {
-                if (XFS_LIC_ISFREE(licp, i)) {
+                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0804207c739..74c80bd2b0e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk {
 * lic_unused to the right value (0 matches all free).  The
 * lic_descs.lid_index values are set up as each desc is allocated.
 */
-#define XFS_LIC_INIT(cp)        xfs_lic_init(cp)
 static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
 {
        cp->lic_free = XFS_LIC_FREEMASK;
 }
-#define XFS_LIC_INIT_SLOT(cp,slot)      xfs_lic_init_slot(cp, slot)
 static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
 {
        cp->lic_descs[slot].lid_index = (unsigned char)(slot);
 }
-#define XFS_LIC_VACANCY(cp)             xfs_lic_vacancy(cp)
 static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
 {
        return cp->lic_free & XFS_LIC_FREEMASK;
 }
-#define XFS_LIC_ALL_FREE(cp)            xfs_lic_all_free(cp)
 static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
 {
        cp->lic_free = XFS_LIC_FREEMASK;
 }
-#define XFS_LIC_ARE_ALL_FREE(cp)        xfs_lic_are_all_free(cp)
 static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
 {
        return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
 }
-#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
 static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
 {
        return (cp->lic_free & (1 << slot));
 }
-#define XFS_LIC_CLAIM(cp,slot)          xfs_lic_claim(cp,slot)
 static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
 {
        cp->lic_free &= ~(1 << slot);
 }
-#define XFS_LIC_RELSE(cp,slot)          xfs_lic_relse(cp,slot)
 static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
 {
        cp->lic_free |= 1 << slot;
 }
-#define XFS_LIC_SLOT(cp,slot)           xfs_lic_slot(cp,slot)
 static inline xfs_log_item_desc_t *
 xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
 {
        return &(cp->lic_descs[slot]);
 }
-#define XFS_LIC_DESC_TO_SLOT(dp)        xfs_lic_desc_to_slot(dp)
 static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
 {
        return (uint)dp->lid_index;
@@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
 * All of this yields the address of the chunk, which is
 * cast to a chunk pointer.
 */
-#define XFS_LIC_DESC_TO_CHUNK(dp)       xfs_lic_desc_to_chunk(dp)
 static inline xfs_log_item_chunk_t *
 xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 {
@@ -986,6 +975,7 @@ int		_xfs_trans_commit(xfs_trans_t *,
                                  int *);
 #define xfs_trans_commit(tp, flags)     _xfs_trans_commit(tp, flags, NULL)
 void            xfs_trans_cancel(xfs_trans_t *, int);
+int             xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 int             xfs_trans_ail_init(struct xfs_mount *);
 void            xfs_trans_ail_destroy(struct xfs_mount *);
 void            xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index cb0c5839154..4e855b5ced6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match(
        bp = NULL;
        len = BBTOB(len);
        licp = &tp->t_items;
-        if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+        if (!xfs_lic_are_all_free(licp)) {
                for (i = 0; i < licp->lic_unused; i++) {
                        /*
                         * Skip unoccupied slots.
                         */
-                        if (XFS_LIC_ISFREE(licp, i)) {
+                        if (xfs_lic_isfree(licp, i)) {
                                continue;
                        }
-                        lidp = XFS_LIC_SLOT(licp, i);
+                        lidp = xfs_lic_slot(licp, i);
                        blip = (xfs_buf_log_item_t *)lidp->lid_item;
                        if (blip->bli_item.li_type != XFS_LI_BUF) {
                                continue;
@@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all(
        bp = NULL;
        len = BBTOB(len);
        for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-                if (XFS_LIC_ARE_ALL_FREE(licp)) {
+                if (xfs_lic_are_all_free(licp)) {
                        ASSERT(licp == &tp->t_items);
                        ASSERT(licp->lic_next == NULL);
                        return NULL;
@@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all(
                        /*
                         * Skip unoccupied slots.
                         */
-                        if (XFS_LIC_ISFREE(licp, i)) {
+                        if (xfs_lic_isfree(licp, i)) {
                                continue;
                        }
-                        lidp = XFS_LIC_SLOT(licp, i);
+                        lidp = xfs_lic_slot(licp, i);
                        blip = (xfs_buf_log_item_t *)lidp->lid_item;
                        if (blip->bli_item.li_type != XFS_LI_BUF) {
                                continue;
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index db5c8359552..3c666e8317f 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
                 * Initialize the chunk, and then
                 * claim the first slot in the newly allocated chunk.
                 */
-                XFS_LIC_INIT(licp);
+                xfs_lic_init(licp);
-                XFS_LIC_CLAIM(licp, 0);
+                xfs_lic_claim(licp, 0);
                licp->lic_unused = 1;
-                XFS_LIC_INIT_SLOT(licp, 0);
+                xfs_lic_init_slot(licp, 0);
-                lidp = XFS_LIC_SLOT(licp, 0);
+                lidp = xfs_lic_slot(licp, 0);
                /*
                 * Link in the new chunk and update the free count.
@@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
         */
        licp = &tp->t_items;
        while (licp != NULL) {
-                if (XFS_LIC_VACANCY(licp)) {
+                if (xfs_lic_vacancy(licp)) {
                        if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
                                i = licp->lic_unused;
-                                ASSERT(XFS_LIC_ISFREE(licp, i));
+                                ASSERT(xfs_lic_isfree(licp, i));
                                break;
                        }
                        for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
-                                if (XFS_LIC_ISFREE(licp, i))
+                                if (xfs_lic_isfree(licp, i))
                                        break;
                        }
                        ASSERT(i <= XFS_LIC_MAX_SLOT);
@@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
         * If we find a free descriptor, claim it,
         * initialize it, and return it.
         */
-        XFS_LIC_CLAIM(licp, i);
+        xfs_lic_claim(licp, i);
        if (licp->lic_unused <= i) {
                licp->lic_unused = i + 1;
-                XFS_LIC_INIT_SLOT(licp, i);
+                xfs_lic_init_slot(licp, i);
        }
-        lidp = XFS_LIC_SLOT(licp, i);
+        lidp = xfs_lic_slot(licp, i);
        tp->t_items_free--;
        lidp->lid_item = lip;
        lidp->lid_flags = 0;
@@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t	*tp, xfs_log_item_desc_t *lidp)
        xfs_log_item_chunk_t    *licp;
        xfs_log_item_chunk_t    **licpp;
-        slot = XFS_LIC_DESC_TO_SLOT(lidp);
+        slot = xfs_lic_desc_to_slot(lidp);
-        licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+        licp = xfs_lic_desc_to_chunk(lidp);
-        XFS_LIC_RELSE(licp, slot);
+        xfs_lic_relse(licp, slot);
        lidp->lid_item->li_desc = NULL;
        tp->t_items_free++;
@@ -154,7 +154,7 @@ xfs_trans_free_item(xfs_trans_t	*tp, xfs_log_item_desc_t *lidp)
         * Also decrement the transaction structure's count of free items
         * by the number in a chunk since we are freeing an empty chunk.
         */
-        if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
+        if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
                licpp = &(tp->t_items.lic_next);
                while (*licpp != licp) {
                        ASSERT(*licpp != NULL);
@@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp)
        /*
         * If it's not in the first chunk, skip to the second.
         */
-        if (XFS_LIC_ARE_ALL_FREE(licp)) {
+        if (xfs_lic_are_all_free(licp)) {
                licp = licp->lic_next;
        }
        /*
         * Return the first non-free descriptor in the chunk.
         */
-        ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+        ASSERT(!xfs_lic_are_all_free(licp));
        for (i = 0; i < licp->lic_unused; i++) {
-                if (XFS_LIC_ISFREE(licp, i)) {
+                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
-                return XFS_LIC_SLOT(licp, i);
+                return xfs_lic_slot(licp, i);
        }
        cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
        return NULL;
@@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
        xfs_log_item_chunk_t    *licp;
        int                     i;
-        licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+        licp = xfs_lic_desc_to_chunk(lidp);
        /*
         * First search the rest of the chunk. The for loop keeps us
         * from referencing things beyond the end of the chunk.
         */
-        for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
+        for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
-                if (XFS_LIC_ISFREE(licp, i)) {
+                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
-                return XFS_LIC_SLOT(licp, i);
+                return xfs_lic_slot(licp, i);
        }
        /*
@@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
        }
        licp = licp->lic_next;
-        ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+        ASSERT(!xfs_lic_are_all_free(licp));
        for (i = 0; i < licp->lic_unused; i++) {
-                if (XFS_LIC_ISFREE(licp, i)) {
+                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
-                return XFS_LIC_SLOT(licp, i);
+                return xfs_lic_slot(licp, i);
        }
        ASSERT(0);
        /* NOTREACHED */
@@ -300,9 +300,9 @@ xfs_trans_free_items(
        /*
         * Special case the embedded chunk so we don't free it below.
         */
-        if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+        if (!xfs_lic_are_all_free(licp)) {
                (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
-                XFS_LIC_ALL_FREE(licp);
+                xfs_lic_all_free(licp);
                licp->lic_unused = 0;
        }
        licp = licp->lic_next;
@@ -311,7 +311,7 @@ xfs_trans_free_items(
         * Unlock each item in each chunk and free the chunks.
         */
        while (licp != NULL) {
-                ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+                ASSERT(!xfs_lic_are_all_free(licp));
                (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
                next_licp = licp->lic_next;
                kmem_free(licp);
@@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
        /*
         * Special case the embedded chunk so we don't free.
         */
-        if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+        if (!xfs_lic_are_all_free(licp)) {
                freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
        }
        licpp = &(tp->t_items.lic_next);
@@ -358,10 +358,10 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
         * and free empty chunks.
         */
        while (licp != NULL) {
-                ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+                ASSERT(!xfs_lic_are_all_free(licp));
                freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
                next_licp = licp->lic_next;
-                if (XFS_LIC_ARE_ALL_FREE(licp)) {
+                if (xfs_lic_are_all_free(licp)) {
                        *licpp = next_licp;
                        kmem_free(licp);
                        freed -= XFS_LIC_NUM_SLOTS;
@@ -402,7 +402,7 @@ xfs_trans_unlock_chunk(
        freed = 0;
        lidp = licp->lic_descs;
        for (i = 0; i < licp->lic_unused; i++, lidp++) {
-                if (XFS_LIC_ISFREE(licp, i)) {
+                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
                lip = lidp->lid_item;
@@ -421,7 +421,7 @@ xfs_trans_unlock_chunk(
                 */
                if (!(freeing_chunk) &&
                    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
-                        XFS_LIC_RELSE(licp, i);
+                        xfs_lic_relse(licp, i);
                        freed++;
                }
        }
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 98e5f110ba5..35d4d414bcc 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -237,7 +237,7 @@ xfs_droplink(
        ASSERT (ip->i_d.di_nlink > 0);
        ip->i_d.di_nlink--;
-        drop_nlink(ip->i_vnode);
+        drop_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        error = 0;
@@ -301,7 +301,7 @@ xfs_bumplink(
        ASSERT(ip->i_d.di_nlink > 0);
        ip->i_d.di_nlink++;
-        inc_nlink(ip->i_vnode);
+        inc_nlink(VFS_I(ip));
        if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
            (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
                /*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f316cb85d8e..ef321225d26 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,9 +18,6 @@
 #ifndef __XFS_UTILS_H__
 #define __XFS_UTILS_H__
-#define IRELE(ip)       VN_RELE(XFS_ITOV(ip))
-#define IHOLD(ip)       VN_HOLD(XFS_ITOV(ip))
 extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
                                xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 4a9a43315a8..439dd3939dd 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -128,7 +128,6 @@ xfs_unmount_flush(
        xfs_inode_t     *rip = mp->m_rootip;
        xfs_inode_t     *rbmip;
        xfs_inode_t     *rsumip = NULL;
-        bhv_vnode_t     *rvp = XFS_ITOV(rip);
        int             error;
        xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -146,7 +145,7 @@ xfs_unmount_flush(
                if (error == EFSCORRUPTED)
                        goto fscorrupt_out;
-                ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
+                ASSERT(vn_count(VFS_I(rbmip)) == 1);
                rsumip = mp->m_rsumip;
                xfs_ilock(rsumip, XFS_ILOCK_EXCL);
@@ -157,7 +156,7 @@ xfs_unmount_flush(
                if (error == EFSCORRUPTED)
                        goto fscorrupt_out;
-                ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
+                ASSERT(vn_count(VFS_I(rsumip)) == 1);
        }
        /*
@@ -167,7 +166,7 @@ xfs_unmount_flush(
        if (error == EFSCORRUPTED)
                goto fscorrupt_out2;
-        if (vn_count(rvp) != 1 && !relocation) {
+        if (vn_count(VFS_I(rip)) != 1 && !relocation) {
                xfs_iunlock(rip, XFS_ILOCK_EXCL);
                return XFS_ERROR(EBUSY);
        }
@@ -284,7 +283,7 @@ xfs_sync_inodes(
        int             *bypassed)
 {
        xfs_inode_t     *ip = NULL;
-        bhv_vnode_t     *vp = NULL;
+        struct inode    *vp = NULL;
        int             error;
        int             last_error;
        uint64_t        fflag;
@@ -404,7 +403,7 @@ xfs_sync_inodes(
                        continue;
                }
-                vp = XFS_ITOV_NULL(ip);
+                vp = VFS_I(ip);
                /*
                 * If the vnode is gone then this is being torn down,
@@ -479,7 +478,7 @@ xfs_sync_inodes(
                        IPOINTER_INSERT(ip, mp);
                        xfs_ilock(ip, lock_flags);
-                        ASSERT(vp == XFS_ITOV(ip));
+                        ASSERT(vp == VFS_I(ip));
                        ASSERT(ip->i_mount == mp);
                        vnode_refed = B_TRUE;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 76a1166af82..aa238c8fbd7 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -83,7 +83,7 @@ xfs_setattr(
        cred_t                  *credp)
 {
        xfs_mount_t             *mp = ip->i_mount;
-        struct inode            *inode = XFS_ITOV(ip);
+        struct inode            *inode = VFS_I(ip);
        int                     mask = iattr->ia_valid;
        xfs_trans_t             *tp;
        int                     code;
@@ -182,7 +182,7 @@ xfs_setattr(
        xfs_ilock(ip, lock_flags);
        /* boolean: are we the file owner? */
-        file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
+        file_owner = (current_fsuid() == ip->i_d.di_uid);
        /*
         * Change various properties of a file.
@@ -513,7 +513,6 @@ xfs_setattr(
                        ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
                        ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
                        ip->i_update_core = 1;
-                        timeflags &= ~XFS_ICHGTIME_ACC;
                }
                if (mask & ATTR_MTIME) {
                        inode->i_mtime = iattr->ia_mtime;
@@ -714,7 +713,7 @@ xfs_fsync(
                return XFS_ERROR(EIO);
        /* capture size updates in I/O completion before writing the inode. */
-        error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        error = filemap_fdatawait(VFS_I(ip)->i_mapping);
        if (error)
                return XFS_ERROR(error);
@@ -1160,7 +1159,6 @@ int
 xfs_release(
        xfs_inode_t     *ip)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_mount_t     *mp = ip->i_mount;
        int             error;
@@ -1195,13 +1193,13 @@ xfs_release(
                 * be exposed to that problem.
                 */
                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
-                if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+                if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
                        xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
        }
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
                    (!(ip->i_d.di_flags &
@@ -1227,7 +1225,6 @@ int
 xfs_inactive(
        xfs_inode_t     *ip)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
@@ -1242,7 +1239,7 @@ xfs_inactive(
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
-        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
+        if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
@@ -1272,7 +1269,7 @@ xfs_inactive(
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
@@ -1536,7 +1533,7 @@ xfs_create(
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(credp), current_fsgid(credp), prid,
+                        current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
@@ -1708,111 +1705,6 @@ std_return:
 }
 #ifdef DEBUG
-/*
- * Some counters to see if (and how often) we are hitting some deadlock
- * prevention code paths.
- */
-int xfs_rm_locks;
-int xfs_rm_lock_delays;
-int xfs_rm_attempts;
-#endif
-/*
- * The following routine will lock the inodes associated with the
- * directory and the named entry in the directory. The locks are
- * acquired in increasing inode number.
- *
- * If the entry is "..", then only the directory is locked. The
- * vnode ref count will still include that from the .. entry in
- * this case.
- *
- * There is a deadlock we need to worry about. If the locked directory is
- * in the AIL, it might be blocking up the log. The next inode we lock
- * could be already locked by another thread waiting for log space (e.g
- * a permanent log reservation with a long running transaction (see
- * xfs_itruncate_finish)). To solve this, we must check if the directory
- * is in the ail and use lock_nowait. If we can't lock, we need to
- * drop the inode lock on the directory and try again. xfs_iunlock will
- * potentially push the tail if we were holding up the log.
- */
-STATIC int
-xfs_lock_dir_and_entry(
-        xfs_inode_t     *dp,
-        xfs_inode_t     *ip)    /* inode of entry 'name' */
-{
-        int             attempts;
-        xfs_ino_t       e_inum;
-        xfs_inode_t     *ips[2];
-        xfs_log_item_t  *lp;
-#ifdef DEBUG
-        xfs_rm_locks++;
-#endif
-        attempts = 0;
-again:
-        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-        e_inum = ip->i_ino;
-        xfs_itrace_ref(ip);
-        /*
-         * We want to lock in increasing inum. Since we've already
-         * acquired the lock on the directory, we may need to release
-         * if if the inum of the entry turns out to be less.
-         */
-        if (e_inum > dp->i_ino) {
-                /*
-                 * We are already in the right order, so just
-                 * lock on the inode of the entry.
-                 * We need to use nowait if dp is in the AIL.
-                 */
-                lp = (xfs_log_item_t *)dp->i_itemp;
-                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-                        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                attempts++;
-#ifdef DEBUG
-                                xfs_rm_attempts++;
-#endif
-                                /*
-                                 * Unlock dp and try again.
-                                 * xfs_iunlock will try to push the tail
-                                 * if the inode is in the AIL.
-                                 */
-                                xfs_iunlock(dp, XFS_ILOCK_EXCL);
-                                if ((attempts % 5) == 0) {
-                                        delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
-                                        xfs_rm_lock_delays++;
-#endif
-                                }
-                                goto again;
-                        }
-                } else {
-                        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                }
-        } else if (e_inum < dp->i_ino) {
-                xfs_iunlock(dp, XFS_ILOCK_EXCL);
-                ips[0] = ip;
-                ips[1] = dp;
-                xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
-        }
-        /* else  e_inum == dp->i_ino */
-        /*     This can happen if we're asked to lock /x/..
-         *     the entry is "..", which is also the parent directory.
-         */
-        return 0;
-}
-#ifdef DEBUG
 int xfs_locked_n;
 int xfs_small_retries;
 int xfs_middle_retries;
@@ -1946,6 +1838,45 @@ again:
 #endif
 }
+/*
+ * Lock two different inodes with the given lock mode, always taking the
+ * one with the lower inode number first.
+ *
+ * ip0, ip1:	the inodes to lock; they may be passed in either order but
+ *		must not have the same inode number (asserted below).
+ * lock_mode:	lock flags handed to xfs_ilock()/xfs_ilock_nowait() after
+ *		being passed through xfs_lock_inumorder() with position 0 for
+ *		the first inode and 1 for the second (presumably a lock-ordering
+ *		annotation; confirm against xfs_lock_inumorder()).
+ *
+ * The function has no return value; every path that leaves it has taken
+ * both locks. If the first inode's log item is flagged XFS_LI_IN_AIL, the
+ * second lock is only attempted with xfs_ilock_nowait(); on failure the
+ * first lock is dropped and the whole sequence is retried, with a delay(1)
+ * on every fifth failed attempt.
+ */
+void
+xfs_lock_two_inodes(
+        xfs_inode_t             *ip0,
+        xfs_inode_t             *ip1,
+        uint                    lock_mode)
+{
+        xfs_inode_t             *temp;
+        int                     attempts = 0;
+        xfs_log_item_t          *lp;
+        ASSERT(ip0->i_ino != ip1->i_ino);
+        /* Normalise the pair so that ip0 is the lower-numbered inode. */
+        if (ip0->i_ino > ip1->i_ino) {
+                temp = ip0;
+                ip0 = ip1;
+                ip1 = temp;
+        }
+ again:
+        xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+        /*
+         * If the first lock we have locked is in the AIL, we must TRY to get
+         * the second lock. If we can't get it, we must release the first one
+         * and try again.
+         */
+        lp = (xfs_log_item_t *)ip0->i_itemp;
+        if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+                if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+                        /*
+                         * NOTE(review): the unlock passes the raw lock_mode,
+                         * not the xfs_lock_inumorder() value used to acquire;
+                         * presumably the ordering bits only matter when
+                         * locking - confirm against xfs_iunlock().
+                         */
+                        xfs_iunlock(ip0, lock_mode);
+                        /* Back off on every fifth consecutive failure. */
+                        if ((++attempts % 5) == 0)
+                                delay(1); /* Don't just spin the CPU */
+                        goto again;
+                }
+        } else {
+                /* ip0 is not flagged as in the AIL: plain xfs_ilock(). */
+                xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+        }
+}
 int
 xfs_remove(
        xfs_inode_t             *dp,
@@ -2018,9 +1949,7 @@ xfs_remove(
                goto out_trans_cancel;
        }
-        error = xfs_lock_dir_and_entry(dp, ip);
+        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-        if (error)
-                goto out_trans_cancel;
        /*
         * At this point, we've gotten both the directory and the entry
@@ -2047,9 +1976,6 @@ xfs_remove(
                }
        }
-        /*
-         * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
-         */
        XFS_BMAP_INIT(&free_list, &first_block);
        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks);
@@ -2155,7 +2081,6 @@ xfs_link(
 {
        xfs_mount_t             *mp = tdp->i_mount;
        xfs_trans_t             *tp;
-        xfs_inode_t             *ips[2];
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
@@ -2203,15 +2128,7 @@ xfs_link(
                goto error_return;
        }
-        if (sip->i_ino < tdp->i_ino) {
+        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-                ips[0] = sip;
-                ips[1] = tdp;
-        } else {
-                ips[0] = tdp;
-                ips[1] = sip;
-        }
-        xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
        /*
         * Increment vnode ref counts since xfs_trans_commit &
@@ -2352,7 +2269,7 @@ xfs_mkdir(
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(credp), current_fsgid(credp), prid,
+                        current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
@@ -2578,7 +2495,7 @@ xfs_symlink(
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(credp), current_fsgid(credp), prid,
+                        current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
@@ -2873,14 +2790,13 @@ int
 xfs_reclaim(
        xfs_inode_t     *ip)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_itrace_entry(ip);
-        ASSERT(!VN_MAPPED(vp));
+        ASSERT(!VN_MAPPED(VFS_I(ip)));
        /* bad inode, get out here ASAP */
-        if (VN_BAD(vp)) {
+        if (VN_BAD(VFS_I(ip))) {
                xfs_ireclaim(ip);
                return 0;
        }
@@ -2917,7 +2833,7 @@ xfs_reclaim(
                XFS_MOUNT_ILOCK(mp);
                spin_lock(&ip->i_flags_lock);
                __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-                vn_to_inode(vp)->i_private = NULL;
+                VFS_I(ip)->i_private = NULL;
                ip->i_vnode = NULL;
                spin_unlock(&ip->i_flags_lock);
                list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
@@ -2933,7 +2849,7 @@ xfs_finish_reclaim(
        int             sync_mode)
 {
        xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
+        struct inode    *vp = VFS_I(ip);
        if (vp && VN_BAD(vp))
                goto reclaim;
@@ -3321,7 +3237,6 @@ xfs_free_file_space(
        xfs_off_t               len,
        int                     attr_flags)
 {
-        bhv_vnode_t             *vp;
        int                     committed;
        int                     done;
        xfs_off_t               end_dmi_offset;
@@ -3341,7 +3256,6 @@ xfs_free_file_space(
        xfs_trans_t             *tp;
        int                     need_iolock = 1;
-        vp = XFS_ITOV(ip);
        mp = ip->i_mount;
        xfs_itrace_entry(ip);
@@ -3378,7 +3292,7 @@ xfs_free_file_space(
        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
        ioffset = offset & ~(rounding - 1);
-        if (VN_CACHED(vp) != 0) {
+        if (VN_CACHED(VFS_I(ip)) != 0) {
                xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
                error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
                if (error)
author	H. Peter Anvin <hpa@zytor.com>	2008-09-04 12:04:45 -0400
committer	H. Peter Anvin <hpa@zytor.com>	2008-09-04 12:04:45 -0400
commit	fe47784ba5cbb6b713c013e046859946789b45e4 (patch)
tree	6384958d55e29be0d2eb8ae78fa437c10636d8d6 /fs
parent	83b8e28b14d63db928cb39e5c5ed2a548246bd71 (diff)
parent	af2e1f276ff08f17192411ea3b71c13a758dfe12 (diff)