Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6: (292 commits) [GFS2] Fix endian bug for de_type [GFS2] Initialize SELinux extended attributes at inode creation time. [GFS2] Move logging code into log.c (mostly) [GFS2] Mark nlink cleared so VFS sees it happen [GFS2] Two redundant casts removed [GFS2] Remove uneeded endian conversion [GFS2] Remove duplicate sb reading code [GFS2] Mark metadata reads for blktrace [GFS2] Remove iflags.h, use FS_ [GFS2] Fix code style/indent in ops_file.c [GFS2] streamline-generic_file_-interfaces-and-filemap gfs fix [GFS2] Remove readv/writev methods and use aio_read/aio_write instead (gfs bits) [GFS2] inode-diet: Eliminate i_blksize from the inode structure [GFS2] inode_diet: Replace inode.u.generic_ip with inode.i_private (gfs) [GFS2] Fix typo in last patch [GFS2] Fix direct i/o logic in filemap.c [GFS2] Fix bug in Makefiles for lock modules [GFS2] Remove (extra) fs_subsys declaration [GFS2/DLM] Fix trailing whitespace [GFS2] Tidy up meta_io code ...
author: Linus Torvalds <torvalds@g5.osdl.org> 2006-10-04 12:06:16 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-10-04 12:06:16 -0400
commit: 4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1 (patch)
tree: a2054556900af8c16fd9f5419f012dcf1ee2995a /fs
parent: d002ec481c24f325ed6cfcb7810d317c015dd1b5 (diff)
parent: 7ecdb70a0ea436c06540140242bfac6ac3babfc0 (diff)
114 files changed, 38977 insertions, 1 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 674cfbb83a95..599de54451af 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -325,6 +325,7 @@ config FS_POSIX_ACL
        default n
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 config OCFS2_FS
        tristate "OCFS2 file system support"
@@ -1995,6 +1996,7 @@ endmenu
 endif
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index fd24d67a7cdb..df614eacee86 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y                           += devpts/
 obj-$(CONFIG_PROFILING)         += dcookies.o
+obj-$(CONFIG_DLM)               += dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)       += reiserfs/
@@ -110,3 +111,4 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)             += hppfs/
 obj-$(CONFIG_DEBUG_FS)          += debugfs/
 obj-$(CONFIG_OCFS2_FS)          += ocfs2/
+obj-$(CONFIG_GFS2_FS)           += gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
+menu "Distributed Lock Manager"
+        depends on INET && EXPERIMENTAL
+config DLM
+        tristate "Distributed Lock Manager (DLM)"
+        depends on IPV6 || IPV6=n
+        depends on IP_SCTP
+        select CONFIGFS_FS
+        help
+        A general purpose distributed lock manager for kernel or userspace
+        applications.
+config DLM_DEBUG
+        bool "DLM debugging"
+        depends on DLM
+        help
+        Under the debugfs mount point, the name of each lockspace will
+        appear as a file in the "dlm" directory.  The output is the
+        list of resources and locks the local node knows about.
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) +=            dlm.o
+dlm-y :=                        ast.o \
+                                config.o \
+                                dir.o \
+                                lock.o \
+                                lockspace.o \
+                                lowcomms.o \
+                                main.o \
+                                member.o \
+                                memory.o \
+                                midcomms.o \
+                                rcom.o \
+                                recover.o \
+                                recoverd.o \
+                                requestqueue.o \
+                                user.o \
+                                util.o
+dlm-$(CONFIG_DLM_DEBUG) +=      debug_fs.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+#define WAKE_ASTS  0
+static struct list_head         ast_queue;        /* lkbs with a pending AST, linked through lkb_astqueue */
+static spinlock_t               ast_queue_lock;   /* guards ast_queue and the lkb_ast_type updates made in this file */
+static struct task_struct *     astd_task;        /* the "dlm_astd" kthread created by dlm_astd_start() */
+static unsigned long            astd_wakeflags;   /* bit WAKE_ASTS: astd has been asked to process the queue */
+static struct mutex             astd_running;     /* held by astd around process_asts(); also taken by dlm_astd_suspend() */
+/*
+ * Unlink @lkb from the pending-AST queue.  An lkb is on the queue exactly
+ * when lkb_ast_type has AST_COMP or AST_BAST set, so the unlink is skipped
+ * for anything else.
+ */
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+        const int pending = AST_COMP | AST_BAST;
+        spin_lock(&ast_queue_lock);
+        if ((lkb->lkb_ast_type & pending) != 0)
+                list_del(&lkb->lkb_astqueue);
+        spin_unlock(&ast_queue_lock);
+}
+/*
+ * Record an AST of the given @type for @lkb and wake astd to deliver it.
+ * Locks flagged DLM_IFL_USER are handed to dlm_user_add_ast() instead.
+ */
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+        int already_queued;
+        if (lkb->lkb_flags & DLM_IFL_USER) {
+                dlm_user_add_ast(lkb, type);
+                return;
+        }
+        DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+        spin_lock(&ast_queue_lock);
+        /* a non-zero lkb_ast_type means the lkb is already on ast_queue */
+        already_queued = lkb->lkb_ast_type & (AST_COMP | AST_BAST);
+        if (!already_queued) {
+                /* the queue holds its own reference; process_asts() drops it */
+                kref_get(&lkb->lkb_ref);
+                list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+        }
+        lkb->lkb_ast_type |= type;
+        spin_unlock(&ast_queue_lock);
+        set_bit(WAKE_ASTS, &astd_wakeflags);
+        wake_up_process(astd_task);
+}
+/*
+ * Deliver queued ASTs one lkb at a time.  Each pass takes the first queued
+ * lkb whose lockspace has not stopped locking, unlinks it under
+ * ast_queue_lock, and then runs its callbacks with the lock released.
+ * Returns once no deliverable lkb is left on the queue.
+ */
+static void process_asts(void)
+{
+        struct dlm_lkb *pos, *lkb;
+        void (*cast) (long param);
+        void (*bast) (long param, int mode);
+        int type = 0, bmode;
+        for (;;) {
+                lkb = NULL;
+                spin_lock(&ast_queue_lock);
+                list_for_each_entry(pos, &ast_queue, lkb_astqueue) {
+                        /* entries of stopped lockspaces stay queued */
+                        if (dlm_locking_stopped(pos->lkb_resource->res_ls))
+                                continue;
+                        list_del(&pos->lkb_astqueue);
+                        type = pos->lkb_ast_type;
+                        pos->lkb_ast_type = 0;
+                        lkb = pos;
+                        break;
+                }
+                spin_unlock(&ast_queue_lock);
+                if (!lkb)
+                        return;
+                /* read all callback state before invoking any callback */
+                cast = lkb->lkb_astaddr;
+                bast = lkb->lkb_bastaddr;
+                bmode = lkb->lkb_bastmode;
+                if (cast && (type & AST_COMP))
+                        cast(lkb->lkb_astparam);
+                /* FIXME: Is it safe to look at lkb_grmode here
+                   without doing a lock_rsb() ?
+                   Look at other checks in v1 to avoid basts. */
+                if (bast && (type & AST_BAST) &&
+                    !dlm_modes_compat(lkb->lkb_grmode, bmode))
+                        bast(lkb->lkb_astparam, bmode);
+                /* drops the reference taken in dlm_add_ast(); the lkb
+                   may be freed by this call */
+                dlm_put_lkb(lkb);
+                schedule();
+        }
+}
+/* Return non-zero when ast_queue is empty, sampled under ast_queue_lock. */
+static inline int no_asts(void)
+{
+        int empty;
+        spin_lock(&ast_queue_lock);
+        empty = list_empty(&ast_queue);
+        spin_unlock(&ast_queue_lock);
+        return empty;
+}
+/*
+ * Main loop of the "dlm_astd" kthread: sleep until WAKE_ASTS is set, then
+ * deliver the queued ASTs.  Delivery runs with astd_running held so that
+ * dlm_astd_suspend() can keep it from running.  @data is unused.
+ * Returns 0 once kthread_stop() has been requested.
+ */
+static int dlm_astd(void *data)
+{
+        while (!kthread_should_stop()) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                /*
+                 * Re-check the stop request after changing the task state:
+                 * a kthread_stop() wakeup that lands between the loop test
+                 * above and set_current_state() would otherwise be lost,
+                 * and schedule() could sleep with nobody left to wake us.
+                 */
+                if (!test_bit(WAKE_ASTS, &astd_wakeflags) &&
+                    !kthread_should_stop())
+                        schedule();
+                set_current_state(TASK_RUNNING);
+                mutex_lock(&astd_running);
+                if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+                        process_asts();
+                mutex_unlock(&astd_running);
+        }
+        return 0;
+}
+/* Wake astd, but only when at least one lkb is queued. */
+void dlm_astd_wake(void)
+{
+        if (no_asts())
+                return;
+        set_bit(WAKE_ASTS, &astd_wakeflags);
+        wake_up_process(astd_task);
+}
+/*
+ * Initialise the AST queue, its lock and the astd_running mutex, then start
+ * the "dlm_astd" kthread.  Returns 0 on success or the error encoded in the
+ * pointer returned by kthread_run().
+ */
+int dlm_astd_start(void)
+{
+        struct task_struct *task;
+        INIT_LIST_HEAD(&ast_queue);
+        spin_lock_init(&ast_queue_lock);
+        mutex_init(&astd_running);
+        task = kthread_run(dlm_astd, NULL, "dlm_astd");
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        astd_task = task;
+        return 0;
+}
+void dlm_astd_stop(void)
+{       /* Stop the astd kthread started by dlm_astd_start(). */
+        kthread_stop(astd_task);
+}
+void dlm_astd_suspend(void)
+{       /* Take astd_running.  dlm_astd() holds it around process_asts(),
+         * so no AST delivery runs until dlm_astd_resume() releases it. */
+        mutex_lock(&astd_running);
+}
+void dlm_astd_resume(void)
+{       /* Release astd_running taken by dlm_astd_suspend(). */
+        mutex_unlock(&astd_running);
+}
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+#endif
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+#include "config.h"
+#include "lowcomms.h"
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+static struct config_group *space_list;  /* "spaces" group of the cluster made last; NULL after drop_cluster() */
+static struct config_group *comm_list;   /* "comms" group of the cluster made last; NULL after drop_cluster() */
+static struct comm *local_comm;          /* set by comm_local_write(), cleared by drop_comm() */
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+                         char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+                          const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+                         char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+                          const char *buf, size_t len);
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+enum {  /* positions of the entries in comm_attrs[] */
+        COMM_ATTR_NODEID = 0,
+        COMM_ATTR_LOCAL,
+        COMM_ATTR_ADDR,
+};
+struct comm_attribute {  /* configfs attribute plus the typed handlers called by show_comm()/store_comm() */
+        struct configfs_attribute attr;
+        ssize_t (*show)(struct comm *, char *);
+        ssize_t (*store)(struct comm *, const char *, size_t);
+};
+static struct comm_attribute comm_attr_nodeid = {
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .show   = comm_nodeid_read,
+        .store  = comm_nodeid_write,
+};
+static struct comm_attribute comm_attr_local = {
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "local",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .show   = comm_local_read,
+        .store  = comm_local_write,
+};
+static struct comm_attribute comm_attr_addr = {  /* no .show handler, so show_comm() returns 0 for reads */
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "addr",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .store  = comm_addr_write,
+};
+static struct configfs_attribute *comm_attrs[] = {  /* NULL-terminated; installed as comm_type.ct_attrs */
+        [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+        [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+        [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+        NULL,
+};
+enum {  /* positions of the entries in node_attrs[] */
+        NODE_ATTR_NODEID = 0,
+        NODE_ATTR_WEIGHT,
+};
+struct node_attribute {  /* configfs attribute plus the typed handlers called by show_node()/store_node() */
+        struct configfs_attribute attr;
+        ssize_t (*show)(struct node *, char *);
+        ssize_t (*store)(struct node *, const char *, size_t);
+};
+static struct node_attribute node_attr_nodeid = {
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .show   = node_nodeid_read,
+        .store  = node_nodeid_write,
+};
+static struct node_attribute node_attr_weight = {
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "weight",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .show   = node_weight_read,
+        .store  = node_weight_write,
+};
+static struct configfs_attribute *node_attrs[] = {  /* NULL-terminated; installed as node_type.ct_attrs */
+        [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+        [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+        NULL,
+};
+struct clusters {  /* wraps the configfs subsystem; the one instance is clusters_root */
+        struct configfs_subsystem subsys;
+};
+struct cluster {  /* one cluster group; make_cluster() gives it "spaces" and "comms" default groups */
+        struct config_group group;
+};
+struct spaces {  /* the "spaces" default group of a cluster */
+        struct config_group ss_group;
+};
+struct space {  /* one lockspace; make_space() gives it a "nodes" default group */
+        struct config_group group;
+        struct list_head members;  /* struct node entries, linked through node.list */
+        struct mutex members_lock;  /* taken around every access to members and members_count */
+        int members_count;  /* number of entries on members */
+};
+struct comms {  /* the "comms" default group of a cluster */
+        struct config_group cs_group;
+};
+struct comm {  /* one item under "comms": nodeid, local flag and addresses */
+        struct config_item item;
+        int nodeid;  /* -1 until written (set in make_comm()) */
+        int local;  /* value last written to the "local" attribute */
+        int addr_count;  /* number of entries stored in addr[] */
+        struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];  /* kzalloc'ed copies stored by comm_addr_write() */
+};
+struct nodes {  /* the "nodes" default group of a space */
+        struct config_group ns_group;
+};
+struct node {  /* one item under "nodes": a member of the enclosing space */
+        struct config_item item;
+        struct list_head list; /* space->members */
+        int nodeid;  /* -1 until written (set in make_node()) */
+        int weight;  /* 1 until written (set in make_node()) */
+};
+static struct configfs_group_operations clusters_ops = {  /* make/drop callbacks for children of the dlm root group */
+        .make_group = make_cluster,
+        .drop_item = drop_cluster,
+};
+static struct configfs_item_operations cluster_ops = {
+        .release = release_cluster,
+};
+static struct configfs_group_operations spaces_ops = {  /* make/drop callbacks for children of a "spaces" group */
+        .make_group = make_space,
+        .drop_item = drop_space,
+};
+static struct configfs_item_operations space_ops = {
+        .release = release_space,
+};
+static struct configfs_group_operations comms_ops = {  /* make/drop callbacks for children of a "comms" group */
+        .make_item = make_comm,
+        .drop_item = drop_comm,
+};
+static struct configfs_item_operations comm_ops = {  /* attribute reads/writes go through show_comm()/store_comm() */
+        .release = release_comm,
+        .show_attribute = show_comm,
+        .store_attribute = store_comm,
+};
+static struct configfs_group_operations nodes_ops = {  /* make/drop callbacks for children of a "nodes" group */
+        .make_item = make_node,
+        .drop_item = drop_node,
+};
+static struct configfs_item_operations node_ops = {  /* attribute reads/writes go through show_node()/store_node() */
+        .release = release_node,
+        .show_attribute = show_node,
+        .store_attribute = store_node,
+};
+static struct config_item_type clusters_type = {  /* type of the subsystem root (clusters_root) */
+        .ct_group_ops = &clusters_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type cluster_type = {  /* type given to each cluster group in make_cluster() */
+        .ct_item_ops = &cluster_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type spaces_type = {  /* type of the "spaces" default group */
+        .ct_group_ops = &spaces_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type space_type = {  /* type given to each space group in make_space() */
+        .ct_item_ops = &space_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type comms_type = {  /* type of the "comms" default group */
+        .ct_group_ops = &comms_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type comm_type = {  /* type given to each comm item; carries the comm_attrs[] attributes */
+        .ct_item_ops = &comm_ops,
+        .ct_attrs = comm_attrs,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type nodes_type = {  /* type of the "nodes" default group */
+        .ct_group_ops = &nodes_ops,
+        .ct_owner = THIS_MODULE,
+};
+static struct config_item_type node_type = {  /* type given to each node item; carries the node_attrs[] attributes */
+        .ct_item_ops = &node_ops,
+        .ct_attrs = node_attrs,
+        .ct_owner = THIS_MODULE,
+};
+/* Map a config_item to the struct cluster that embeds its group; NULL stays NULL. */
+static struct cluster *to_cluster(struct config_item *i)
+{
+        if (!i)
+                return NULL;
+        return container_of(to_config_group(i), struct cluster, group);
+}
+/* Map a config_item to the struct space that embeds its group; NULL stays NULL. */
+static struct space *to_space(struct config_item *i)
+{
+        if (!i)
+                return NULL;
+        return container_of(to_config_group(i), struct space, group);
+}
+/* Map a config_item to the struct comm that embeds it; NULL stays NULL. */
+static struct comm *to_comm(struct config_item *i)
+{
+        if (!i)
+                return NULL;
+        return container_of(i, struct comm, item);
+}
+/* Map a config_item to the struct node that embeds it; NULL stays NULL. */
+static struct node *to_node(struct config_item *i)
+{
+        if (!i)
+                return NULL;
+        return container_of(i, struct node, item);
+}
+/*
+ * Create a cluster group called @name with two default groups, "spaces"
+ * and "comms", and remember those two in space_list / comm_list.
+ * Returns the new group, or NULL if any allocation fails.
+ */
+static struct config_group *make_cluster(struct config_group *g,
+                                         const char *name)
+{
+        struct cluster *cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+        void *gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+        struct spaces *sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+        struct comms *cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+        if (cl && gps && sps && cms) {
+                config_group_init_type_name(&cl->group, name, &cluster_type);
+                config_group_init_type_name(&sps->ss_group, "spaces",
+                                            &spaces_type);
+                config_group_init_type_name(&cms->cs_group, "comms",
+                                            &comms_type);
+                /* NULL-terminated array of default groups */
+                cl->group.default_groups = gps;
+                cl->group.default_groups[0] = &sps->ss_group;
+                cl->group.default_groups[1] = &cms->cs_group;
+                cl->group.default_groups[2] = NULL;
+                space_list = &sps->ss_group;
+                comm_list = &cms->cs_group;
+                return &cl->group;
+        }
+        /* kfree(NULL) is a no-op, so free whatever was allocated */
+        kfree(cl);
+        kfree(gps);
+        kfree(sps);
+        kfree(cms);
+        return NULL;
+}
+/*
+ * Drop callback for a cluster: put each default group, forget the cached
+ * space_list / comm_list pointers, then put the cluster item itself.
+ */
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+        struct cluster *cl = to_cluster(i);
+        struct config_item *child;
+        int idx = 0;
+        while (cl->group.default_groups[idx]) {
+                child = &cl->group.default_groups[idx]->cg_item;
+                cl->group.default_groups[idx] = NULL;
+                config_item_put(child);
+                idx++;
+        }
+        space_list = NULL;
+        comm_list = NULL;
+        config_item_put(i);
+}
+static void release_cluster(struct config_item *i)
+{       /* Release callback: free the default_groups array and the cluster. */
+        struct cluster *cl = to_cluster(i);
+        kfree(cl->group.default_groups);
+        kfree(cl);
+}
+/*
+ * Create a lockspace group called @name with a single default group,
+ * "nodes", and an empty member list.  Returns the new group, or NULL if
+ * any allocation fails.
+ */
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+        struct space *sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+        void *gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+        struct nodes *nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+        if (sp && gps && nds) {
+                config_group_init_type_name(&sp->group, name, &space_type);
+                config_group_init_type_name(&nds->ns_group, "nodes",
+                                            &nodes_type);
+                /* NULL-terminated array of default groups */
+                sp->group.default_groups = gps;
+                sp->group.default_groups[0] = &nds->ns_group;
+                sp->group.default_groups[1] = NULL;
+                INIT_LIST_HEAD(&sp->members);
+                mutex_init(&sp->members_lock);
+                sp->members_count = 0;
+                return &sp->group;
+        }
+        /* kfree(NULL) is a no-op, so free whatever was allocated */
+        kfree(sp);
+        kfree(gps);
+        kfree(nds);
+        return NULL;
+}
+/*
+ * Drop callback for a lockspace: put each default group, then the space
+ * item itself.
+ */
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+        struct space *sp = to_space(i);
+        struct config_item *child;
+        int idx = 0;
+        /* assert list_empty(&sp->members) */
+        while (sp->group.default_groups[idx]) {
+                child = &sp->group.default_groups[idx]->cg_item;
+                sp->group.default_groups[idx] = NULL;
+                config_item_put(child);
+                idx++;
+        }
+        config_item_put(i);
+}
+static void release_space(struct config_item *i)
+{       /* Release callback: free the default_groups array and the space. */
+        struct space *sp = to_space(i);
+        kfree(sp->group.default_groups);
+        kfree(sp);
+}
+/*
+ * Create a comm item called @name with no addresses, local == 0 and
+ * nodeid == -1.  Returns the new item, or NULL on allocation failure.
+ */
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+        struct comm *cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+        if (cm == NULL)
+                return NULL;
+        config_item_init_type_name(&cm->item, name, &comm_type);
+        cm->addr_count = 0;
+        cm->local = 0;
+        cm->nodeid = -1;
+        return &cm->item;
+}
+/*
+ * Drop callback for a comm item: forget it as the local comm, close the
+ * lowcomms connection for its nodeid, free its stored addresses and put
+ * the item.
+ *
+ * The address loop stops with addr_count == 0 and every freed slot set to
+ * NULL.  A post-decrement loop would leave addr_count at -1, which passes
+ * the "!cm->addr_count" tests in get_comm() and dlm_nodeid_to_addr() and
+ * would let a caller that still holds a reference read freed memory.
+ */
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+        struct comm *cm = to_comm(i);
+        if (local_comm == cm)
+                local_comm = NULL;
+        dlm_lowcomms_close(cm->nodeid);
+        while (cm->addr_count > 0) {
+                cm->addr_count--;
+                kfree(cm->addr[cm->addr_count]);
+                cm->addr[cm->addr_count] = NULL;
+        }
+        config_item_put(i);
+}
+static void release_comm(struct config_item *i)
+{       /* Release callback: free the comm (its addresses are freed in drop_comm()). */
+        struct comm *cm = to_comm(i);
+        kfree(cm);
+}
+/*
+ * Create a node item called @name and add it to the member list of the
+ * space that owns the "nodes" group @g.  Returns the new item, or NULL on
+ * allocation failure.
+ */
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+        struct space *sp = to_space(g->cg_item.ci_parent);
+        struct node *nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+        if (nd == NULL)
+                return NULL;
+        config_item_init_type_name(&nd->item, name, &node_type);
+        nd->weight = 1;  /* default weight of 1 if none is set */
+        nd->nodeid = -1;
+        mutex_lock(&sp->members_lock);
+        sp->members_count++;
+        list_add(&nd->list, &sp->members);
+        mutex_unlock(&sp->members_lock);
+        return &nd->item;
+}
+/*
+ * Drop callback for a node item: remove it from its space's member list
+ * under members_lock, then put the item.
+ */
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+        struct node *nd = to_node(i);
+        struct space *sp = to_space(g->cg_item.ci_parent);
+        mutex_lock(&sp->members_lock);
+        sp->members_count--;
+        list_del(&nd->list);
+        mutex_unlock(&sp->members_lock);
+        config_item_put(i);
+}
+static void release_node(struct config_item *i)
+{       /* Release callback: free the node. */
+        struct node *nd = to_node(i);
+        kfree(nd);
+}
+static struct clusters clusters_root = {  /* the "dlm" configfs subsystem registered by dlm_config_init() */
+        .subsys = {
+                .su_group = {
+                        .cg_item = {
+                                .ci_namebuf = "dlm",
+                                .ci_type = &clusters_type,
+                        },
+                },
+        },
+};
+/*
+ * Initialise the root group and su_sem of the "dlm" subsystem and register
+ * it with configfs.  Returns the result of configfs_register_subsystem().
+ */
+int dlm_config_init(void)
+{
+        struct configfs_subsystem *subsys = &clusters_root.subsys;
+        config_group_init(&subsys->su_group);
+        init_MUTEX(&subsys->su_sem);
+        return configfs_register_subsystem(subsys);
+}
+void dlm_config_exit(void)
+{       /* Unregister the subsystem registered by dlm_config_init(). */
+        configfs_unregister_subsystem(&clusters_root.subsys);
+}
+/*
+ * Functions for user space to read/write attributes
+ */
+/*
+ * configfs show hook for comm items: forward to the attribute's typed
+ * handler, or return 0 when the attribute has none.
+ */
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+                         char *buf)
+{
+        struct comm_attribute *cma =
+                        container_of(a, struct comm_attribute, attr);
+        if (!cma->show)
+                return 0;
+        return cma->show(to_comm(i), buf);
+}
+/*
+ * configfs store hook for comm items: forward to the attribute's typed
+ * handler, or return -EINVAL when the attribute has none.
+ */
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+                          const char *buf, size_t len)
+{
+        struct comm_attribute *cma =
+                container_of(a, struct comm_attribute, attr);
+        if (!cma->store)
+                return -EINVAL;
+        return cma->store(to_comm(i), buf, len);
+}
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{       /* Format cm->nodeid as decimal plus newline; returns sprintf()'s count. */
+        return sprintf(buf, "%d\n", cm->nodeid);
+}
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{       /* Parse buf with simple_strtol() (base 0) into cm->nodeid; always
+         * returns len, parse errors are not reported. */
+        cm->nodeid = simple_strtol(buf, NULL, 0);
+        return len;
+}
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{       /* Format cm->local as decimal plus newline; returns sprintf()'s count. */
+        return sprintf(buf, "%d\n", cm->local);
+}
+/*
+ * Store the parsed value in cm->local.  The first comm to receive a
+ * non-zero value while local_comm is unset becomes local_comm.  Always
+ * returns @len.
+ */
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+        int is_local = simple_strtol(buf, NULL, 0);
+        cm->local = is_local;
+        if (is_local && local_comm == NULL)
+                local_comm = cm;
+        return len;
+}
+/*
+ * Append one address to @cm.  @buf must hold exactly one
+ * struct sockaddr_storage; it is copied into a fresh allocation.
+ * Returns @len on success, -EINVAL for a wrong-sized write, -ENOSPC when
+ * DLM_MAX_ADDR_COUNT addresses are already stored, or -ENOMEM.
+ */
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+        struct sockaddr_storage *copy;
+        if (len != sizeof(struct sockaddr_storage))
+                return -EINVAL;
+        if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+                return -ENOSPC;
+        copy = kzalloc(sizeof(*copy), GFP_KERNEL);
+        if (copy == NULL)
+                return -ENOMEM;
+        memcpy(copy, buf, len);
+        cm->addr[cm->addr_count] = copy;
+        cm->addr_count++;
+        return len;
+}
+/*
+ * configfs show hook for node items: forward to the attribute's typed
+ * handler, or return 0 when the attribute has none.
+ */
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+                         char *buf)
+{
+        struct node_attribute *nda =
+                        container_of(a, struct node_attribute, attr);
+        if (!nda->show)
+                return 0;
+        return nda->show(to_node(i), buf);
+}
+/*
+ * configfs store hook for node items: forward to the attribute's typed
+ * handler, or return -EINVAL when the attribute has none.
+ */
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+                          const char *buf, size_t len)
+{
+        struct node_attribute *nda =
+                container_of(a, struct node_attribute, attr);
+        if (!nda->store)
+                return -EINVAL;
+        return nda->store(to_node(i), buf, len);
+}
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{       /* Format nd->nodeid as decimal plus newline; returns sprintf()'s count. */
+        return sprintf(buf, "%d\n", nd->nodeid);
+}
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{       /* Parse buf with simple_strtol() (base 0) into nd->nodeid; always
+         * returns len, parse errors are not reported. */
+        nd->nodeid = simple_strtol(buf, NULL, 0);
+        return len;
+}
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{       /* Format nd->weight as decimal plus newline; returns sprintf()'s count. */
+        return sprintf(buf, "%d\n", nd->weight);
+}
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{       /* Parse buf with simple_strtol() (base 0) into nd->weight; always
+         * returns len, parse errors are not reported. */
+        nd->weight = simple_strtol(buf, NULL, 0);
+        return len;
+}
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+/*
+ * Look up the lockspace called @name under space_list.  Returns NULL when
+ * no cluster exists or nothing matches; a non-NULL result is released
+ * with put_space().
+ */
+static struct space *get_space(char *name)
+{
+        struct config_item *found;
+        if (space_list == NULL)
+                return NULL;
+        found = config_group_find_obj(space_list, name);
+        return to_space(found);
+}
+static void put_space(struct space *sp)
+{       /* Put the item reference on a space returned by get_space(). */
+        config_item_put(&sp->group.cg_item);
+}
+/*
+ * Find a comm under comm_list and return it with a reference held.
+ * A non-zero @nodeid selects by nodeid; with @nodeid == 0 the comm whose
+ * first stored address equals *@addr is selected.  Returns NULL when no
+ * cluster exists or nothing matches.  Release the result with put_comm().
+ *
+ * The reference is taken before su_sem is released.  Taking it after
+ * up() leaves a window in which the item can be dropped and freed between
+ * the lookup and config_item_get().
+ */
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+        struct config_item *i;
+        struct comm *cm;
+        struct comm *found = NULL;
+        if (!comm_list)
+                return NULL;
+        down(&clusters_root.subsys.su_sem);
+        list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+                cm = to_comm(i);
+                if (nodeid) {
+                        if (cm->nodeid != nodeid)
+                                continue;
+                } else {
+                        if (!cm->addr_count ||
+                            memcmp(cm->addr[0], addr, sizeof(*addr)))
+                                continue;
+                }
+                /* pin the match while it is still on cg_children */
+                config_item_get(i);
+                found = cm;
+                break;
+        }
+        up(&clusters_root.subsys.su_sem);
+        return found;
+}
+static void put_comm(struct comm *cm)
+{       /* Put the item reference on a comm returned by get_comm(). */
+        config_item_put(&cm->item);
+}
+/*
+ * Build an array holding the nodeid of every member configured for
+ * lockspace @lsname.  Returns the member count (0 when there are none, in
+ * which case *ids_out is not written), -EEXIST when the lockspace has no
+ * config entry, or -ENOMEM.  On a positive return the caller must free
+ * *ids_out.
+ */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+        struct space *sp = get_space(lsname);
+        struct node *nd;
+        int *ids;
+        int filled = 0, rv = 0;
+        if (!sp)
+                return -EEXIST;
+        mutex_lock(&sp->members_lock);
+        if (sp->members_count) {
+                ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+                if (ids) {
+                        rv = sp->members_count;
+                        list_for_each_entry(nd, &sp->members, list)
+                                ids[filled++] = nd->nodeid;
+                        if (rv != filled)
+                                printk("bad nodeid count %d %d\n", rv, filled);
+                        *ids_out = ids;
+                } else {
+                        rv = -ENOMEM;
+                }
+        }
+        mutex_unlock(&sp->members_lock);
+        put_space(sp);
+        return rv;
+}
+/*
+ * Return the configured weight of node @nodeid in lockspace @lsname, or
+ * -EEXIST when either the lockspace or the node is not configured.
+ */
+int dlm_node_weight(char *lsname, int nodeid)
+{
+        struct space *sp = get_space(lsname);
+        struct node *nd;
+        int w = -EEXIST;
+        if (!sp)
+                return w;
+        mutex_lock(&sp->members_lock);
+        list_for_each_entry(nd, &sp->members, list) {
+                if (nd->nodeid == nodeid) {
+                        w = nd->weight;
+                        break;
+                }
+        }
+        mutex_unlock(&sp->members_lock);
+        put_space(sp);
+        return w;
+}
+/*
+ * Copy the first configured address of node @nodeid into *@addr.
+ * Returns 0 on success, -EEXIST when no comm has that nodeid, or -ENOENT
+ * when the comm has no address yet.
+ *
+ * The reference taken by get_comm() is dropped on every path after a
+ * successful lookup, including the -ENOENT one.
+ */
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+        struct comm *cm = get_comm(nodeid, NULL);
+        int rv = 0;
+        if (!cm)
+                return -EEXIST;
+        if (!cm->addr_count)
+                rv = -ENOENT;
+        else
+                memcpy(addr, cm->addr[0], sizeof(*addr));
+        put_comm(cm);
+        return rv;
+}
+/*
+ * Store in *@nodeid the nodeid of the comm whose first address equals
+ * *@addr.  Returns 0 on success or -EEXIST when no comm matches.
+ */
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+        struct comm *cm = get_comm(0, addr);
+        if (cm) {
+                *nodeid = cm->nodeid;
+                put_comm(cm);
+                return 0;
+        }
+        return -EEXIST;
+}
+/* Return the nodeid of the local comm, or 0 when none has been marked local. */
+int dlm_our_nodeid(void)
+{
+        if (!local_comm)
+                return 0;
+        return local_comm->nodeid;
+}
+/*
+ * Copy address number @num of the local comm into *@addr (num 0 is the
+ * first addr, num 1 is the second addr).  Returns 0 on success, or -1 when
+ * there is no local comm or @num is outside [0, addr_count).
+ */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+        if (!local_comm)
+                return -1;
+        /* a negative num would index in front of addr[] */
+        if (num < 0 || num >= local_comm->addr_count)
+                return -1;
+        memcpy(addr, local_comm->addr[num], sizeof(*addr));
+        return 0;
+}
+/* Config file defaults */
+#define DEFAULT_TCP_PORT       21064
+#define DEFAULT_BUFFER_SIZE     4096
+#define DEFAULT_RSBTBL_SIZE      256
+#define DEFAULT_LKBTBL_SIZE     1024
+#define DEFAULT_DIRTBL_SIZE      512
+#define DEFAULT_RECOVER_TIMER      5
+#define DEFAULT_TOSS_SECS         10
+#define DEFAULT_SCAN_SECS          5
+struct dlm_config_info dlm_config = {  /* global tunables, initialised to the DEFAULT_* values above */
+        .tcp_port = DEFAULT_TCP_PORT,
+        .buffer_size = DEFAULT_BUFFER_SIZE,
+        .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+        .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+        .dirtbl_size = DEFAULT_DIRTBL_SIZE,
+        .recover_timer = DEFAULT_RECOVER_TIMER,
+        .toss_secs = DEFAULT_TOSS_SECS,
+        .scan_secs = DEFAULT_SCAN_SECS
+};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+#define DLM_MAX_ADDR_COUNT 3
+/*
+ * DLM settings, all plain ints.  The *_size fields are presumably table
+ * sizes and the timer/secs fields presumably intervals in seconds --
+ * TODO confirm against the users of dlm_config.
+ */
+struct dlm_config_info {
+        int tcp_port;
+        int buffer_size;
+        int rsbtbl_size;
+        int lkbtbl_size;
+        int dirtbl_size;
+        int recover_timer;
+        int toss_secs;
+        int scan_secs;
+};
+extern struct dlm_config_info dlm_config;
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+#endif                          /* __CONFIG_DOT_H__ */
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include "dlm_internal.h"
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+static struct dentry *dlm_root;
+/*
+ * Cursor used to walk every rsb in a lockspace's rsb hash table.
+ */
+struct rsb_iter {
+        int entry;                      /* index of the current ls_rsbtbl bucket */
+        struct dlm_ls *ls;              /* lockspace being walked */
+        struct list_head *next;         /* current position in the bucket list; NULL means "find a bucket" */
+        struct dlm_rsb *rsb;            /* rsb containing ->next, set by rsb_iter_next() */
+};
+/*
+ * dump all rsb's in the lockspace hash table
+ */
+/*
+ * Translate a DLM_LOCK_* mode into its two-character display string;
+ * anything unrecognised is shown as "??".
+ */
+static char *print_lockmode(int mode)
+{
+        char *text = "??";
+        if (mode == DLM_LOCK_IV)
+                text = "--";
+        else if (mode == DLM_LOCK_NL)
+                text = "NL";
+        else if (mode == DLM_LOCK_CR)
+                text = "CR";
+        else if (mode == DLM_LOCK_CW)
+                text = "CW";
+        else if (mode == DLM_LOCK_PR)
+                text = "PR";
+        else if (mode == DLM_LOCK_PW)
+                text = "PW";
+        else if (mode == DLM_LOCK_EX)
+                text = "EX";
+        return text;
+}
+/*
+ * Write one line for @lkb: its id and granted mode, the requested mode when
+ * the lock is on the convert or waiting queue, the remote/master id when
+ * lkb_nodeid is set, and the wait type when one is set.
+ */
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+                       struct dlm_rsb *res)
+{
+        int queued = (lkb->lkb_status == DLM_LKSTS_CONVERT ||
+                      lkb->lkb_status == DLM_LKSTS_WAITING);
+        seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+        if (queued)
+                seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+        if (lkb->lkb_nodeid) {
+                if (lkb->lkb_nodeid == res->res_nodeid)
+                        seq_printf(s, " Master:     %08x", lkb->lkb_remid);
+                else
+                        seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+                                   lkb->lkb_remid);
+        }
+        if (lkb->lkb_wait_type)
+                seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+        seq_printf(s, "\n");
+}
+/*
+ * Write a multi-line description of @res to @s: its name, master state,
+ * LVB contents (if any), recovery state (if on a root/recover list) and the
+ * locks on its granted, conversion, waiting and lookup queues.
+ * Always returns 0.
+ */
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+        struct dlm_lkb *lkb;
+        int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+        seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+        /* the name is printed byte by byte using res_length; non-printable
+           bytes are shown as '.' */
+        for (i = 0; i < res->res_length; i++) {
+                if (isprint(res->res_name[i]))
+                        seq_printf(s, "%c", res->res_name[i]);
+                else
+                        seq_printf(s, "%c", '.');
+        }
+        /* res_nodeid: > 0 names the master node, 0 means this is the master
+           copy, -1 means the master is still being looked up */
+        if (res->res_nodeid > 0)
+                seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+                           res->res_nodeid);
+        else if (res->res_nodeid == 0)
+                seq_printf(s, "\"  \nMaster Copy\n");
+        else if (res->res_nodeid == -1)
+                seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+                           res->res_first_lkid);
+        else
+                seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+        /* Print the LVB: */
+        if (res->res_lvbptr) {
+                seq_printf(s, "LVB: ");
+                for (i = 0; i < lvblen; i++) {
+                        /* break the hex dump into two rows */
+                        if (i == lvblen / 2)
+                                seq_printf(s, "\n     ");
+                        seq_printf(s, "%02x ",
+                                   (unsigned char) res->res_lvbptr[i]);
+                }
+                if (rsb_flag(res, RSB_VALNOTVALID))
+                        seq_printf(s, " (INVALID)");
+                seq_printf(s, "\n");
+        }
+        /* recovery details are only shown while the rsb is on one of the
+           recovery lists */
+        root_list = !list_empty(&res->res_root_list);
+        recover_list = !list_empty(&res->res_recover_list);
+        if (root_list || recover_list) {
+                seq_printf(s, "Recovery: root %d recover %d flags %lx "
+                           "count %d\n", root_list, recover_list,
+                           res->res_flags, res->res_recover_locks_count);
+        }
+        /* Print the locks attached to this resource */
+        seq_printf(s, "Granted Queue\n");
+        list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+                print_lock(s, lkb, res);
+        seq_printf(s, "Conversion Queue\n");
+        list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+                print_lock(s, lkb, res);
+        seq_printf(s, "Waiting Queue\n");
+        list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+                print_lock(s, lkb, res);
+        /* the lookup queue heading is omitted entirely when the queue is
+           empty */
+        if (list_empty(&res->res_lookup))
+                goto out;
+        seq_printf(s, "Lookup Queue\n");
+        list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+                seq_printf(s, "%08x %s", lkb->lkb_id,
+                           print_lockmode(lkb->lkb_rqmode));
+                if (lkb->lkb_wait_type)
+                        seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+                seq_printf(s, "\n");
+        }
+ out:
+        return 0;
+}
+/*
+ * Advance @ri to the next rsb in the lockspace's rsb hash table.
+ *
+ * Returns 0 with ri->rsb set to the new rsb, or 1 when every bucket has
+ * been exhausted.  The bucket lock is only held while the list pointers are
+ * read; it is released before returning.
+ */
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+        struct dlm_ls *ls = ri->ls;
+        int i;
+        /* ri->next == NULL: no current position, scan from bucket ri->entry */
+        if (!ri->next) {
+ top:
+                /* Find the next non-empty hash bucket */
+                for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+                        read_lock(&ls->ls_rsbtbl[i].lock);
+                        if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+                                ri->next = ls->ls_rsbtbl[i].list.next;
+                                read_unlock(&ls->ls_rsbtbl[i].lock);
+                                break;
+                        }
+                        read_unlock(&ls->ls_rsbtbl[i].lock);
+                }
+                ri->entry = i;
+                /* ran off the end of the table: iteration is finished */
+                if (ri->entry >= ls->ls_rsbtbl_size)
+                        return 1;
+        } else {
+                i = ri->entry;
+                read_lock(&ls->ls_rsbtbl[i].lock);
+                ri->next = ri->next->next;
+                /* on a well-formed circular list this holds exactly when
+                   ri->next has wrapped around to the bucket's list head */
+                if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+                        /* End of list - move to next bucket */
+                        ri->next = NULL;
+                        ri->entry++;
+                        read_unlock(&ls->ls_rsbtbl[i].lock);
+                        goto top;
+                }
+                read_unlock(&ls->ls_rsbtbl[i].lock);
+        }
+        ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+        return 0;
+}
+/* Release an iterator allocated by rsb_iter_init(). */
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+        kfree(ri);
+}
+/*
+ * Allocate an iterator for @ls and position it on the first rsb.
+ * Returns NULL when the allocation fails or the table holds no rsb.
+ */
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+        struct rsb_iter *iter = kmalloc(sizeof *iter, GFP_KERNEL);
+        if (iter) {
+                iter->ls = ls;
+                iter->entry = 0;
+                iter->next = NULL;
+                if (rsb_iter_next(iter)) {
+                        /* nothing to iterate over */
+                        rsb_iter_free(iter);
+                        iter = NULL;
+                }
+        }
+        return iter;
+}
+/*
+ * seq_file ->start: build a fresh iterator and step it forward *pos times.
+ * Returns the iterator, or NULL when there is no rsb at that position.
+ */
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+        loff_t remaining;
+        struct rsb_iter *iter = rsb_iter_init(file->private);
+        if (!iter)
+                return NULL;
+        for (remaining = *pos; remaining != 0; remaining--) {
+                if (!rsb_iter_next(iter))
+                        continue;
+                /* ran out of rsbs before reaching *pos */
+                rsb_iter_free(iter);
+                return NULL;
+        }
+        return iter;
+}
+/*
+ * seq_file ->next: bump *pos and move the iterator to the following rsb.
+ * At the end of the table the iterator is freed and NULL is returned.
+ */
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+        struct rsb_iter *iter = iter_ptr;
+        ++*pos;
+        if (!rsb_iter_next(iter))
+                return iter;
+        rsb_iter_free(iter);
+        return NULL;
+}
+/*
+ * seq_file ->stop: free the iterator that ->start or ->next last returned.
+ *
+ * ->start allocates a new iterator on every call, and ->next only frees it
+ * when the table is exhausted, so an iteration that stops early would leak
+ * it unless it is released here.  When ->start or ->next has already
+ * returned NULL, @iter_ptr is NULL and the kfree() in rsb_iter_free() is a
+ * no-op.
+ */
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+        rsb_iter_free(iter_ptr);
+}
+/* seq_file ->show: print the rsb the iterator currently points at. */
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+        print_resource(((struct rsb_iter *) iter_ptr)->rsb, file);
+        return 0;
+}
+/* seq_file iterator callbacks for the per-lockspace rsb dump file */
+static struct seq_operations rsb_seq_ops = {
+        .start = rsb_seq_start,
+        .next  = rsb_seq_next,
+        .stop  = rsb_seq_stop,
+        .show  = rsb_seq_show,
+};
+/*
+ * Open the rsb dump file: set up the seq_file and hand it the value stored
+ * in inode->i_private (the lockspace passed to debugfs_create_file()).
+ * Returns 0 or the error from seq_open().
+ */
+static int rsb_open(struct inode *inode, struct file *file)
+{
+        int error = seq_open(file, &rsb_seq_ops);
+        if (!error) {
+                struct seq_file *sf = file->private_data;
+                sf->private = inode->i_private;
+        }
+        return error;
+}
+/* file operations for the rsb dump file; reading goes through seq_file */
+static struct file_operations rsb_fops = {
+        .owner   = THIS_MODULE,
+        .open    = rsb_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release
+};
+/*
+ * dump lkb's on the ls_waiters list
+ */
+/*
+ * Open the waiters file: stash inode->i_private (the lockspace passed to
+ * debugfs_create_file()) in the file for waiters_read().  Always returns 0.
+ */
+static int waiters_open(struct inode *inode, struct file *file)
+{
+        file->private_data = inode->i_private;
+        return 0;
+}
+/*
+ * Read handler for the waiters file.  Formats one line per lkb on
+ * ls_waiters into the shared debug_buf (stopping at the first line that
+ * does not fit) and copies the requested window to user space.
+ * debug_buf_lock is held for the whole operation because debug_buf is a
+ * single static buffer.
+ */
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+                            size_t count, loff_t *ppos)
+{
+        struct dlm_ls *ls = file->private_data;
+        struct dlm_lkb *lkb;
+        size_t used = 0, room, n, rv;
+        mutex_lock(&debug_buf_lock);
+        mutex_lock(&ls->ls_waiters_mutex);
+        memset(debug_buf, 0, sizeof(debug_buf));
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                room = DLM_DEBUG_BUF_LEN - used;
+                n = snprintf(debug_buf + used, room, "%x %d %d %s\n",
+                             lkb->lkb_id, lkb->lkb_wait_type,
+                             lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+                /* line was truncated: drop it and stop */
+                if (n >= room)
+                        break;
+                used += n;
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+        rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, used);
+        mutex_unlock(&debug_buf_lock);
+        return rv;
+}
+/* file operations for the "<lockspace>_waiters" file */
+static struct file_operations waiters_fops = {
+        .owner   = THIS_MODULE,
+        .open    = waiters_open,
+        .read    = waiters_read
+};
+/*
+ * Create the two debugfs files for @ls under the dlm directory: one named
+ * after the lockspace (rsb dump) and one named "<lockspace>_waiters".
+ *
+ * Returns 0 on success or -ENOMEM if either file cannot be created.  On
+ * failure no rsb file is left behind and ls_debug_rsb_dentry is NULL.
+ */
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+        char name[DLM_LOCKSPACE_LEN+8];
+        ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+                                                      S_IFREG | S_IRUGO,
+                                                      dlm_root,
+                                                      ls,
+                                                      &rsb_fops);
+        if (!ls->ls_debug_rsb_dentry)
+                return -ENOMEM;
+        memset(name, 0, sizeof(name));
+        snprintf(name, sizeof(name), "%s_waiters", ls->ls_name);
+        ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+                                                          S_IFREG | S_IRUGO,
+                                                          dlm_root,
+                                                          ls,
+                                                          &waiters_fops);
+        if (!ls->ls_debug_waiters_dentry) {
+                debugfs_remove(ls->ls_debug_rsb_dentry);
+                /* clear the stale pointer so dlm_delete_debug_file(), which
+                   tests it before removing, cannot remove it a second time */
+                ls->ls_debug_rsb_dentry = NULL;
+                return -ENOMEM;
+        }
+        return 0;
+}
+/* Remove whichever of the lockspace's two debugfs files exist. */
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+        struct dentry *rsb_file = ls->ls_debug_rsb_dentry;
+        struct dentry *waiters_file = ls->ls_debug_waiters_dentry;
+        if (rsb_file)
+                debugfs_remove(rsb_file);
+        if (waiters_file)
+                debugfs_remove(waiters_file);
+}
+/*
+ * Initialise debug_buf_lock and create the top-level "dlm" debugfs
+ * directory.  Returns 0 on success, -ENOMEM if the directory is not created.
+ */
+int dlm_register_debugfs(void)
+{
+        mutex_init(&debug_buf_lock);
+        dlm_root = debugfs_create_dir("dlm", NULL);
+        if (!dlm_root)
+                return -ENOMEM;
+        return 0;
+}
+/* Remove the "dlm" debugfs directory created by dlm_register_debugfs(). */
+void dlm_unregister_debugfs(void)
+{
+        debugfs_remove(dlm_root);
+}
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+/*
+ * Park @de on ls_recover_list (under ls_recover_list_lock) so that
+ * get_free_de() can reuse it.
+ */
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+        spin_lock(&ls->ls_recover_list_lock);
+        list_add(&de->list, &ls->ls_recover_list);
+        spin_unlock(&ls->ls_recover_list_lock);
+}
+/*
+ * Return a directory entry with room for a name of @len bytes.  An entry of
+ * exactly that length parked on ls_recover_list is reused (with its master
+ * nodeid and name cleared); otherwise a new one is allocated.  May return
+ * NULL if allocate_direntry() fails.
+ */
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+        struct dlm_direntry *de, *reuse = NULL;
+        spin_lock(&ls->ls_recover_list_lock);
+        list_for_each_entry(de, &ls->ls_recover_list, list) {
+                if (de->length != len)
+                        continue;
+                list_del(&de->list);
+                de->master_nodeid = 0;
+                memset(de->name, 0, len);
+                reuse = de;
+                break;
+        }
+        spin_unlock(&ls->ls_recover_list_lock);
+        if (reuse)
+                return reuse;
+        return allocate_direntry(ls, len);
+}
+/* Free every directory entry still parked on ls_recover_list. */
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+        struct list_head *head = &ls->ls_recover_list;
+        spin_lock(&ls->ls_recover_list_lock);
+        while (!list_empty(head)) {
+                struct dlm_direntry *de = list_entry(head->next,
+                                                     struct dlm_direntry,
+                                                     list);
+                list_del(&de->list);
+                free_direntry(de);
+        }
+        spin_unlock(&ls->ls_recover_list_lock);
+}
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * With a single node the answer is always our own nodeid.  Otherwise the
+ * upper bits are reduced modulo ls_total_weight and used to index
+ * ls_node_array; if that array is missing they are reduced modulo
+ * ls_num_nodes and used as an offset into the ls_nodes list.
+ */
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+        struct dlm_member *memb = NULL;
+        struct list_head *pos;
+        uint32_t node, n = 0;
+        if (ls->ls_num_nodes == 1)
+                return dlm_our_nodeid();
+        if (ls->ls_node_array) {
+                node = (hash >> 16) % ls->ls_total_weight;
+                return ls->ls_node_array[node];
+        }
+        /* make_member_array() failed to kmalloc ls_node_array... */
+        node = (hash >> 16) % ls->ls_num_nodes;
+        list_for_each(pos, &ls->ls_nodes) {
+                if (n++ == node) {
+                        memb = list_entry(pos, struct dlm_member, list);
+                        break;
+                }
+        }
+        DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
+                                 ls->ls_num_nodes, n, node););
+        return memb->nodeid;
+}
+/* Return the directory nodeid for @r, derived from its lockspace and hash. */
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+        return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+/*
+ * Hash @name to a directory table bucket by masking jhash() with
+ * ls_dirtbl_size - 1.
+ */
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+        return jhash(name, len, 0) & (ls->ls_dirtbl_size - 1);
+}
+/*
+ * Append @de to the directory bucket its name hashes to.  No bucket lock is
+ * taken here.
+ */
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+        struct list_head *chain;
+        chain = &ls->ls_dirtbl[dir_hash(ls, de->name, de->length)].list;
+        list_add_tail(&de->list, chain);
+}
+/*
+ * Look through one directory bucket for an entry whose name matches
+ * @name/@namelen exactly.  Returns the entry or NULL.  No locking is done
+ * here.
+ */
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+                                          int namelen, uint32_t bucket)
+{
+        struct dlm_direntry *de;
+        list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+                if (de->length != namelen)
+                        continue;
+                if (memcmp(name, de->name, namelen) == 0)
+                        return de;
+        }
+        return NULL;
+}
+/*
+ * Remove and free the directory entry for @name, but only if it exists and
+ * its recorded master is @nodeid; otherwise log an error and leave the
+ * directory untouched.
+ */
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
+{
+        struct dlm_direntry *de;
+        uint32_t bucket = dir_hash(ls, name, namelen);
+        write_lock(&ls->ls_dirtbl[bucket].lock);
+        de = search_bucket(ls, name, namelen, bucket);
+        if (!de) {
+                log_error(ls, "remove fr %u none", nodeid);
+        } else if (de->master_nodeid != nodeid) {
+                log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+        } else {
+                list_del(&de->list);
+                free_direntry(de);
+        }
+        write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+/*
+ * Empty every directory bucket, moving the entries onto ls_recover_list via
+ * put_free_de() instead of freeing them.  ls_recover_list must be empty on
+ * entry.
+ */
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+        int i;
+        DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+        for (i = 0; i < ls->ls_dirtbl_size; i++) {
+                struct list_head *head = &ls->ls_dirtbl[i].list;
+                write_lock(&ls->ls_dirtbl[i].lock);
+                while (!list_empty(head)) {
+                        struct dlm_direntry *de;
+                        de = list_entry(head->next, struct dlm_direntry, list);
+                        list_del(&de->list);
+                        put_free_de(ls, de);
+                }
+                write_unlock(&ls->ls_dirtbl[i].lock);
+        }
+}
+/*
+ * Rebuild the resource directory of @ls.
+ *
+ * Unless dlm_no_directory() says there is none, the directory is cleared and
+ * then, for each member on ls_nodes, dlm_rcom_names() is called repeatedly
+ * (passing the last name received so far) and the namelen/name pairs found
+ * in ls_recover_buf are turned into directory entries mastered by that
+ * member.
+ *
+ * Returns 0 on success (after setting DLM_RS_DIR), the error from
+ * dlm_recovery_stopped() or dlm_rcom_names(), -ENOMEM on allocation
+ * failure, or -EINVAL when a received name length exceeds
+ * DLM_RESNAME_MAXLEN.  Unused parked entries are freed on every path.
+ */
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+        struct dlm_member *memb;
+        struct dlm_direntry *de;
+        char *b, *last_name = NULL;
+        int error = -ENOMEM, last_len, count = 0;
+        uint16_t namelen;
+        log_debug(ls, "dlm_recover_directory");
+        if (dlm_no_directory(ls))
+                goto out_status;
+        dlm_dir_clear(ls);
+        last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+        if (!last_name)
+                goto out;
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                memset(last_name, 0, DLM_RESNAME_MAXLEN);
+                last_len = 0;
+                for (;;) {
+                        error = dlm_recovery_stopped(ls);
+                        if (error)
+                                goto out_free;
+                        error = dlm_rcom_names(ls, memb->nodeid,
+                                               last_name, last_len);
+                        if (error)
+                                goto out_free;
+                        schedule();
+                        /*
+                         * pick namelen/name pairs out of received buffer
+                         */
+                        b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+                        for (;;) {
+                                memcpy(&namelen, b, sizeof(uint16_t));
+                                namelen = be16_to_cpu(namelen);
+                                b += sizeof(uint16_t);
+                                /* namelen of 0xFFFF marks end of names for
+                                   this node; namelen of 0 marks end of the
+                                   buffer */
+                                if (namelen == 0xFFFF)
+                                        goto done;
+                                if (!namelen)
+                                        break;
+                                /* the length comes from the received buffer;
+                                   last_name only holds DLM_RESNAME_MAXLEN
+                                   bytes, so refuse anything larger */
+                                error = -EINVAL;
+                                if (namelen > DLM_RESNAME_MAXLEN)
+                                        goto out_free;
+                                error = -ENOMEM;
+                                de = get_free_de(ls, namelen);
+                                if (!de)
+                                        goto out_free;
+                                de->master_nodeid = memb->nodeid;
+                                de->length = namelen;
+                                last_len = namelen;
+                                memcpy(de->name, b, namelen);
+                                memcpy(last_name, b, namelen);
+                                b += namelen;
+                                add_entry_to_hash(ls, de);
+                                count++;
+                        }
+                }
+         done:
+                ;
+        }
+ out_status:
+        error = 0;
+        dlm_set_recover_status(ls, DLM_RS_DIR);
+        log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+        kfree(last_name);
+ out:
+        dlm_clear_free_entries(ls);
+        return error;
+}
+/*
+ * Look up @name in the directory, creating an entry mastered by @nodeid if
+ * none exists, and report the recorded master in *@r_nodeid.
+ *
+ * Returns -EEXIST when an existing entry already names @nodeid as master,
+ * -ENOMEM when a new entry cannot be allocated, and 0 otherwise.
+ */
+static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
+                     int namelen, int *r_nodeid)
+{
+        struct dlm_direntry *de, *tmp;
+        uint32_t bucket;
+        bucket = dir_hash(ls, name, namelen);
+        write_lock(&ls->ls_dirtbl[bucket].lock);
+        de = search_bucket(ls, name, namelen, bucket);
+        if (de) {
+                *r_nodeid = de->master_nodeid;
+                write_unlock(&ls->ls_dirtbl[bucket].lock);
+                if (*r_nodeid == nodeid)
+                        return -EEXIST;
+                return 0;
+        }
+        /* the lock is dropped around the allocation */
+        write_unlock(&ls->ls_dirtbl[bucket].lock);
+        de = allocate_direntry(ls, namelen);
+        if (!de)
+                return -ENOMEM;
+        de->master_nodeid = nodeid;
+        de->length = namelen;
+        memcpy(de->name, name, namelen);
+        write_lock(&ls->ls_dirtbl[bucket].lock);
+        /* search again: an entry may have been added while the lock was
+           released, in which case ours is discarded */
+        tmp = search_bucket(ls, name, namelen, bucket);
+        if (tmp) {
+                free_direntry(de);
+                de = tmp;
+        } else {
+                list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+        }
+        *r_nodeid = de->master_nodeid;
+        write_unlock(&ls->ls_dirtbl[bucket].lock);
+        return 0;
+}
+/*
+ * Public entry point for directory lookups; a direct wrapper around
+ * get_entry() with the same arguments and return values.
+ */
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+                   int *r_nodeid)
+{
+        return get_entry(ls, nodeid, name, namelen, r_nodeid);
+}
+/* Copy the names of master rsb's into the buffer provided.
+   Only select names whose dir node is the given nodeid.
+
+   inbuf/inlen name the rsb to resume after (an inlen of 0 or 1 starts from
+   the head of ls_root_list).  outbuf receives a sequence of
+   big-endian uint16_t length + name records, followed by a 0x0000 record
+   when the buffer filled up or a 0xFFFF record when the list was
+   exhausted. */
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+                           char *outbuf, int outlen, int nodeid)
+{
+        struct list_head *list;
+        struct dlm_rsb *start_r = NULL, *r = NULL;
+        int offset = 0, start_namelen, error, dir_nodeid;
+        char *start_name;
+        uint16_t be_namelen;
+        /*
+         * Find the rsb where we left off (or start again)
+         */
+        start_namelen = inlen;
+        start_name = inbuf;
+        if (start_namelen > 1) {
+                /*
+                 * We could also use a find_rsb_root() function here that
+                 * searched the ls_root_list.
+                 */
+                error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
+                                     &start_r);
+                DLM_ASSERT(!error && start_r,
+                           printk("error %d\n", error););
+                DLM_ASSERT(!list_empty(&start_r->res_root_list),
+                           dlm_print_rsb(start_r););
+                /* NOTE(review): start_r is dereferenced again below after
+                   this put; presumably its presence on ls_root_list keeps it
+                   valid -- confirm */
+                dlm_put_rsb(start_r);
+        }
+        /*
+         * Send rsb names for rsb's we're master of and whose directory node
+         * matches the requesting node.
+         */
+        down_read(&ls->ls_root_sem);
+        if (start_r)
+                list = start_r->res_root_list.next;
+        else
+                list = ls->ls_root_list.next;
+        for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+                r = list_entry(list, struct dlm_rsb, res_root_list);
+                /* a non-zero res_nodeid means we are not the master */
+                if (r->res_nodeid)
+                        continue;
+                dir_nodeid = dlm_dir_nodeid(r);
+                if (dir_nodeid != nodeid)
+                        continue;
+                /*
+                 * The block ends when we can't fit the following in the
+                 * remaining buffer space:
+                 * namelen (uint16_t) +
+                 * name (r->res_length) +
+                 * end-of-block record 0x0000 (uint16_t)
+                 */
+                if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+                        /* Write end-of-block record */
+                        be_namelen = 0;
+                        memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                        offset += sizeof(uint16_t);
+                        goto out;
+                }
+                be_namelen = cpu_to_be16(r->res_length);
+                memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                offset += sizeof(uint16_t);
+                memcpy(outbuf + offset, r->res_name, r->res_length);
+                offset += r->res_length;
+        }
+        /*
+         * If we've reached the end of the list (and there's room) write a
+         * terminating record.
+         */
+        if ((list == &ls->ls_root_list) &&
+            (offset + sizeof(uint16_t) <= outlen)) {
+                /* 0xFFFF has the same byte pattern in either endianness, so
+                   no conversion is needed */
+                be_namelen = 0xFFFF;
+                memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                offset += sizeof(uint16_t);
+        }
+ out:
+        up_read(&ls->ls_root_sem);
+}
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
+void dlm_dir_clear(struct dlm_ls *ls);
+void dlm_clear_free_entries(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls);
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+        int *r_nodeid);
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+        char *outbuf, int outlen, int nodeid);
+#endif                          /* __DIR_DOT_H__ */
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+#include <linux/dlm.h>
+#define DLM_LOCKSPACE_LEN       64
+/* Size of the temp buffer midcomms allocates on the stack.
+   We try to make this large enough so most messages fit.
+   FIXME: should sctp make this unnecessary? */
+#define DLM_INBUF_LEN           148
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_lkbtable;
+struct dlm_rsbtable;
+struct dlm_dirtable;
+struct dlm_direntry;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+/*
+ * Logging helpers.  log_print() emits a KERN_ERR line prefixed with "dlm: ";
+ * log_error() additionally prints the lockspace name read from (ls)->ls_name.
+ * Both append the trailing newline, so callers pass fmt without one.
+ */
+#define log_print(fmt, args...) \
+        printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_error(ls, fmt, args...) \
+        printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+/*
+ * DLM_LOG_DEBUG is defined unconditionally here, so log_debug() expands to
+ * log_error() (and therefore also logs at KERN_ERR).  Without the define,
+ * log_debug() expands to nothing.
+ */
+#define DLM_LOG_DEBUG
+#ifdef DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+#else
+#define log_debug(ls, fmt, args...)
+#endif
+/*
+ * DLM_ASSERT(x, do): if x evaluates false, print the line, file, the text of
+ * the failed expression and the current jiffies at KERN_ERR, execute the
+ * statement(s) passed as 'do' (e.g. a dlm_print_*() call; may be empty),
+ * then call BUG() and panic().
+ *
+ * NOTE(review): the expansion is a bare { } block rather than
+ * do { } while (0), so "DLM_ASSERT(...);" leaves an empty statement behind;
+ * take care when using it as the unbraced body of an if that has an else.
+ */
+#define DLM_ASSERT(x, do) \
+{ \
+  if (!(x)) \
+  { \
+    printk(KERN_ERR "\nDLM:  Assertion failed on line %d of file %s\n" \
+               "DLM:  assertion:  \"%s\"\n" \
+               "DLM:  time = %lu\n", \
+               __LINE__, __FILE__, #x, jiffies); \
+    {do} \
+    printk("\n"); \
+    BUG(); \
+    panic("DLM:  Record message above and reboot.\n"); \
+  } \
+}
+#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
+/*
+ * Directory entry, linked through 'list'.
+ * NOTE(review): judging by the field names this maps a resource name of
+ * 'length' bytes to the nodeid of its master, and name[1] looks like a
+ * trailing variable-length array (entries presumably allocated with extra
+ * room for the name) - confirm against the directory code.
+ */
+struct dlm_direntry {
+        struct list_head        list;
+        uint32_t                master_nodeid;
+        uint16_t                length;
+        char                    name[1];
+};
+struct dlm_dirtable {
+        struct list_head        list;
+        rwlock_t                lock;
+};
+/*
+ * One bucket of the lockspace's rsb hash table (ls_rsbtbl[]), selected in
+ * find_rsb() by masking the jhash of the resource name with
+ * (ls_rsbtbl_size - 1).
+ *
+ * list: rsbs in use; a hit here gets its reference count incremented.
+ * toss: rsbs whose release is being delayed; a hit here is moved back
+ *       onto 'list'.
+ * lock: write-locked by search_rsb()/find_rsb() around lookups and inserts.
+ */
+struct dlm_rsbtable {
+        struct list_head        list;
+        struct list_head        toss;
+        rwlock_t                lock;
+};
+struct dlm_lkbtable {
+        struct list_head        list;
+        rwlock_t                lock;
+        uint16_t                counter;
+};
+/*
+ * Lockspace member (per node in a ls)
+ */
+struct dlm_member {
+        struct list_head        list;
+        int                     nodeid;
+        int                     weight;
+};
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+struct dlm_recover {
+        struct list_head        list;
+        int                     *nodeids;
+        int                     node_count;
+        uint64_t                seq;
+};
+/*
+ * Pass input args to second stage locking function.
+ */
+struct dlm_args {
+        uint32_t                flags;
+        void                    *astaddr;
+        long                    astparam;
+        void                    *bastaddr;
+        int                     mode;
+        struct dlm_lksb         *lksb;
+};
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy      lock is mastered locally
+ *                 (lkb_nodeid is zero and DLM_IFL_MSTCPY is not set)
+ * process copy    lock is mastered on a remote node
+ *                 (lkb_nodeid is non-zero and DLM_IFL_MSTCPY is not set)
+ * master copy     master node's copy of a lock owned by remote node
+ *                 (lkb_nodeid is non-zero and DLM_IFL_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock.  The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h.  These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes.  One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and v.v.  Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY).  These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags.  These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast.  All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait.  DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed.  Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+/* lkb_ast_type */
+#define AST_COMP                1
+#define AST_BAST                2
+/* lkb_status */
+#define DLM_LKSTS_WAITING       1
+#define DLM_LKSTS_GRANTED       2
+#define DLM_LKSTS_CONVERT       3
+/* lkb_flags */
+#define DLM_IFL_MSTCPY          0x00010000
+#define DLM_IFL_RESEND          0x00020000
+#define DLM_IFL_DEAD            0x00040000
+#define DLM_IFL_USER            0x00000001
+#define DLM_IFL_ORPHAN          0x00000002
+struct dlm_lkb {
+        struct dlm_rsb          *lkb_resource;  /* the rsb */
+        struct kref             lkb_ref;
+        int                     lkb_nodeid;     /* copied from rsb */
+        int                     lkb_ownpid;     /* pid of lock owner */
+        uint32_t                lkb_id;         /* our lock ID */
+        uint32_t                lkb_remid;      /* lock ID on remote partner */
+        uint32_t                lkb_exflags;    /* external flags from caller */
+        uint32_t                lkb_sbflags;    /* lksb flags */
+        uint32_t                lkb_flags;      /* internal flags */
+        uint32_t                lkb_lvbseq;     /* lvb sequence number */
+        int8_t                  lkb_status;     /* granted, waiting, convert */
+        int8_t                  lkb_rqmode;     /* requested lock mode */
+        int8_t                  lkb_grmode;     /* granted lock mode */
+        int8_t                  lkb_bastmode;   /* requested mode */
+        int8_t                  lkb_highbast;   /* highest mode bast sent for */
+        int8_t                  lkb_wait_type;  /* type of reply waiting for */
+        int8_t                  lkb_ast_type;   /* type of ast queued for */
+        struct list_head        lkb_idtbl_list; /* lockspace lkbtbl */
+        struct list_head        lkb_statequeue; /* rsb g/c/w list */
+        struct list_head        lkb_rsb_lookup; /* waiting for rsb lookup */
+        struct list_head        lkb_wait_reply; /* waiting for remote reply */
+        struct list_head        lkb_astqueue;   /* need ast to be sent */
+        struct list_head        lkb_ownqueue;   /* list of locks for a process */
+        char                    *lkb_lvbptr;
+        struct dlm_lksb         *lkb_lksb;      /* caller's status block */
+        void                    *lkb_astaddr;   /* caller's ast function */
+        void                    *lkb_bastaddr;  /* caller's bast function */
+        long                    lkb_astparam;   /* caller's ast arg */
+};
+/*
+ * Resource block: one per named resource in a lockspace.  Locks (lkbs) on
+ * the resource sit on exactly one of the grant/convert/wait queues via
+ * lkb_statequeue.  create_rsb() fills in res_ls, res_length, res_name, the
+ * mutex and the list heads; find_rsb() then sets res_hash, res_bucket,
+ * res_nodeid and the reference count before hashing the rsb.
+ */
+struct dlm_rsb {
+        struct dlm_ls           *res_ls;        /* the lockspace */
+        struct kref             res_ref;
+        struct mutex            res_mutex;
+        unsigned long           res_flags;      /* bit numbers from enum rsb_flags */
+        int                     res_length;     /* length of rsb name */
+        int                     res_nodeid;     /* 0: local, > 0: remote (see is_remote()),
+                                                   -1: set by find_rsb() on a new rsb */
+        uint32_t                res_lvbseq;
+        uint32_t                res_hash;       /* jhash of res_name */
+        uint32_t                res_bucket;     /* rsbtbl */
+        unsigned long           res_toss_time;
+        uint32_t                res_first_lkid;
+        struct list_head        res_lookup;     /* lkbs waiting on first */
+        struct list_head        res_hashchain;  /* rsbtbl */
+        struct list_head        res_grantqueue;
+        struct list_head        res_convertqueue;
+        struct list_head        res_waitqueue;
+        struct list_head        res_root_list;      /* used for recovery */
+        struct list_head        res_recover_list;   /* used for recovery */
+        int                     res_recover_locks_count;
+        char                    *res_lvbptr;
+        char                    res_name[1];    /* res_length bytes, copied by create_rsb() */
+};
+/* find_rsb() flags */
+#define R_MASTER                1       /* only return rsb if it's a master */
+#define R_CREATE                2       /* create/add rsb if not found */
+/*
+ * rsb_flags: each enumerator is a bit number within dlm_rsb.res_flags,
+ * manipulated only through the three helpers below.
+ */
+enum rsb_flags {
+        RSB_MASTER_UNCERTAIN,
+        RSB_VALNOTVALID,
+        RSB_VALNOTVALID_PREV,
+        RSB_NEW_MASTER,
+        RSB_NEW_MASTER2,
+        RSB_RECOVER_CONVERT,
+        RSB_LOCKS_PURGED,
+};
+/*
+ * Set one flag bit.  Uses the non-atomic __set_bit().
+ * NOTE(review): callers presumably serialize updates (e.g. via res_mutex) -
+ * confirm.
+ */
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+        __set_bit(flag, &r->res_flags);
+}
+/* Clear one flag bit; non-atomic like rsb_set_flag(). */
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+        __clear_bit(flag, &r->res_flags);
+}
+/* Return test_bit()'s result for the given flag (non-zero when set). */
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+        return test_bit(flag, &r->res_flags);
+}
+/* dlm_header is first element of all structs sent between nodes */
+#define DLM_HEADER_MAJOR        0x00020000
+#define DLM_HEADER_MINOR        0x00000001
+#define DLM_MSG                 1
+#define DLM_RCOM                2
+struct dlm_header {
+        uint32_t                h_version;
+        uint32_t                h_lockspace;
+        uint32_t                h_nodeid;       /* nodeid of sender */
+        uint16_t                h_length;
+        uint8_t                 h_cmd;          /* DLM_MSG, DLM_RCOM */
+        uint8_t                 h_pad;
+};
+#define DLM_MSG_REQUEST         1
+#define DLM_MSG_CONVERT         2
+#define DLM_MSG_UNLOCK          3
+#define DLM_MSG_CANCEL          4
+#define DLM_MSG_REQUEST_REPLY   5
+#define DLM_MSG_CONVERT_REPLY   6
+#define DLM_MSG_UNLOCK_REPLY    7
+#define DLM_MSG_CANCEL_REPLY    8
+#define DLM_MSG_GRANT           9
+#define DLM_MSG_BAST            10
+#define DLM_MSG_LOOKUP          11
+#define DLM_MSG_REMOVE          12
+#define DLM_MSG_LOOKUP_REPLY    13
+struct dlm_message {
+        struct dlm_header       m_header;
+        uint32_t                m_type;         /* DLM_MSG_ */
+        uint32_t                m_nodeid;
+        uint32_t                m_pid;
+        uint32_t                m_lkid;         /* lkid on sender */
+        uint32_t                m_remid;        /* lkid on receiver */
+        uint32_t                m_parent_lkid;
+        uint32_t                m_parent_remid;
+        uint32_t                m_exflags;
+        uint32_t                m_sbflags;
+        uint32_t                m_flags;
+        uint32_t                m_lvbseq;
+        uint32_t                m_hash;
+        int                     m_status;
+        int                     m_grmode;
+        int                     m_rqmode;
+        int                     m_bastmode;
+        int                     m_asts;
+        int                     m_result;       /* 0 or -EXXX */
+        char                    m_extra[0];     /* name or lvb */
+};
+#define DLM_RS_NODES            0x00000001
+#define DLM_RS_NODES_ALL        0x00000002
+#define DLM_RS_DIR              0x00000004
+#define DLM_RS_DIR_ALL          0x00000008
+#define DLM_RS_LOCKS            0x00000010
+#define DLM_RS_LOCKS_ALL        0x00000020
+#define DLM_RS_DONE             0x00000040
+#define DLM_RS_DONE_ALL         0x00000080
+#define DLM_RCOM_STATUS         1
+#define DLM_RCOM_NAMES          2
+#define DLM_RCOM_LOOKUP         3
+#define DLM_RCOM_LOCK           4
+#define DLM_RCOM_STATUS_REPLY   5
+#define DLM_RCOM_NAMES_REPLY    6
+#define DLM_RCOM_LOOKUP_REPLY   7
+#define DLM_RCOM_LOCK_REPLY     8
+struct dlm_rcom {
+        struct dlm_header       rc_header;
+        uint32_t                rc_type;        /* DLM_RCOM_ */
+        int                     rc_result;      /* multi-purpose */
+        uint64_t                rc_id;          /* match reply with request */
+        char                    rc_buf[0];
+};
+struct rcom_config {
+        uint32_t                rf_lvblen;
+        uint32_t                rf_lsflags;
+        uint64_t                rf_unused;
+};
+struct rcom_lock {
+        uint32_t                rl_ownpid;
+        uint32_t                rl_lkid;
+        uint32_t                rl_remid;
+        uint32_t                rl_parent_lkid;
+        uint32_t                rl_parent_remid;
+        uint32_t                rl_exflags;
+        uint32_t                rl_flags;
+        uint32_t                rl_lvbseq;
+        int                     rl_result;
+        int8_t                  rl_rqmode;
+        int8_t                  rl_grmode;
+        int8_t                  rl_status;
+        int8_t                  rl_asts;
+        uint16_t                rl_wait_type;
+        uint16_t                rl_namelen;
+        char                    rl_name[DLM_RESNAME_MAXLEN];
+        char                    rl_lvb[0];
+};
+struct dlm_ls {
+        struct list_head        ls_list;        /* list of lockspaces */
+        dlm_lockspace_t         *ls_local_handle;
+        uint32_t                ls_global_id;   /* global unique lockspace ID */
+        uint32_t                ls_exflags;
+        int                     ls_lvblen;
+        int                     ls_count;       /* reference count */
+        unsigned long           ls_flags;       /* LSFL_ */
+        struct kobject          ls_kobj;
+        struct dlm_rsbtable     *ls_rsbtbl;
+        uint32_t                ls_rsbtbl_size;
+        struct dlm_lkbtable     *ls_lkbtbl;
+        uint32_t                ls_lkbtbl_size;
+        struct dlm_dirtable     *ls_dirtbl;
+        uint32_t                ls_dirtbl_size;
+        struct mutex            ls_waiters_mutex;
+        struct list_head        ls_waiters;     /* lkbs needing a reply */
+        struct list_head        ls_nodes;       /* current nodes in ls */
+        struct list_head        ls_nodes_gone;  /* dead node list, recovery */
+        int                     ls_num_nodes;   /* number of nodes in ls */
+        int                     ls_low_nodeid;
+        int                     ls_total_weight;
+        int                     *ls_node_array;
+        struct dlm_rsb          ls_stub_rsb;    /* for returning errors */
+        struct dlm_lkb          ls_stub_lkb;    /* for returning errors */
+        struct dlm_message      ls_stub_ms;     /* for faking a reply */
+        struct dentry           *ls_debug_rsb_dentry; /* debugfs */
+        struct dentry           *ls_debug_waiters_dentry; /* debugfs */
+        wait_queue_head_t       ls_uevent_wait; /* user part of join/leave */
+        int                     ls_uevent_result;
+        struct miscdevice       ls_device;
+        /* recovery related */
+        struct timer_list       ls_timer;
+        struct task_struct      *ls_recoverd_task;
+        struct mutex            ls_recoverd_active;
+        spinlock_t              ls_recover_lock;
+        uint32_t                ls_recover_status; /* DLM_RS_ */
+        uint64_t                ls_recover_seq;
+        struct dlm_recover      *ls_recover_args;
+        struct rw_semaphore     ls_in_recovery; /* block local requests */
+        struct list_head        ls_requestqueue;/* queue remote requests */
+        struct mutex            ls_requestqueue_mutex;
+        char                    *ls_recover_buf;
+        int                     ls_recover_nodeid; /* for debugging */
+        uint64_t                ls_rcom_seq;
+        struct list_head        ls_recover_list;
+        spinlock_t              ls_recover_list_lock;
+        int                     ls_recover_list_count;
+        wait_queue_head_t       ls_wait_general;
+        struct mutex            ls_clear_proc_locks;
+        struct list_head        ls_root_list;   /* root resources */
+        struct rw_semaphore     ls_root_sem;    /* protect root_list */
+        int                     ls_namelen;
+        char                    ls_name[1];
+};
+#define LSFL_WORK               0
+#define LSFL_RUNNING            1
+#define LSFL_RECOVERY_STOP      2
+#define LSFL_RCOM_READY         3
+#define LSFL_UEVENT_WAIT        4
+/* much of this is just saving user space pointers associated with the
+   lock that we pass back to the user lib with an ast */
+struct dlm_user_args {
+        struct dlm_user_proc    *proc; /* each process that opens the lockspace
+                                          device has private data
+                                          (dlm_user_proc) on the struct file,
+                                          the process's locks point back to it*/
+        struct dlm_lksb         lksb;
+        int                     old_mode;
+        int                     update_user_lvb;
+        struct dlm_lksb __user  *user_lksb;
+        void __user             *castparam;
+        void __user             *castaddr;
+        void __user             *bastparam;
+        void __user             *bastaddr;
+};
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT  2
+/* locks list is kept so we can remove all a process's locks when it
+   exits (or orphan those that are persistent) */
+struct dlm_user_proc {
+        dlm_lockspace_t         *lockspace;
+        unsigned long           flags; /* DLM_PROC_FLAGS */
+        struct list_head        asts;
+        spinlock_t              asts_spin;
+        struct list_head        locks;
+        spinlock_t              locks_spin;
+        wait_queue_head_t       wait;
+};
+/* Returns 1 while the LSFL_RUNNING bit of ls_flags is clear, 0 otherwise. */
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+        return test_bit(LSFL_RUNNING, &ls->ls_flags) == 0;
+}
+/* Returns test_bit()'s result for LSFL_RECOVERY_STOP (non-zero when set). */
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+        int stop_requested = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+        return stop_requested;
+}
+/* Returns 1 if the lockspace was created with DLM_LSFL_NODIR, else 0. */
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+        return !!(ls->ls_exflags & DLM_LSFL_NODIR);
+}
+#endif                          /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+/* Central locking logic has four stages:
+   dlm_lock()
+   dlm_unlock()
+   request_lock(ls, lkb)
+   convert_lock(ls, lkb)
+   unlock_lock(ls, lkb)
+   cancel_lock(ls, lkb)
+   _request_lock(r, lkb)
+   _convert_lock(r, lkb)
+   _unlock_lock(r, lkb)
+   _cancel_lock(r, lkb)
+   do_request(r, lkb)
+   do_convert(r, lkb)
+   do_unlock(r, lkb)
+   do_cancel(r, lkb)
+   Stage 1 (lock, unlock) is mainly about checking input args and
+   splitting into one of the four main operations:
+       dlm_lock          = request_lock
+       dlm_lock+CONVERT  = convert_lock
+       dlm_unlock        = unlock_lock
+       dlm_unlock+CANCEL = cancel_lock
+   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+   provided to the next stage.
+   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+   When remote, it calls send_xxxx(), when local it calls do_xxxx().
+   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
+   given rsb and lkb and queues callbacks.
+   For remote operations, send_xxxx() results in the corresponding do_xxxx()
+   function being executed on the remote node.  The connecting send/receive
+   calls on local (L) and remote (R) nodes:
+   L: send_xxxx()              ->  R: receive_xxxx()
+                                   R: do_xxxx()
+   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
+*/
+#include <linux/types.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                                    struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+/*
+ * Lock compatibility matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
+ */
+static const int __dlm_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
+        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
+        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
+        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
+        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
+        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ */
+const int dlm_lvb_operations[8][8] = {
+        /* UN   NL  CR  CW  PR  PW  EX  PD*/
+        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
+        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
+        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
+        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
+        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
+        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
+};
+#define modes_compat(gr, rq) \
+        __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+/*
+ * Look up two lock modes in __dlm_compat_matrix.  Both indices are offset
+ * by one, as described in the matrix usage comment.  No range checking is
+ * done on either mode.
+ */
+int dlm_modes_compat(int mode1, int mode2)
+{
+        const int *row = __dlm_compat_matrix[mode1 + 1];
+        return row[mode2 + 1];
+}
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+static const int __quecvt_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
+        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
+        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
+        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
+        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
+        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+/*
+ * Debug helper: print the identifying and state fields of one lkb
+ * (nodeid, ids, flag words, status, modes, wait_type, ast_type) at KERN_ERR.
+ */
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+        printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+               "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+               lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+               lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+               lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+/*
+ * Debug helper: print one rsb's nodeid, flags, first lkid, recover lock
+ * count and name at KERN_ERR.
+ * NOTE(review): res_name is printed with %s, but create_rsb() fills it with
+ * memcpy() of res_length bytes and adds no terminator itself; this
+ * presumably relies on allocate_rsb() returning zeroed memory - confirm.
+ */
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+        printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
+               r->res_nodeid, r->res_flags, r->res_first_lkid,
+               r->res_recover_locks_count, r->res_name);
+}
+/*
+ * Debug helper: print the rsb summary line, whether its two recovery lists
+ * are empty, and then every lkb on the lookup list and on the grant,
+ * convert and wait queues.  Takes no locks itself.
+ */
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+        struct dlm_lkb *lkb;
+        dlm_print_rsb(r);
+        printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+               list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
+        /* the lookup list links lkbs through lkb_rsb_lookup ... */
+        printk(KERN_ERR "rsb lookup list\n");
+        list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+                dlm_print_lkb(lkb);
+        /* ... while the three state queues all link through lkb_statequeue */
+        printk(KERN_ERR "rsb grant queue:\n");
+        list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+                dlm_print_lkb(lkb);
+        printk(KERN_ERR "rsb convert queue:\n");
+        list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+                dlm_print_lkb(lkb);
+        printk(KERN_ERR "rsb wait queue:\n");
+        list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+                dlm_print_lkb(lkb);
+}
+/*
+ * Threads cannot use the lockspace while it's being recovered.  These
+ * wrappers take and release ls_in_recovery for reading.
+ */
+/* Take ls_in_recovery for reading via down_read(). */
+static inline void lock_recovery(struct dlm_ls *ls)
+{
+        down_read(&ls->ls_in_recovery);
+}
+/* Drop the read hold taken by lock_recovery() or lock_recovery_try(). */
+static inline void unlock_recovery(struct dlm_ls *ls)
+{
+        up_read(&ls->ls_in_recovery);
+}
+/* Non-blocking variant; returns down_read_trylock()'s result unchanged. */
+static inline int lock_recovery_try(struct dlm_ls *ls)
+{
+        return down_read_trylock(&ls->ls_in_recovery);
+}
+/* Returns 1 unless DLM_LKF_NOQUEUE is set in the caller's flags. */
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+        return (lkb->lkb_exflags & DLM_LKF_NOQUEUE) == 0;
+}
+/* Returns the DLM_LKF_NOQUEUEBAST bit of lkb_exflags (non-zero when set). */
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+        uint32_t noqueuebast = lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST;
+        return noqueuebast;
+}
+/* Returns the DLM_SBF_DEMOTED bit of lkb_sbflags (non-zero when set). */
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+        uint32_t demoted = lkb->lkb_sbflags & DLM_SBF_DEMOTED;
+        return demoted;
+}
+/*
+ * Returns 1 when the rsb has a non-zero res_nodeid, 0 when it is zero.
+ * Asserts that res_nodeid is not negative.
+ */
+static inline int is_remote(struct dlm_rsb *r)
+{
+        DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+        return r->res_nodeid != 0;
+}
+/* Returns 1 for an lkb with non-zero lkb_nodeid and DLM_IFL_MSTCPY clear. */
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+        if (!lkb->lkb_nodeid)
+                return 0;
+        return !(lkb->lkb_flags & DLM_IFL_MSTCPY);
+}
+/*
+ * Returns 1 when DLM_IFL_MSTCPY is set in lkb_flags, else 0.  A master
+ * copy is asserted to carry a non-zero lkb_nodeid.
+ */
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+        const int mstcpy = (lkb->lkb_flags & DLM_IFL_MSTCPY) != 0;
+        if (mstcpy) {
+                DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+        }
+        return mstcpy;
+}
+/* Returns 1 for a conversion between PR and CW in either direction, else 0. */
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+        const int granted = lkb->lkb_grmode;
+        const int requested = lkb->lkb_rqmode;
+        return (granted == DLM_LOCK_PR && requested == DLM_LOCK_CW) ||
+               (requested == DLM_LOCK_PR && granted == DLM_LOCK_CW);
+}
+/*
+ * Returns 1 when the requested mode is numerically lower than the granted
+ * mode and the conversion is not a PR<->CW "middle" conversion.
+ */
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+        if (middle_conversion(lkb))
+                return 0;
+        return lkb->lkb_rqmode < lkb->lkb_grmode;
+}
+/*
+ * Queue a completion ast for lkb: store rv and the lkb's sbflags in the
+ * caller's status block, then hand the lkb to dlm_add_ast().  Does nothing
+ * for master copies.  The r argument is not used.
+ */
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+        struct dlm_lksb *sb;
+        if (is_master_copy(lkb))
+                return;
+        DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+        sb = lkb->lkb_lksb;
+        sb->sb_flags = lkb->lkb_sbflags;
+        sb->sb_status = rv;
+        dlm_add_ast(lkb, AST_COMP);
+}
+/*
+ * Deliver a blocking ast for rqmode.  For a master copy it is passed to
+ * send_bast(); otherwise the mode is recorded in lkb_bastmode and the lkb
+ * is queued with dlm_add_ast().
+ */
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+        if (!is_master_copy(lkb)) {
+                lkb->lkb_bastmode = rqmode;
+                dlm_add_ast(lkb, AST_BAST);
+                return;
+        }
+        send_bast(r, lkb, rqmode);
+}
+/*
+ * Basic operations on rsb's and lkb's
+ */
+/*
+ * Allocate an rsb via allocate_rsb() and initialise the fields this layer
+ * owns: lockspace pointer, name and its length, the mutex and every list
+ * head.  Returns NULL if the allocation fails.  Hash, bucket, nodeid and
+ * the reference count are left for the caller to set.
+ */
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+        struct dlm_rsb *rsb = allocate_rsb(ls, len);
+        if (rsb == NULL)
+                return NULL;
+        /* identity */
+        rsb->res_ls = ls;
+        memcpy(rsb->res_name, name, len);
+        rsb->res_length = len;
+        /* locking and the initially empty queues */
+        mutex_init(&rsb->res_mutex);
+        INIT_LIST_HEAD(&rsb->res_grantqueue);
+        INIT_LIST_HEAD(&rsb->res_convertqueue);
+        INIT_LIST_HEAD(&rsb->res_waitqueue);
+        INIT_LIST_HEAD(&rsb->res_lookup);
+        INIT_LIST_HEAD(&rsb->res_root_list);
+        INIT_LIST_HEAD(&rsb->res_recover_list);
+        return rsb;
+}
+/*
+ * Look for the rsb named name/len on one hash chain.
+ *
+ * Returns 0 with *r_ret set to the match; -ENOTBLK (with *r_ret still set
+ * to the match) when R_MASTER was requested but the match has a non-zero
+ * res_nodeid; -EBADR with *r_ret set to NULL when nothing on the chain
+ * matches.  No reference is taken here.
+ */
+static int search_rsb_list(struct list_head *head, char *name, int len,
+                           unsigned int flags, struct dlm_rsb **r_ret)
+{
+        struct dlm_rsb *r;
+        int error = 0;
+        list_for_each_entry(r, head, res_hashchain) {
+                if (len == r->res_length && !memcmp(name, r->res_name, len))
+                        goto found;
+        }
+        /*
+         * Always hand back a defined pointer: _search_rsb() copies its
+         * local rsb pointer out to its own caller even on failure.
+         */
+        *r_ret = NULL;
+        return -EBADR;
+ found:
+        if (r->res_nodeid && (flags & R_MASTER))
+                error = -ENOTBLK;
+        *r_ret = r;
+        return error;
+}
+/*
+ * Search bucket b for the named rsb, first on the in-use list and then on
+ * the toss list.  Both callers in this file hold ls_rsbtbl[b].lock for
+ * writing around the call.
+ *
+ * Returns 0 with *r_ret set: a hit on the in-use list gets an extra
+ * reference; a hit on the toss list is moved back to the in-use list and,
+ * unless the lockspace runs without a directory, has its master-uncertain
+ * state refreshed.
+ * Returns -ENOTBLK when the rsb exists but R_MASTER was requested and it
+ * is not a master copy, and -EBADR when it is on neither list.  On -EBADR
+ * *r_ret is NULL; no reference is taken on any error path.
+ */
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+                       unsigned int flags, struct dlm_rsb **r_ret)
+{
+        /* initialised so that a double miss never exports stack garbage */
+        struct dlm_rsb *r = NULL;
+        int error;
+        error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+        if (!error) {
+                kref_get(&r->res_ref);
+                goto out;
+        }
+        /*
+         * The rsb is in use but is not a master copy.  Report that now;
+         * falling through to the toss list would overwrite the result
+         * with -EBADR and make the rsb look nonexistent.
+         */
+        if (error == -ENOTBLK)
+                goto out;
+        error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+        if (error)
+                goto out;
+        list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+        if (dlm_no_directory(ls))
+                goto out;
+        /*
+         * A revived rsb may carry a stale master: keep "unknown" (-1) as
+         * is, and flag a remembered remote master as uncertain.
+         */
+        if (r->res_nodeid == -1) {
+                rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+                r->res_first_lkid = 0;
+        } else if (r->res_nodeid > 0) {
+                rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+                r->res_first_lkid = 0;
+        } else {
+                DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
+                DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
+        }
+ out:
+        *r_ret = r;
+        return error;
+}
+/*
+ * Locked wrapper around _search_rsb(): holds bucket b's rwlock for writing
+ * for the duration of the lookup and returns _search_rsb()'s result.
+ */
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+                      unsigned int flags, struct dlm_rsb **r_ret)
+{
+        struct dlm_rsbtable *tbl = &ls->ls_rsbtbl[b];
+        int rv;
+        write_lock(&tbl->lock);
+        rv = _search_rsb(ls, name, len, b, flags, r_ret);
+        write_unlock(&tbl->lock);
+        return rv;
+}
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Releasing rsb's lazily gives much the same benefit as an application
+ * holding NL locks on them, except that the cached master value is not
+ * guaranteed to still be valid when the rsb is reused.  Applications do not
+ * always keep NL locks on rsb's they are about to lock again, and without
+ * the delay that would cause excessive master lookups and removals.
+ *
+ * A search covers both the normal list and the toss list.  An rsb found on
+ * the toss list is moved to the normal list with a ref count of 1; an rsb
+ * found on the normal list has its ref count incremented.
+ *
+ * Returns 0 with *r_ret set to the found or newly created rsb, -EBADR if
+ * nothing was found and R_CREATE was not requested, -ENOTBLK if an rsb was
+ * found but is not a master copy, or -ENOMEM if create_rsb() failed.
+ */
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+                    unsigned int flags, struct dlm_rsb **r_ret)
+{
+        struct dlm_rsb *r, *existing;
+        uint32_t hash, bucket;
+        int error = 0;
+        /* without a directory every lookup may create */
+        if (dlm_no_directory(ls))
+                flags |= R_CREATE;
+        hash = jhash(name, namelen, 0);
+        bucket = hash & (ls->ls_rsbtbl_size - 1);
+        error = search_rsb(ls, name, namelen, bucket, flags, &r);
+        /* finished when found, when found but not a master copy, or when
+           missing and the caller did not ask for creation */
+        if (!error || error == -ENOTBLK ||
+            (error == -EBADR && !(flags & R_CREATE)))
+                goto out;
+        r = create_rsb(ls, name, namelen);
+        if (!r) {
+                error = -ENOMEM;
+                goto out;
+        }
+        r->res_hash = hash;
+        r->res_bucket = bucket;
+        r->res_nodeid = -1;
+        kref_init(&r->res_ref);
+        /* With no directory, the master can be set immediately */
+        if (dlm_no_directory(ls)) {
+                int nodeid = dlm_dir_nodeid(r);
+                r->res_nodeid = (nodeid == dlm_our_nodeid()) ? 0 : nodeid;
+        }
+        /* search again under the lock: the rsb may have been added while
+           the new one was being set up unlocked */
+        write_lock(&ls->ls_rsbtbl[bucket].lock);
+        error = _search_rsb(ls, name, namelen, bucket, 0, &existing);
+        if (error) {
+                list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+                write_unlock(&ls->ls_rsbtbl[bucket].lock);
+                error = 0;
+        } else {
+                write_unlock(&ls->ls_rsbtbl[bucket].lock);
+                free_rsb(r);
+                r = existing;
+        }
+ out:
+        *r_ret = r;
+        return error;
+}
+/* Non-static entry point for find_rsb(); arguments and return value are
+   passed through unchanged. */
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+                 unsigned int flags, struct dlm_rsb **r_ret)
+{
+        int rv = find_rsb(ls, name, namelen, flags, r_ret);
+        return rv;
+}
+/* Take one more reference on an rsb.  Callers already hold a valid
+   reference, which is why no lock is taken here. */
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+        struct kref *ref = &r->res_ref;
+        kref_get(ref);
+}
+/* Non-static counterpart of hold_rsb(): adds a reference to r. */
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+        kref_get(&r->res_ref);
+}
+/* kref release callback passed to kref_put() by put_rsb()/unhold_rsb().
+   Rather than freeing the rsb, it re-initialises the refcount, moves the
+   rsb onto its bucket's toss list, records the toss time and frees any
+   lvb buffer. */
+static void toss_rsb(struct kref *kref)
+{
+        struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+        struct list_head *toss = &r->res_ls->ls_rsbtbl[r->res_bucket].toss;
+        DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+        kref_init(&r->res_ref);
+        list_move(&r->res_hashchain, toss);
+        r->res_toss_time = jiffies;
+        if (!r->res_lvbptr)
+                return;
+        free_lvb(r->res_lvbptr);
+        r->res_lvbptr = NULL;
+}
+/* Drop a reference on an rsb under its bucket's write lock.  When all
+   references are gone, toss_rsb() transfers the rsb to the toss list for
+   later disposal. */
+static void put_rsb(struct dlm_rsb *r)
+{
+        struct dlm_ls *lockspace = r->res_ls;
+        uint32_t b = r->res_bucket;
+        write_lock(&lockspace->ls_rsbtbl[b].lock);
+        kref_put(&r->res_ref, toss_rsb);
+        write_unlock(&lockspace->ls_rsbtbl[b].lock);
+}
+/* Non-static wrapper around put_rsb(): drops one reference on r. */
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+        put_rsb(r);
+}
+/* See comment for unhold_lkb.  Drops a reference without taking the bucket
+   lock and asserts that kref_put() did not report a release. */
+static void unhold_rsb(struct dlm_rsb *r)
+{
+        int released = kref_put(&r->res_ref, toss_rsb);
+        DLM_ASSERT(!released, dlm_dump_rsb(r););
+}
+/* kref release callback used by shrink_bucket() when dropping the reference
+   on a tossed rsb.  It frees nothing itself; it only asserts that the rsb is
+   no longer linked on any of its lookup, grant, convert, wait, root or
+   recover lists. */
+static void kill_rsb(struct kref *kref)
+{
+        struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+        /* All work is done after the return from kref_put() so we
+           can release the write_lock before the remove and free. */
+        DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+        DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+        DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+        DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+        DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+        DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+/* Attaching and detaching lkb's drives rsb reference counting: an rsb has
+   to stay around for as long as any lkb points at it.  attach_lkb() points
+   the lkb at r and takes a reference on r for it. */
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        lkb->lkb_resource = r;
+        hold_rsb(r);
+}
+/* Undo attach_lkb(): if the lkb points at an rsb, drop that reference via
+   put_rsb() and clear the pointer.  Does nothing for an unattached lkb. */
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+        struct dlm_rsb *r = lkb->lkb_resource;
+        if (!r)
+                return;
+        put_rsb(r);
+        lkb->lkb_resource = NULL;
+}
+/* Allocate a new lkb, give it an id that is unused within a randomly chosen
+   lkbtbl bucket, and link it into that bucket's list.  The id's low 16 bits
+   are the bucket index and the bits above come from the bucket's counter.
+   Returns 0 with *lkb_ret set, or -ENOMEM if allocation fails. */
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+        struct dlm_lkb *lkb, *cur;
+        uint32_t lkid;
+        uint16_t bucket;
+        lkb = allocate_lkb(ls);
+        if (!lkb)
+                return -ENOMEM;
+        lkb->lkb_nodeid = -1;
+        lkb->lkb_grmode = DLM_LOCK_IV;
+        kref_init(&lkb->lkb_ref);
+        INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+        get_random_bytes(&bucket, sizeof(bucket));
+        bucket &= (ls->ls_lkbtbl_size - 1);
+        write_lock(&ls->ls_lkbtbl[bucket].lock);
+        /* counter can roll over so we must verify lkid is not in use;
+           an id of 0 is treated as "try again" */
+        do {
+                lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+                list_for_each_entry(cur, &ls->ls_lkbtbl[bucket].list,
+                                    lkb_idtbl_list) {
+                        if (cur->lkb_id == lkid) {
+                                lkid = 0;
+                                break;
+                        }
+                }
+        } while (lkid == 0);
+        lkb->lkb_id = lkid;
+        list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+        write_unlock(&ls->ls_lkbtbl[bucket].lock);
+        *lkb_ret = lkb;
+        return 0;
+}
+/* Unlocked lookup: walk the lkbtbl bucket selected by the low 16 bits of
+   lkid and return the lkb with that id, or NULL if there is none.  Takes no
+   lock and no reference, and does not range-check the bucket index. */
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+        uint16_t b = lkid & 0xFFFF;
+        struct list_head *chain = &ls->ls_lkbtbl[b].list;
+        struct dlm_lkb *cur;
+        list_for_each_entry(cur, chain, lkb_idtbl_list)
+                if (cur->lkb_id == lkid)
+                        return cur;
+        return NULL;
+}
+/* Look up an lkb by id under the bucket's read lock and take a reference
+   on it.  *lkb_ret receives the lkb (NULL when not found).
+   Returns 0 on success, -EBADSLT if the bucket encoded in lkid is out of
+   range (in which case *lkb_ret is not written), -ENOENT if no lkb has
+   that id. */
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+        uint16_t bucket = lkid & 0xFFFF;
+        struct dlm_lkb *found;
+        if (bucket >= ls->ls_lkbtbl_size)
+                return -EBADSLT;
+        read_lock(&ls->ls_lkbtbl[bucket].lock);
+        found = __find_lkb(ls, lkid);
+        if (found)
+                kref_get(&found->lkb_ref);
+        read_unlock(&ls->ls_lkbtbl[bucket].lock);
+        *lkb_ret = found;
+        if (!found)
+                return -ENOENT;
+        return 0;
+}
+/* kref release callback for an lkb.  It frees nothing itself; it only
+   asserts that the lkb has no status set (lkb_status == 0). */
+static void kill_lkb(struct kref *kref)
+{
+        struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+        /* All work is done after the return from kref_put() so we
+           can release the write_lock before the detach_lkb */
+        DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+/* __put_lkb() takes the lockspace explicitly because the lkb may not have
+   an rsb attached through which it could be found.
+   Drops one reference under the lkbtbl bucket lock.  If that was the last
+   one, the lkb is unlinked from the id table, detached from its rsb and
+   freed, and 1 is returned; otherwise 0 is returned. */
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        uint16_t bucket = lkb->lkb_id & 0xFFFF;
+        write_lock(&ls->ls_lkbtbl[bucket].lock);
+        if (!kref_put(&lkb->lkb_ref, kill_lkb)) {
+                write_unlock(&ls->ls_lkbtbl[bucket].lock);
+                return 0;
+        }
+        list_del(&lkb->lkb_idtbl_list);
+        write_unlock(&ls->ls_lkbtbl[bucket].lock);
+        detach_lkb(lkb);
+        /* for local/process lkbs, lvbptr points to caller's lksb */
+        if (lkb->lkb_lvbptr && is_master_copy(lkb))
+                free_lvb(lkb->lkb_lvbptr);
+        free_lkb(lkb);
+        return 1;
+}
+/* Drop a reference on an lkb that is attached to an rsb; the lockspace is
+   taken from that rsb.  Asserts that both the rsb and its lockspace pointer
+   are set.  Returns the result of __put_lkb(). */
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+        DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+        DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+        return __put_lkb(lkb->lkb_resource->res_ls, lkb);
+}
+/* Take one more reference on an lkb.  Callers already hold a valid
+   reference, which is why no lock is taken here. */
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+        struct kref *ref = &lkb->lkb_ref;
+        kref_get(ref);
+}
+/* Remove a reference when we are certain it is not the last one, e.g.
+   del_lkb is always called between a find_lkb/put_lkb pair and always
+   undoes an earlier add_lkb.  put_lkb would also work, but would take the
+   table lock for nothing.  Asserts that kref_put() did not report a
+   release. */
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+        int released = kref_put(&lkb->lkb_ref, kill_lkb);
+        DLM_ASSERT(!released, dlm_print_lkb(lkb););
+}
+/* Insert 'new' into the lkb_statequeue list at 'head', immediately before
+   the first entry whose lkb_rqmode is lower than 'mode', or at the tail if
+   there is no such entry (including when the list is empty).
+   The insertion point is tracked explicitly: list_for_each_entry() never
+   leaves its cursor NULL, so testing the cursor after the loop cannot
+   detect "not found", and using it there would treat the list head as if
+   it were embedded in an lkb.
+   NOTE(review): the caller in add_lkb passes lkb_grmode while entries are
+   compared by lkb_rqmode -- confirm that this is intended. */
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+                            int mode)
+{
+        struct list_head *pos = head;
+        struct dlm_lkb *lkb;
+        list_for_each_entry(lkb, head, lkb_statequeue) {
+                if (lkb->lkb_rqmode < mode) {
+                        pos = &lkb->lkb_statequeue;
+                        break;
+                }
+        }
+        /* list_add_tail() links 'new' just before pos; pos == head means
+           the end of the list */
+        list_add_tail(new, pos);
+}
+/* add/remove lkb to rsb's grant/convert/wait queue
+   add_lkb() takes a reference on the lkb, records the new status and links
+   the lkb onto the queue matching that status.  The lkb must not already
+   have a status. */
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+        struct list_head *queue;
+        kref_get(&lkb->lkb_ref);
+        DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+        lkb->lkb_status = status;
+        switch (status) {
+        case DLM_LKSTS_GRANTED:
+                /* convention says granted locks kept in order of grmode */
+                lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+                                lkb->lkb_grmode);
+                return;
+        case DLM_LKSTS_WAITING:
+                queue = &r->res_waitqueue;
+                break;
+        case DLM_LKSTS_CONVERT:
+                queue = &r->res_convertqueue;
+                break;
+        default:
+                DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+                return;
+        }
+        /* wait and convert queues: HEADQUE goes to the front, everything
+           else to the back */
+        if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+                list_add(&lkb->lkb_statequeue, queue);
+        else
+                list_add_tail(&lkb->lkb_statequeue, queue);
+}
+/* Inverse of add_lkb(): unlink the lkb from its status queue, clear its
+   status and drop the reference add_lkb() took.  r is not used. */
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        list_del(&lkb->lkb_statequeue);
+        lkb->lkb_status = 0;
+        unhold_lkb(lkb);
+}
+/* Move an lkb from its current status queue to the queue for status sts.
+   An extra reference is held across the del/add pair and dropped again
+   afterwards. */
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+        kref_get(&lkb->lkb_ref);
+        del_lkb(r, lkb);
+        add_lkb(r, lkb, sts);
+        unhold_lkb(lkb);
+}
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node
+   add_to_waiters() records mstype in lkb_wait_type, takes a reference and
+   links the lkb on ls_waiters, all under ls_waiters_mutex.  If the lkb
+   already has a wait type the condition is logged and nothing changes. */
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+        struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+        mutex_lock(&ls->ls_waiters_mutex);
+        if (lkb->lkb_wait_type) {
+                log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+        } else {
+                lkb->lkb_wait_type = mstype;
+                kref_get(&lkb->lkb_ref);
+                list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+}
+/* Unlocked part of remove_from_waiters(): clear lkb_wait_type, unlink the
+   lkb from the waiters list and drop the reference add_to_waiters() took.
+   Returns 0, or -EINVAL (after logging) if the lkb has no wait type. */
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+        if (!lkb->lkb_wait_type) {
+                log_print("remove_from_waiters error");
+                return -EINVAL;
+        }
+        lkb->lkb_wait_type = 0;
+        list_del(&lkb->lkb_wait_reply);
+        unhold_lkb(lkb);
+        return 0;
+}
+/* Run _remove_from_waiters() with the lockspace's ls_waiters_mutex held and
+   return its result. */
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+        struct dlm_ls *lockspace = lkb->lkb_resource->res_ls;
+        int rv;
+        mutex_lock(&lockspace->ls_waiters_mutex);
+        rv = _remove_from_waiters(lkb);
+        mutex_unlock(&lockspace->ls_waiters_mutex);
+        return rv;
+}
+/* Remove the directory entry for rsb r.  Does nothing when the lockspace
+   has no directory.  If the directory node for r is this node the entry is
+   removed locally, otherwise send_remove() is called. */
+static void dir_remove(struct dlm_rsb *r)
+{
+        int dir_nodeid;
+        if (dlm_no_directory(r->res_ls))
+                return;
+        dir_nodeid = dlm_dir_nodeid(r);
+        if (dir_nodeid == dlm_our_nodeid())
+                dlm_dir_remove_entry(r->res_ls, dir_nodeid,
+                                     r->res_name, r->res_length);
+        else
+                send_remove(r);
+}
+/* Free the rsb's on bucket b's toss list whose toss time is at least
+   dlm_config.toss_secs seconds old.  Returns the number of rsb's freed.
+   FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+   found since they are in order of newest to oldest? */
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+        struct dlm_rsb *r;
+        int count = 0, found;
+        /* each pass handles at most one rsb, because the bucket lock is
+           dropped before the rsb is removed from the directory and freed */
+        for (;;) {
+                found = 0;
+                write_lock(&ls->ls_rsbtbl[b].lock);
+                /* scan from the tail of the toss list for a due rsb */
+                list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+                                            res_hashchain) {
+                        if (!time_after_eq(jiffies, r->res_toss_time +
+                                           dlm_config.toss_secs * HZ))
+                                continue;
+                        found = 1;
+                        break;
+                }
+                if (!found) {
+                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        break;
+                }
+                /* kill_rsb() only asserts; the unlink and free happen here
+                   once kref_put() reports that the last reference is gone */
+                if (kref_put(&r->res_ref, kill_rsb)) {
+                        list_del(&r->res_hashchain);
+                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        if (is_master(r))
+                                dir_remove(r);
+                        free_rsb(r);
+                        count++;
+                } else {
+                        /* reference still held elsewhere: log and rescan */
+                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        log_error(ls, "tossed rsb in use %s", r->res_name);
+                }
+        }
+        return count;
+}
+/* Run shrink_bucket() over every rsbtbl bucket of the lockspace, calling
+   cond_resched() after each one.  Does nothing while locking is stopped. */
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+        int b = 0;
+        if (dlm_locking_stopped(ls))
+                return;
+        while (b < ls->ls_rsbtbl_size) {
+                shrink_bucket(ls, b);
+                cond_resched();
+                b++;
+        }
+}
+/* lkb is master or local copy
+   Perform the lvb operation that dlm_lvb_operations selects for this
+   grmode -> rqmode transition, then propagate RSB_VALNOTVALID from the rsb
+   to the lkb's status flags. */
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        int len = r->res_ls->ls_lvblen;
+        /* op=1 lvb returned to caller
+           op=0 lvb written to rsb or invalidated
+           op=-1 do nothing */
+        int op = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+        switch (op) {
+        case 1:
+                if (!lkb->lkb_lvbptr ||
+                    !(lkb->lkb_exflags & DLM_LKF_VALBLK) ||
+                    !r->res_lvbptr)
+                        return;
+                memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+                lkb->lkb_lvbseq = r->res_lvbseq;
+                break;
+        case 0:
+                if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+                        rsb_set_flag(r, RSB_VALNOTVALID);
+                        return;
+                }
+                if (!lkb->lkb_lvbptr ||
+                    !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                        return;
+                /* the rsb's lvb buffer is allocated on first write */
+                if (!r->res_lvbptr) {
+                        r->res_lvbptr = allocate_lvb(r->res_ls);
+                        if (!r->res_lvbptr)
+                                return;
+                }
+                memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+                r->res_lvbseq++;
+                lkb->lkb_lvbseq = r->res_lvbseq;
+                rsb_clear_flag(r, RSB_VALNOTVALID);
+                break;
+        default:
+                break;
+        }
+        if (rsb_flag(r, RSB_VALNOTVALID))
+                lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+/* On unlock of a lock granted at PW or above, write the lkb's lvb back to
+   the rsb, or mark the rsb's lvb invalid if DLM_LKF_IVVALBLK was given.
+   Locks granted below PW leave the rsb's lvb untouched. */
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        if (lkb->lkb_grmode < DLM_LOCK_PW)
+                return;
+        if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+                rsb_set_flag(r, RSB_VALNOTVALID);
+                return;
+        }
+        if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                return;
+        /* the rsb's lvb buffer is allocated on first write */
+        if (!r->res_lvbptr) {
+                r->res_lvbptr = allocate_lvb(r->res_ls);
+                if (!r->res_lvbptr)
+                        return;
+        }
+        memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+        r->res_lvbseq++;
+        rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+/* lkb is process copy (pc)
+   When dlm_lvb_operations says the lvb is returned to the caller for this
+   grmode -> rqmode transition, copy the lvb carried in message ms into the
+   lkb's lvb buffer and record the message's lvb sequence number.  Does
+   nothing if the lkb has no lvb buffer or DLM_LKF_VALBLK is not set.
+   The copy length comes from the message, so it is clamped to the
+   lockspace's lvb length, the size the other lvb copies in this file use
+   for lkb_lvbptr. */
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                            struct dlm_message *ms)
+{
+        int b;
+        if (!lkb->lkb_lvbptr)
+                return;
+        if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                return;
+        b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+        if (b == 1) {
+                int len = receive_extralen(ms);
+                /* never write past the end of the lvb buffer */
+                if (len > r->res_ls->ls_lvblen)
+                        len = r->res_ls->ls_lvblen;
+                memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+                lkb->lkb_lvbseq = ms->m_lvbseq;
+        }
+}
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+   remove_lock -- used for unlock, removes lkb from granted
+   revert_lock -- used for cancel, moves lkb from convert to granted
+   grant_lock  -- used for request and convert, adds lkb to granted or
+                  moves lkb from convert or waiting to granted
+   Each of these is used for master or local copy lkb's.  There is
+   also a _pc() variation used to make the corresponding change on
+   a process copy (pc) lkb. */
+/* Common part of remove_lock()/remove_lock_pc(): take the lkb off its
+   status queue, reset its granted mode to DLM_LOCK_IV and drop one more
+   reference. */
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        del_lkb(r, lkb);
+        lkb->lkb_grmode = DLM_LOCK_IV;
+        /* this unhold undoes the original ref from create_lkb()
+           so this leads to the lkb being freed */
+        unhold_lkb(lkb);
+}
+/* Unlock for a master or local copy lkb: update the rsb's lvb from the lkb
+   via set_lvb_unlock() first, then remove the lock. */
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        set_lvb_unlock(r, lkb);
+        _remove_lock(r, lkb);
+}
+/* Unlock for a process copy lkb: same as remove_lock() but without the
+   lvb update. */
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        _remove_lock(r, lkb);
+}
+/* Cancel a pending request: reset the requested mode, then undo the
+   queueing according to the lkb's current status.  A converting lkb goes
+   back to the granted queue, a waiting lkb is removed entirely, and a
+   granted lkb needs nothing more.  Any other status is logged. */
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        lkb->lkb_rqmode = DLM_LOCK_IV;
+        switch (lkb->lkb_status) {
+        case DLM_LKSTS_CONVERT:
+                move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+                break;
+        case DLM_LKSTS_WAITING:
+                del_lkb(r, lkb);
+                lkb->lkb_grmode = DLM_LOCK_IV;
+                /* this unhold undoes the original ref from create_lkb()
+                   so this leads to the lkb being freed */
+                unhold_lkb(lkb);
+                break;
+        case DLM_LKSTS_GRANTED:
+                break;
+        default:
+                log_print("invalid status for revert %d", lkb->lkb_status);
+        }
+}
+/* Process copy variant of revert_lock(); currently identical to it. */
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        revert_lock(r, lkb);
+}
+/* Common part of grant_lock()/grant_lock_pc(): if the requested mode
+   differs from the granted mode, make it the granted mode and put the lkb
+   on the granted queue (moving it if it is already on a queue, adding it
+   otherwise).  The requested mode is always reset to DLM_LOCK_IV. */
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        int mode_changed = (lkb->lkb_grmode != lkb->lkb_rqmode);
+        if (mode_changed) {
+                lkb->lkb_grmode = lkb->lkb_rqmode;
+                if (!lkb->lkb_status)
+                        add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+                else
+                        move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+        }
+        lkb->lkb_rqmode = DLM_LOCK_IV;
+}
+/* Grant for a master or local copy lkb: perform the lvb operation for the
+   mode transition, grant the lock, and reset lkb_highbast to 0. */
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        set_lvb_lock(r, lkb);
+        _grant_lock(r, lkb);
+        lkb->lkb_highbast = 0;
+}
+/* Grant for a process copy lkb: take the lvb from message ms where the mode
+   transition calls for it, then grant the lock. */
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                          struct dlm_message *ms)
+{
+        set_lvb_lock_pc(r, lkb, ms);
+        _grant_lock(r, lkb);
+}
+/* Used on the grant_pending_locks() path.  Besides granting the lock, an
+   async grant message must be sent to the requesting node when the lkb is
+   a master copy (i.e. belongs to a remote node); otherwise a completion
+   cast is queued locally. */
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        grant_lock(r, lkb);
+        if (!is_master_copy(lkb))
+                queue_cast(r, lkb, 0);
+        else
+                send_grant(r, lkb);
+}
+/* Return 1 if lkb has the same id as the first entry on the status queue
+   at head, 0 otherwise.  The list is not checked for emptiness. */
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+        struct dlm_lkb *front = list_entry(head->next, struct dlm_lkb,
+                                           lkb_statequeue);
+        return lkb->lkb_id == front->lkb_id;
+}
+/* Return 1 if any lkb on the queue, other than lkb itself, fails
+   modes_compat() against lkb; return 0 otherwise. */
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+        struct dlm_lkb *other;
+        list_for_each_entry(other, head, lkb_statequeue) {
+                if (other != lkb && !modes_compat(other, lkb))
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource.  The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ *                PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list.  We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ *
+ * Returns 1 if a conversion deadlock involving lkb is detected, 0 otherwise.
+ * The demotion itself is done by the caller.
+ */
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+        struct dlm_lkb *cur, *head_lkb = NULL, *self = NULL;
+        list_for_each_entry(cur, &rsb->res_convertqueue, lkb_statequeue) {
+                if (!head_lkb)
+                        head_lkb = cur;
+                if (cur == lkb) {
+                        self = lkb;
+                        continue;
+                }
+                /* part 1: the two locks block each other */
+                if (!modes_compat(cur, lkb) && !modes_compat(lkb, cur))
+                        return 1;
+        }
+        /* part 2: lkb is on the convert queue, is not the first entry, and
+           is the thing keeping the first entry from being granted.  Several
+           converting locks may have to be demoted this way before the first
+           one can be granted. */
+        if (self && self != head_lkb &&
+            !modes_compat(lkb, head_lkb) &&
+            !queue_conflict(&rsb->res_grantqueue, head_lkb))
+                return 1;
+        return 0;
+}
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * Side effect: when 0 is returned for a conversion with DLM_LKF_CONVDEADLK
+ * set and a conversion deadlock is detected, lkb's granted mode is demoted
+ * to NL and DLM_SBF_DEMOTED is set in its status flags.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+        /* a lock that already has a granted mode is being converted */
+        int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+        /*
+         * 6-10: Version 5.4 introduced an option to address the phenomenon of
+         * a new request for a NL mode lock being blocked.
+         *
+         * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+         * request, then it would be granted.  In essence, the use of this flag
+         * tells the Lock Manager to expedite this request by not considering
+         * what may be in the CONVERTING or WAITING queues...  As of this
+         * writing, the EXPEDITE flag can be used only with new requests for NL
+         * mode locks.  This flag is not valid for conversion requests.
+         *
+         * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
+         * conversion or used with a non-NL requested mode.  We also know an
+         * EXPEDITE request is always granted immediately, so now must always
+         * be 1.  The full condition to grant an expedite request: (now &&
+         * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+         * therefore be shortened to just checking the flag.
+         */
+        if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+                return 1;
+        /*
+         * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+         * added to the remaining conditions.
+         */
+        if (queue_conflict(&r->res_grantqueue, lkb))
+                goto out;
+        /*
+         * 6-3: By default, a conversion request is immediately granted if the
+         * requested mode is compatible with the modes of all other granted
+         * locks
+         */
+        if (queue_conflict(&r->res_convertqueue, lkb))
+                goto out;
+        /*
+         * 6-5: But the default algorithm for deciding whether to grant or
+         * queue conversion requests does not by itself guarantee that such
+         * requests are serviced on a "first come first serve" basis.  This, in
+         * turn, can lead to a phenomenon known as "indefinite postponement".
+         *
+         * 6-7: This issue is dealt with by using the optional QUECVT flag with
+         * the system service employed to request a lock conversion.  This flag
+         * forces certain conversion requests to be queued, even if they are
+         * compatible with the granted modes of other locks on the same
+         * resource.  Thus, the use of this flag results in conversion requests
+         * being ordered on a "first come first serve" basis.
+         *
+         * DCT: This condition is all about new conversions being able to occur
+         * "in place" while the lock remains on the granted queue (assuming
+         * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
+         * doesn't _have_ to go onto the convert queue where it's processed in
+         * order.  The "now" variable is necessary to distinguish converts
+         * being received and processed for the first time now, because once a
+         * convert is moved to the conversion queue the condition below applies
+         * requiring fifo granting.
+         */
+        if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+                return 1;
+        /*
+         * The NOORDER flag is set to avoid the standard vms rules on grant
+         * order.
+         */
+        if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+                return 1;
+        /*
+         * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+         * granted until all other conversion requests ahead of it are granted
+         * and/or canceled.
+         */
+        if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+                return 1;
+        /*
+         * 6-4: By default, a new request is immediately granted only if all
+         * three of the following conditions are satisfied when the request is
+         * issued:
+         * - The queue of ungranted conversion requests for the resource is
+         *   empty.
+         * - The queue of ungranted new requests for the resource is empty.
+         * - The mode of the new request is compatible with the most
+         *   restrictive mode of all granted locks on the resource.
+         */
+        if (now && !conv && list_empty(&r->res_convertqueue) &&
+            list_empty(&r->res_waitqueue))
+                return 1;
+        /*
+         * 6-4: Once a lock request is in the queue of ungranted new requests,
+         * it cannot be granted until the queue of ungranted conversion
+         * requests is empty, all ungranted new requests ahead of it are
+         * granted and/or canceled, and it is compatible with the granted mode
+         * of the most restrictive lock granted on the resource.
+         */
+        if (!now && !conv && list_empty(&r->res_convertqueue) &&
+            first_in_list(lkb, &r->res_waitqueue))
+                return 1;
+ out:
+        /*
+         * The following, enabled by CONVDEADLK, departs from VMS.
+         */
+        if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+            conversion_deadlock_detect(r, lkb)) {
+                lkb->lkb_grmode = DLM_LOCK_NL;
+                lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+        }
+        return 0;
+}
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ *
+ * Try the requested mode first.  If that cannot be granted and the lkb was
+ * not demoted in the attempt, retry once with the alternate mode selected
+ * by ALTPR/ALTCW.  When the alternate succeeds the lkb keeps the alternate
+ * rqmode and DLM_SBF_ALTMODE is set; otherwise the original rqmode is
+ * restored.  Returns 1 if the lock can be granted, 0 otherwise.
+ */
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+        uint32_t flags = lkb->lkb_exflags;
+        int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+        int rv;
+        rv = _can_be_granted(r, lkb, now);
+        if (rv || (lkb->lkb_sbflags & DLM_SBF_DEMOTED))
+                return rv;
+        if (rqmode != DLM_LOCK_PR && (flags & DLM_LKF_ALTPR))
+                alt = DLM_LOCK_PR;
+        else if (rqmode != DLM_LOCK_CW && (flags & DLM_LKF_ALTCW))
+                alt = DLM_LOCK_CW;
+        if (!alt)
+                return rv;
+        lkb->lkb_rqmode = alt;
+        rv = _can_be_granted(r, lkb, now);
+        if (rv)
+                lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+        else
+                lkb->lkb_rqmode = rqmode;
+        return rv;
+}
+/* Grant every lkb on the convert queue that can now be granted.
+   Returns the larger of 'high' and the highest requested mode among the
+   lkb's that could not be granted on the final pass. */
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+        struct dlm_lkb *lkb, *s;
+        int hi, demoted, quit, grant_restart, demote_restart;
+        quit = 0;
+ restart:
+        grant_restart = 0;
+        demote_restart = 0;
+        hi = DLM_LOCK_IV;
+        list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+                /* remember the state before can_be_granted(), which may
+                   demote the lkb as a side effect */
+                demoted = is_demoted(lkb);
+                if (can_be_granted(r, lkb, 0)) {
+                        grant_lock_pending(r, lkb);
+                        grant_restart = 1;
+                } else {
+                        hi = max_t(int, lkb->lkb_rqmode, hi);
+                        if (!demoted && is_demoted(lkb))
+                                demote_restart = 1;
+                }
+        }
+        /* any grant may unblock other converts: scan again */
+        if (grant_restart)
+                goto restart;
+        /* a demotion may unblock other converts too, but 'quit' limits
+           demotion-triggered rescans to one */
+        if (demote_restart && !quit) {
+                quit = 1;
+                goto restart;
+        }
+        return max_t(int, high, hi);
+}
+/* Grant every lkb on the wait queue that can now be granted.  Returns
+   'high' raised to the highest requested mode among those left waiting. */
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+        struct dlm_lkb *lkb, *next;
+        list_for_each_entry_safe(lkb, next, &r->res_waitqueue,
+                                 lkb_statequeue) {
+                if (!can_be_granted(r, lkb, 0)) {
+                        high = max_t(int, lkb->lkb_rqmode, high);
+                        continue;
+                }
+                grant_lock_pending(r, lkb);
+        }
+        return high;
+}
+/* Grant whatever can be granted from the convert queue and then the wait
+   queue of a resource we are master of, then send blocking ASTs to granted
+   locks that stand in the way of what is still queued. */
+static void grant_pending_locks(struct dlm_rsb *r)
+{
+        struct dlm_lkb *lkb, *next;
+        int high = DLM_LOCK_IV;
+        DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
+        high = grant_pending_convert(r, high);
+        high = grant_pending_wait(r, high);
+        /* nothing left on either queue */
+        if (high == DLM_LOCK_IV)
+                return;
+        /*
+         * If there are locks left on the wait/convert queue then send blocking
+         * ASTs to granted locks based on the largest requested mode (high)
+         * found above. FIXME: highbast < high comparison not valid for PR/CW.
+         */
+        list_for_each_entry_safe(lkb, next, &r->res_grantqueue,
+                                 lkb_statequeue) {
+                if (!lkb->lkb_bastaddr || lkb->lkb_highbast >= high ||
+                    __dlm_compat_matrix[lkb->lkb_grmode+1][high+1])
+                        continue;
+                queue_bast(r, lkb, high);
+                lkb->lkb_highbast = high;
+        }
+}
+/* Queue a blocking AST, carrying lkb's requested mode, on every entry of
+   'head' that has a bast callback, has lkb_highbast below that mode, and
+   is not compatible with lkb according to modes_compat().  lkb_highbast
+   is raised to the mode just sent. */
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+                            struct dlm_lkb *lkb)
+{
+        struct dlm_lkb *holder;
+        list_for_each_entry(holder, head, lkb_statequeue) {
+                if (!holder->lkb_bastaddr)
+                        continue;
+                if (holder->lkb_highbast >= lkb->lkb_rqmode)
+                        continue;
+                if (modes_compat(holder, lkb))
+                        continue;
+                queue_bast(r, holder, lkb->lkb_rqmode);
+                holder->lkb_highbast = lkb->lkb_rqmode;
+        }
+}
+/* Run send_bast_queue() for lkb over the rsb's grant queue only. */
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct list_head *granted = &r->res_grantqueue;
+        send_bast_queue(r, granted, lkb);
+}
+/* Run send_bast_queue() for lkb over the grant queue and then the
+   convert queue of the rsb. */
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct list_head *granted = &r->res_grantqueue;
+        struct list_head *converting = &r->res_convertqueue;
+        send_bast_queue(r, granted, lkb);
+        send_bast_queue(r, converting, lkb);
+}
+/* set_master(r, lkb) -- set the master nodeid of a resource
+   Copies the master nodeid from the rsb into the lkb.  When the rsb
+   already knows its master this is a plain copy and 0 is returned.
+   When it does not (res_nodeid == -1) the master has to be looked up
+   first: locally if we are the directory node for the resource,
+   otherwise by sending a lookup to the directory node.
+   While a remote lookup is outstanding, the lkb that triggered it sits
+   on the ls_waiters list waiting for the lookup reply; any other lkb
+   that wants the same rsb is parked on the rsb's res_lookup list until
+   the master is verified.
+   Return values:
+   0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+   1: the rsb master is not available and the lkb has been placed on
+      a wait queue
+*/
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct dlm_ls *ls = r->res_ls;
+        int our_nodeid = dlm_our_nodeid();
+        int master, rv;
+        /* master was flagged as uncertain: use the recorded nodeid for
+           this request and make this lkb the one that confirms it */
+        if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+                rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+                r->res_first_lkid = lkb->lkb_id;
+                lkb->lkb_nodeid = r->res_nodeid;
+                return 0;
+        }
+        /* some other lkb is already the "first" one; wait behind it */
+        if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+                list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+                return 1;
+        }
+        /* 0 and positive nodeids are both known masters: just copy */
+        if (r->res_nodeid >= 0) {
+                lkb->lkb_nodeid = r->res_nodeid;
+                return 0;
+        }
+        DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
+        if (dlm_dir_nodeid(r) != our_nodeid) {
+                r->res_first_lkid = lkb->lkb_id;
+                send_lookup(r, lkb);
+                return 1;
+        }
+        /* We are the directory node, so look the master up locally and
+           retry until it works.  It's possible for dlm_scand to remove
+           an old rsb for this same resource from the toss list, us to
+           create a new one, look up the master locally, and find it
+           already exists just before dlm_scand does the dir_remove()
+           on the previous rsb. */
+        do {
+                rv = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+                                    r->res_length, &master);
+                if (rv) {
+                        log_debug(ls, "dir_lookup error %d %s", rv, r->res_name);
+                        schedule();
+                }
+        } while (rv);
+        if (master == our_nodeid) {
+                /* we are the master; locally that is stored as nodeid 0 */
+                master = 0;
+                r->res_first_lkid = 0;
+        } else {
+                r->res_first_lkid = lkb->lkb_id;
+        }
+        r->res_nodeid = master;
+        lkb->lkb_nodeid = master;
+        return 0;
+}
+/* Take each lkb off the rsb's res_lookup list and push it through
+   _request_lock() again, calling schedule() after every one.  The safe
+   list iterator is used because entries are unlinked while walking. */
+static void process_lookup_list(struct dlm_rsb *r)
+{
+        struct dlm_lkb *cur, *next;
+        list_for_each_entry_safe(cur, next, &r->res_lookup, lkb_rsb_lookup) {
+                list_del(&cur->lkb_rsb_lookup);
+                _request_lock(r, cur);
+                schedule();
+        }
+}
+/* confirm_master -- confirm (or deny) an rsb's master nodeid
+   'error' is the result of the request made by the rsb's first lkb.
+   Nothing is done unless res_first_lkid is set. */
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+        struct dlm_lkb *next_first;
+        if (!r->res_first_lkid)
+                return;
+        if (error == 0 || error == -EINPROGRESS) {
+                /* master confirmed: release everything that was waiting */
+                r->res_first_lkid = 0;
+                process_lookup_list(r);
+        } else if (error == -EAGAIN) {
+                /* the remote master didn't queue our NOQUEUE request;
+                   make a waiting lkb the first_lkid */
+                r->res_first_lkid = 0;
+                if (list_empty(&r->res_lookup)) {
+                        /* nobody left to retry with: master unknown again */
+                        r->res_nodeid = -1;
+                        return;
+                }
+                next_first = list_entry(r->res_lookup.next, struct dlm_lkb,
+                                        lkb_rsb_lookup);
+                list_del(&next_first->lkb_rsb_lookup);
+                r->res_first_lkid = next_first->lkb_id;
+                _request_lock(r, next_first);
+        } else {
+                log_error(r->res_ls, "confirm_master unknown error %d", error);
+        }
+}
+/* Check the arguments given to dlm_lock() for illegal values and flag
+   combinations and, if they are acceptable, record them in 'args'.
+   Returns 0 on success or -EINVAL; 'args' is written only on success. */
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+                         int namelen, uint32_t parent_lkid, void *ast,
+                         void *astarg, void *bast, struct dlm_args *args)
+{
+        int rv = -EINVAL;
+        /* check for invalid arg usage */
+        if (mode < 0 || mode > DLM_LOCK_EX)
+                goto out;
+        /* dlm_lock() takes namelen as an unsigned int, so an oversized
+           value can arrive here as a negative int and would slip past a
+           plain upper-bound test; reject negative lengths as well */
+        if (!(flags & DLM_LKF_CONVERT) &&
+            (namelen < 0 || namelen > DLM_RESNAME_MAXLEN))
+                goto out;
+        if (flags & DLM_LKF_CANCEL)
+                goto out;
+        /* QUECVT and CONVDEADLK only make sense on a conversion */
+        if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+                goto out;
+        if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+                goto out;
+        if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+                goto out;
+        /* EXPEDITE is only allowed on a new, queueable NL request */
+        if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+                goto out;
+        if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+                goto out;
+        if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+                goto out;
+        if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+                goto out;
+        /* a completion ast and a status block are mandatory */
+        if (!ast || !lksb)
+                goto out;
+        if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+                goto out;
+        /* parent/child locks not yet supported */
+        if (parent_lkid)
+                goto out;
+        /* a conversion must name the existing lock through sb_lkid */
+        if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+                goto out;
+        /* these args will be copied to the lkb in validate_lock_args,
+           it cannot be done now because when converting locks, fields in
+           an active lkb cannot be modified before locking the rsb */
+        args->flags = flags;
+        args->astaddr = ast;
+        args->astparam = (long) astarg;
+        args->bastaddr = bast;
+        args->mode = mode;
+        args->lksb = lksb;
+        rv = 0;
+ out:
+        return rv;
+}
+/* Record the dlm_unlock() arguments in 'args'.  Returns -EINVAL, leaving
+   'args' untouched, if any flag other than CANCEL, VALBLK, IVVALBLK or
+   FORCEUNLOCK is set; otherwise returns 0. */
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+        const uint32_t allowed = DLM_LKF_CANCEL | DLM_LKF_VALBLK |
+                                 DLM_LKF_IVVALBLK | DLM_LKF_FORCEUNLOCK;
+        if ((flags & allowed) != flags)
+                return -EINVAL;
+        args->astparam = (long) astarg;
+        args->flags = flags;
+        return 0;
+}
+/* Check the saved lock args against the current state of the lkb and,
+   if they are acceptable, copy them into the lkb.
+   For a conversion: -EINVAL if the lkb is a master copy or a QUECVT
+   conversion is not permitted by __quecvt_compat_matrix; -EBUSY if the
+   lkb is not currently granted or has a wait type set.
+   Returns 0 once the lkb has been updated.  'ls' is not used here. */
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                              struct dlm_args *args)
+{
+        if (args->flags & DLM_LKF_CONVERT) {
+                if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+                        return -EINVAL;
+                if ((args->flags & DLM_LKF_QUECVT) &&
+                    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+                        return -EINVAL;
+                if (lkb->lkb_status != DLM_LKSTS_GRANTED ||
+                    lkb->lkb_wait_type)
+                        return -EBUSY;
+        }
+        lkb->lkb_exflags = args->flags;
+        lkb->lkb_sbflags = 0;
+        lkb->lkb_rqmode = args->mode;
+        lkb->lkb_astaddr = args->astaddr;
+        lkb->lkb_astparam = args->astparam;
+        lkb->lkb_bastaddr = args->bastaddr;
+        lkb->lkb_lksb = args->lksb;
+        lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+        lkb->lkb_ownpid = (int) current->pid;
+        return 0;
+}
+/* Check that an unlock or cancel is legal for the lkb in its current
+   state and, if so, copy the unlock args into it.
+   -EINVAL: the lkb is a master copy, or a cancel targets a granted lock,
+            or an unlock targets a lock that is not granted
+   -EBUSY:  the lkb has a wait type set
+   FORCEUNLOCK skips every check except the master-copy one. */
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+        if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+                return -EINVAL;
+        if (!(args->flags & DLM_LKF_FORCEUNLOCK)) {
+                int cancel = !!(args->flags & DLM_LKF_CANCEL);
+                int granted = (lkb->lkb_status == DLM_LKSTS_GRANTED);
+                /* cancel wants a non-granted lkb, unlock a granted one */
+                if (cancel ? granted : !granted)
+                        return -EINVAL;
+                if (lkb->lkb_wait_type)
+                        return -EBUSY;
+        }
+        lkb->lkb_exflags = args->flags;
+        lkb->lkb_sbflags = 0;
+        lkb->lkb_astparam = args->astparam;
+        return 0;
+}
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+/* Process a new request on the master.
+   Returns 0 if the lock was granted (completion ast queued),
+   -EINPROGRESS if it was put on the wait queue, or -EAGAIN if it could
+   be neither granted nor queued (completion ast queued with -EAGAIN). */
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        if (can_be_granted(r, lkb, 1)) {
+                grant_lock(r, lkb);
+                queue_cast(r, lkb, 0);
+                return 0;
+        }
+        if (can_be_queued(lkb)) {
+                add_lkb(r, lkb, DLM_LKSTS_WAITING);
+                send_blocking_asts(r, lkb);
+                return -EINPROGRESS;
+        }
+        /* not queueable: optionally tell the blockers, then fail it */
+        if (force_blocking_asts(lkb))
+                send_blocking_asts_all(r, lkb);
+        queue_cast(r, lkb, -EAGAIN);
+        return -EAGAIN;
+}
+/* Process a conversion on the master.
+   Returns 0 if the conversion was granted, -EINPROGRESS if the lkb was
+   moved to the convert queue, or -EAGAIN if it could be neither granted
+   nor queued (completion ast queued with -EAGAIN). */
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        if (can_be_granted(r, lkb, 1)) {
+                grant_lock(r, lkb);
+                queue_cast(r, lkb, 0);
+                /* changing an existing lock may allow others to be
+                   granted */
+                grant_pending_locks(r);
+                return 0;
+        }
+        if (can_be_queued(lkb)) {
+                /* a demoted lkb may have unblocked other requests */
+                if (is_demoted(lkb))
+                        grant_pending_locks(r);
+                del_lkb(r, lkb);
+                add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+                send_blocking_asts(r, lkb);
+                return -EINPROGRESS;
+        }
+        if (force_blocking_asts(lkb))
+                send_blocking_asts_all(r, lkb);
+        queue_cast(r, lkb, -EAGAIN);
+        return -EAGAIN;
+}
+/* Process an unlock on the master: remove the lock, queue a completion
+   ast carrying -DLM_EUNLOCK, then see whether anything queued can now
+   be granted.  Always returns -DLM_EUNLOCK. */
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        const int result = -DLM_EUNLOCK;
+        remove_lock(r, lkb);
+        queue_cast(r, lkb, result);
+        grant_pending_locks(r);
+        return result;
+}
+/* Process a cancel on the master: revert the lock, queue a completion
+   ast carrying -DLM_ECANCEL, then see whether anything queued can now
+   be granted.  Always returns -DLM_ECANCEL.
+   FIXME: if revert_lock() finds that the lkb is granted, we should
+   skip the queue_cast(ECANCEL).  It indicates that the request/convert
+   completed (and queued a normal ast) just before the cancel; we don't
+   want to clobber the sb_result for the normal ast with ECANCEL. */
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        const int result = -DLM_ECANCEL;
+        revert_lock(r, lkb);
+        queue_cast(r, lkb, result);
+        grant_pending_locks(r);
+        return result;
+}
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+/* add a new lkb to a possibly new rsb, called by requesting process.
+   Returns 0 without doing anything further when set_master() has
+   parked the lkb to wait for the master; otherwise returns the result
+   of send_request() (remote master) or do_request() (local master). */
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        /* set_master: sets lkb nodeid from r */
+        int rv = set_master(r, lkb);
+        if (rv < 0)
+                return rv;
+        if (rv > 0)
+                return 0;
+        /* receive_request() calls do_request() on remote node */
+        return is_remote(r) ? send_request(r, lkb) : do_request(r, lkb);
+}
+/* change some property of an existing lkb, e.g. mode; the work is done
+   by do_convert() here when the rsb is local, otherwise a convert
+   message is sent to the master */
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        /* receive_convert() calls do_convert() on remote node */
+        return is_remote(r) ? send_convert(r, lkb) : do_convert(r, lkb);
+}
+/* remove an existing lkb from the granted queue; the work is done by
+   do_unlock() here when the rsb is local, otherwise an unlock message
+   is sent to the master */
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        /* receive_unlock() calls do_unlock() on remote node */
+        return is_remote(r) ? send_unlock(r, lkb) : do_unlock(r, lkb);
+}
+/* remove an existing lkb from the convert or wait queue; the work is
+   done by do_cancel() here when the rsb is local, otherwise a cancel
+   message is sent to the master */
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        /* receive_cancel() calls do_cancel() on remote node */
+        return is_remote(r) ? send_cancel(r, lkb) : do_cancel(r, lkb);
+}
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+/* Stage 2 of a new request: validate the args into the lkb, find (or
+   create) the rsb for 'name', attach the lkb to it and run
+   _request_lock() with the rsb locked.  The new lock id is stored in the
+   caller's lksb before the request is issued. */
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+                        int len, struct dlm_args *args)
+{
+        struct dlm_rsb *r;
+        int error = validate_lock_args(ls, lkb, args);
+        if (error)
+                return error;
+        error = find_rsb(ls, name, len, R_CREATE, &r);
+        if (error)
+                return error;
+        lock_rsb(r);
+        attach_lkb(r, lkb);
+        lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+        error = _request_lock(r, lkb);
+        unlock_rsb(r);
+        put_rsb(r);
+        return error;
+}
+/* Stage 2 of a conversion: with the lkb's rsb held and locked, validate
+   the args against the lkb and, if they pass, run _convert_lock(). */
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                        struct dlm_args *args)
+{
+        struct dlm_rsb *r = lkb->lkb_resource;
+        int error;
+        hold_rsb(r);
+        lock_rsb(r);
+        error = validate_lock_args(ls, lkb, args);
+        if (!error)
+                error = _convert_lock(r, lkb);
+        unlock_rsb(r);
+        put_rsb(r);
+        return error;
+}
+/* Stage 2 of an unlock: with the lkb's rsb held and locked, validate
+   the unlock args against the lkb and, if they pass, run
+   _unlock_lock().  'ls' is not used here. */
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                       struct dlm_args *args)
+{
+        struct dlm_rsb *r = lkb->lkb_resource;
+        int error;
+        hold_rsb(r);
+        lock_rsb(r);
+        error = validate_unlock_args(lkb, args);
+        if (!error)
+                error = _unlock_lock(r, lkb);
+        unlock_rsb(r);
+        put_rsb(r);
+        return error;
+}
+/* Stage 2 of a cancel: with the lkb's rsb held and locked, validate
+   the unlock args against the lkb and, if they pass, run
+   _cancel_lock().  'ls' is not used here. */
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                       struct dlm_args *args)
+{
+        struct dlm_rsb *r = lkb->lkb_resource;
+        int error;
+        hold_rsb(r);
+        lock_rsb(r);
+        error = validate_unlock_args(lkb, args);
+        if (!error)
+                error = _cancel_lock(r, lkb);
+        unlock_rsb(r);
+        put_rsb(r);
+        return error;
+}
+/*
+ * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
+ */
+/* dlm_lock -- request a new lock, or convert an existing one.
+   With DLM_LKF_CONVERT set in 'flags' the existing lock is looked up by
+   lksb->sb_lkid; otherwise a new lkb is created for the resource named
+   by name/namelen.  The arguments are checked by set_lock_args() before
+   anything is done with the lock.
+   Returns 0 when the operation was accepted; -EINPROGRESS and -EAGAIN
+   from the lower stages are both reported to the caller as 0.  Returns
+   -EINVAL if the lockspace cannot be found or the arguments are
+   rejected, or another negative error from the lower stages. */
+int dlm_lock(dlm_lockspace_t *lockspace,
+             int mode,
+             struct dlm_lksb *lksb,
+             uint32_t flags,
+             void *name,
+             unsigned int namelen,
+             uint32_t parent_lkid,
+             void (*ast) (void *astarg),
+             void *astarg,
+             void (*bast) (void *astarg, int mode))
+{
+        struct dlm_ls *ls;
+        struct dlm_lkb *lkb;
+        struct dlm_args args;
+        int error, convert = flags & DLM_LKF_CONVERT;
+        ls = dlm_find_lockspace_local(lockspace);
+        if (!ls)
+                return -EINVAL;
+        lock_recovery(ls);
+        if (convert) {
+                /* the lock id is read from the lksb before
+                   set_lock_args() gets a chance to reject a NULL lksb,
+                   so it has to be checked here; the result is the same
+                   -EINVAL set_lock_args() would have produced */
+                if (!lksb) {
+                        error = -EINVAL;
+                        goto out;
+                }
+                error = find_lkb(ls, lksb->sb_lkid, &lkb);
+        } else {
+                error = create_lkb(ls, &lkb);
+        }
+        if (error)
+                goto out;
+        error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+                              astarg, bast, &args);
+        if (error)
+                goto out_put;
+        if (convert)
+                error = convert_lock(ls, lkb, &args);
+        else
+                error = request_lock(ls, lkb, name, namelen, &args);
+        if (error == -EINPROGRESS)
+                error = 0;
+ out_put:
+        /* drop the reference from find_lkb(), or the new lkb on failure */
+        if (convert || error)
+                __put_lkb(ls, lkb);
+        if (error == -EAGAIN)
+                error = 0;
+ out:
+        unlock_recovery(ls);
+        dlm_put_lockspace(ls);
+        return error;
+}
+/* dlm_unlock -- unlock the lock identified by 'lkid', or cancel its
+   outstanding request/conversion when DLM_LKF_CANCEL is set in 'flags'.
+   Returns 0 on success (-DLM_EUNLOCK and -DLM_ECANCEL from the lower
+   stages are reported as 0), -EINVAL if the lockspace cannot be found
+   or the flags are rejected, or another negative error.  The 'lksb'
+   argument is not referenced by this function. */
+int dlm_unlock(dlm_lockspace_t *lockspace,
+               uint32_t lkid,
+               uint32_t flags,
+               struct dlm_lksb *lksb,
+               void *astarg)
+{
+        struct dlm_args args;
+        struct dlm_lkb *lkb;
+        struct dlm_ls *ls = dlm_find_lockspace_local(lockspace);
+        int error;
+        if (!ls)
+                return -EINVAL;
+        lock_recovery(ls);
+        error = find_lkb(ls, lkid, &lkb);
+        if (error)
+                goto out;
+        error = set_unlock_args(flags, astarg, &args);
+        if (!error) {
+                error = (flags & DLM_LKF_CANCEL) ?
+                        cancel_lock(ls, lkb, &args) :
+                        unlock_lock(ls, lkb, &args);
+                if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+                        error = 0;
+        }
+        /* drop the reference taken by find_lkb() */
+        dlm_put_lkb(lkb);
+ out:
+        unlock_recovery(ls);
+        dlm_put_lockspace(ls);
+        return error;
+}
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request                 receive_request
+ * send_convert                 receive_convert
+ * send_unlock                  receive_unlock
+ * send_cancel                  receive_cancel
+ * send_grant                   receive_grant
+ * send_bast                    receive_bast
+ * send_lookup                  receive_lookup
+ * send_remove                  receive_remove
+ *
+ *                              send_common_reply
+ * receive_request_reply        send_request_reply
+ * receive_convert_reply        send_convert_reply
+ * receive_unlock_reply         send_unlock_reply
+ * receive_cancel_reply         send_cancel_reply
+ * receive_lookup_reply         send_lookup_reply
+ */
+/* Get a lowcomms buffer for a message of type 'mstype' addressed to
+   'to_nodeid', zero it and fill in the header.  The buffer is sized as
+   struct dlm_message plus, depending on the type, room for the resource
+   name or for an LVB (the latter only if 'lkb' is non-NULL and has an
+   lvbptr).  On success the message and its handle are returned through
+   ms_ret/mh_ret and 0 is returned; -ENOBUFS if no buffer is available. */
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                          int to_nodeid, int mstype,
+                          struct dlm_message **ms_ret,
+                          struct dlm_mhandle **mh_ret)
+{
+        struct dlm_mhandle *handle;
+        struct dlm_message *msg;
+        char *buf;
+        int extra = 0;
+        int mb_len;
+        switch (mstype) {
+        /* these carry the resource name */
+        case DLM_MSG_REQUEST:
+        case DLM_MSG_LOOKUP:
+        case DLM_MSG_REMOVE:
+                extra = r->res_length;
+                break;
+        /* these may carry a lock value block */
+        case DLM_MSG_CONVERT:
+        case DLM_MSG_UNLOCK:
+        case DLM_MSG_REQUEST_REPLY:
+        case DLM_MSG_CONVERT_REPLY:
+        case DLM_MSG_GRANT:
+                if (lkb && lkb->lkb_lvbptr)
+                        extra = r->res_ls->ls_lvblen;
+                break;
+        }
+        mb_len = (int) sizeof(struct dlm_message) + extra;
+        /* get_buffer gives us a message handle that we need to pass
+           into lowcomms_commit and a message buffer that we write our
+           data into */
+        handle = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &buf);
+        if (!handle)
+                return -ENOBUFS;
+        memset(buf, 0, mb_len);
+        msg = (struct dlm_message *) buf;
+        msg->m_type = mstype;
+        msg->m_header.h_cmd = DLM_MSG;
+        msg->m_header.h_length = mb_len;
+        msg->m_header.h_nodeid = dlm_our_nodeid();
+        msg->m_header.h_lockspace = r->res_ls->ls_global_id;
+        msg->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+        *ms_ret = msg;
+        *mh_ret = handle;
+        return 0;
+}
+/* Hand a finished message to lowcomms: dlm_message_out() is run on it
+   first, then the buffer behind 'mh' is committed.  Always returns 0 for
+   now; further lowcomms enhancements or alternate implementations may
+   make the return value from this function useful at some point */
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+        dlm_message_out(ms);
+        dlm_lowcomms_commit_buffer(mh);
+        return 0;
+}
+/* Copy the lkb state that travels with a message into 'ms', followed by
+   the variable-length payload in m_extra: the resource name for
+   REQUEST/LOOKUP, or the lock value block for the message types that
+   create_message() reserves LVB space for.  No payload is written for
+   any other type, because the buffer was not sized to hold one. */
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                      struct dlm_message *ms)
+{
+        ms->m_nodeid   = lkb->lkb_nodeid;
+        ms->m_pid      = lkb->lkb_ownpid;
+        ms->m_lkid     = lkb->lkb_id;
+        ms->m_remid    = lkb->lkb_remid;
+        ms->m_exflags  = lkb->lkb_exflags;
+        ms->m_sbflags  = lkb->lkb_sbflags;
+        ms->m_flags    = lkb->lkb_flags;
+        ms->m_lvbseq   = lkb->lkb_lvbseq;
+        ms->m_status   = lkb->lkb_status;
+        ms->m_grmode   = lkb->lkb_grmode;
+        ms->m_rqmode   = lkb->lkb_rqmode;
+        ms->m_hash     = r->res_hash;
+        /* m_result and m_bastmode are set from function args,
+           not from lkb fields */
+        if (lkb->lkb_bastaddr)
+                ms->m_asts |= AST_BAST;
+        if (lkb->lkb_astaddr)
+                ms->m_asts |= AST_COMP;
+        /* this must stay in step with the sizing switch in
+           create_message(); copying an LVB for a type that did not get
+           LVB space (e.g. BAST, CANCEL, UNLOCK_REPLY, CANCEL_REPLY)
+           would write past the end of the message buffer */
+        switch (ms->m_type) {
+        case DLM_MSG_REQUEST:
+        case DLM_MSG_LOOKUP:
+                memcpy(ms->m_extra, r->res_name, r->res_length);
+                break;
+        case DLM_MSG_CONVERT:
+        case DLM_MSG_UNLOCK:
+        case DLM_MSG_REQUEST_REPLY:
+        case DLM_MSG_CONVERT_REPLY:
+        case DLM_MSG_GRANT:
+                if (!lkb->lkb_lvbptr)
+                        break;
+                memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+                break;
+        }
+}
+/* Send a message of type 'mstype' about lkb to the rsb's master node.
+   The lkb is put on the waiters list first and taken off again if the
+   message cannot be created or sent.  Returns 0 or a negative error. */
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        add_to_waiters(lkb, mstype);
+        error = create_message(r, lkb, r->res_nodeid, mstype, &ms, &mh);
+        if (!error) {
+                send_args(r, lkb, ms);
+                error = send_message(mh, ms);
+        }
+        if (error)
+                remove_from_waiters(lkb);
+        return error;
+}
+/* Send a DLM_MSG_REQUEST for lkb via send_common(). */
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        int error = send_common(r, lkb, DLM_MSG_REQUEST);
+        return error;
+}
+/* Send a DLM_MSG_CONVERT for lkb via send_common().  For a down
+   conversion no reply is expected, so the lkb is taken straight off the
+   waiters list and a successful reply is processed locally using the
+   lockspace's stub message. */
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct dlm_message *stub;
+        int error = send_common(r, lkb, DLM_MSG_CONVERT);
+        if (error || !down_conversion(lkb))
+                return error;
+        /* down conversions go without a reply from the master */
+        remove_from_waiters(lkb);
+        stub = &r->res_ls->ls_stub_ms;
+        stub->m_result = 0;
+        stub->m_flags = lkb->lkb_flags;
+        __receive_convert_reply(r, lkb, stub);
+        return error;
+}
+/* Send a DLM_MSG_UNLOCK for lkb via send_common().
+   FIXME: if this lkb is the only lock we hold on the rsb, then set
+   MASTER_UNCERTAIN to force the next request on the rsb to confirm
+   that the master is still correct. */
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        int error = send_common(r, lkb, DLM_MSG_UNLOCK);
+        return error;
+}
+/* Send a DLM_MSG_CANCEL for lkb via send_common(). */
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        int error = send_common(r, lkb, DLM_MSG_CANCEL);
+        return error;
+}
+/* Send a DLM_MSG_GRANT with result 0 to the node recorded in
+   lkb->lkb_nodeid.  Returns 0 or the error from creating/sending the
+   message. */
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        error = create_message(r, lkb, lkb->lkb_nodeid, DLM_MSG_GRANT,
+                               &ms, &mh);
+        if (error)
+                return error;
+        send_args(r, lkb, ms);
+        ms->m_result = 0;
+        return send_message(mh, ms);
+}
+/* Send a DLM_MSG_BAST carrying blocking mode 'mode' to the node recorded
+   in lkb->lkb_nodeid.  The message is created with a NULL lkb, so no
+   LVB space is reserved in it.  Returns 0 or the error from
+   creating/sending the message. */
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        error = create_message(r, NULL, lkb->lkb_nodeid, DLM_MSG_BAST,
+                               &ms, &mh);
+        if (error)
+                return error;
+        send_args(r, lkb, ms);
+        ms->m_bastmode = mode;
+        return send_message(mh, ms);
+}
+/* Send a DLM_MSG_LOOKUP for the rsb to its directory node.  The lkb is
+   put on the waiters list first and taken off again if the message
+   cannot be created or sent.  Returns 0 or a negative error. */
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        add_to_waiters(lkb, DLM_MSG_LOOKUP);
+        error = create_message(r, NULL, dlm_dir_nodeid(r), DLM_MSG_LOOKUP,
+                               &ms, &mh);
+        if (!error) {
+                send_args(r, lkb, ms);
+                error = send_message(mh, ms);
+        }
+        if (error)
+                remove_from_waiters(lkb);
+        return error;
+}
+/* Send a DLM_MSG_REMOVE carrying the rsb's name and hash to its
+   directory node.  Returns 0 or the error from creating/sending the
+   message. */
+static int send_remove(struct dlm_rsb *r)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        error = create_message(r, NULL, dlm_dir_nodeid(r), DLM_MSG_REMOVE,
+                               &ms, &mh);
+        if (error)
+                return error;
+        memcpy(ms->m_extra, r->res_name, r->res_length);
+        ms->m_hash = r->res_hash;
+        return send_message(mh, ms);
+}
+/* Send a reply of type 'mstype' with result 'rv' to the node recorded in
+   lkb->lkb_nodeid.  Returns 0 or the error from creating/sending the
+   message. */
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                             int mstype, int rv)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int error;
+        error = create_message(r, lkb, lkb->lkb_nodeid, mstype, &ms, &mh);
+        if (error)
+                return error;
+        send_args(r, lkb, ms);
+        ms->m_result = rv;
+        return send_message(mh, ms);
+}
+/* Reply to a request with result 'rv' (DLM_MSG_REQUEST_REPLY). */
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+        int error = send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+        return error;
+}
+/* Reply to a conversion with result 'rv' (DLM_MSG_CONVERT_REPLY). */
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+        int error = send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+        return error;
+}
+/* Reply to an unlock with result 'rv' (DLM_MSG_UNLOCK_REPLY). */
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+        int error = send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+        return error;
+}
+/* Reply to a cancel with result 'rv' (DLM_MSG_CANCEL_REPLY). */
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+        int error = send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+        return error;
+}
+/* Answer a lookup message: send a DLM_MSG_LOOKUP_REPLY back to the node
+   that sent 'ms_in', echoing its m_lkid and carrying the result 'rv' and
+   the looked-up nodeid 'ret_nodeid'.  The lockspace's stub rsb is used
+   to build the message. */
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+                             int ret_nodeid, int rv)
+{
+        struct dlm_rsb *stub = &ls->ls_stub_rsb;
+        struct dlm_mhandle *mh;
+        struct dlm_message *ms;
+        int to_nodeid = ms_in->m_header.h_nodeid;
+        int error;
+        error = create_message(stub, NULL, to_nodeid, DLM_MSG_LOOKUP_REPLY,
+                               &ms, &mh);
+        if (error)
+                return error;
+        ms->m_lkid = ms_in->m_lkid;
+        ms->m_result = rv;
+        ms->m_nodeid = ret_nodeid;
+        return send_message(mh, ms);
+}
+/* which args we save from a received message depends heavily on the type
+   of message, unlike the send side where we can safely send everything about
+   the lkb for any type of message.
+   This one takes the exflags from the message and replaces the low 16
+   bits of lkb_flags with the low 16 bits of m_flags, leaving the high
+   16 bits of lkb_flags as they were. */
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        uint32_t kept = lkb->lkb_flags & 0xFFFF0000;
+        uint32_t from_msg = ms->m_flags & 0x0000FFFF;
+        lkb->lkb_exflags = ms->m_exflags;
+        lkb->lkb_flags = kept | from_msg;
+}
+/* Reply-side counterpart of receive_flags(): takes the sbflags from the
+   message and replaces the low 16 bits of lkb_flags with the low 16 bits
+   of m_flags, leaving the high 16 bits of lkb_flags as they were. */
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        uint32_t kept = lkb->lkb_flags & 0xFFFF0000;
+        uint32_t from_msg = ms->m_flags & 0x0000FFFF;
+        lkb->lkb_sbflags = ms->m_sbflags;
+        lkb->lkb_flags = kept | from_msg;
+}
+/* Number of bytes in the message beyond the fixed struct dlm_message,
+   i.e. the length of m_extra, computed from the header length. */
+static int receive_extralen(struct dlm_message *ms)
+{
+        int extra = ms->m_header.h_length - sizeof(struct dlm_message);
+        return extra;
+}
+/* If the lkb uses a lock value block (DLM_LKF_VALBLK in its exflags),
+   copy the LVB carried in the message's m_extra into the lkb, allocating
+   the lkb's LVB buffer first if it has none.
+   Returns 0, or -ENOMEM if the buffer cannot be allocated. */
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                       struct dlm_message *ms)
+{
+        int len;
+        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+                if (!lkb->lkb_lvbptr)
+                        lkb->lkb_lvbptr = allocate_lvb(ls);
+                if (!lkb->lkb_lvbptr)
+                        return -ENOMEM;
+                /* The length comes from the header of a message received
+                   from another node, so it must not be trusted as the
+                   copy size: never copy more than ls_lvblen (the LVB
+                   size the send side uses) and never a negative amount,
+                   otherwise a bad h_length overruns the LVB buffer.
+                   NOTE(review): assumes allocate_lvb() returns at least
+                   ls_lvblen bytes -- confirm against its definition. */
+                len = receive_extralen(ms);
+                if (len > ls->ls_lvblen)
+                        len = ls->ls_lvblen;
+                if (len < 0)
+                        len = 0;
+                memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+        }
+        return 0;
+}
+/* Initialise a freshly created master-copy lkb from a received request
+   message.  The ast/bast "addresses" stored here are just the AST_COMP /
+   AST_BAST bits from the message, i.e. non-NULL means the requester has
+   that callback.  Returns 0, or -ENOMEM if the LVB cannot be received. */
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                struct dlm_message *ms)
+{
+        long has_bast = ms->m_asts & AST_BAST;
+        long has_comp = ms->m_asts & AST_COMP;
+        lkb->lkb_nodeid = ms->m_header.h_nodeid;
+        lkb->lkb_ownpid = ms->m_pid;
+        lkb->lkb_remid = ms->m_lkid;
+        lkb->lkb_grmode = DLM_LOCK_IV;
+        lkb->lkb_rqmode = ms->m_rqmode;
+        lkb->lkb_bastaddr = (void *) has_bast;
+        lkb->lkb_astaddr = (void *) has_comp;
+        DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+        return receive_lvb(ls, lkb, ms) ? -ENOMEM : 0;
+}
+/* Validate a received conversion against the master-copy lkb and take
+   the new requested mode and lvbseq from the message.
+   -EINVAL: the message is from a node other than the lkb's owner, or
+            the lkb is not a master copy
+   -EBUSY:  the lkb is not currently granted
+   -ENOMEM: the LVB cannot be received */
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                struct dlm_message *ms)
+{
+        int sender = ms->m_header.h_nodeid;
+        if (lkb->lkb_nodeid != sender) {
+                log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+                          lkb->lkb_nodeid, ms->m_header.h_nodeid,
+                          lkb->lkb_id, lkb->lkb_remid);
+                return -EINVAL;
+        }
+        if (!is_master_copy(lkb))
+                return -EINVAL;
+        if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+                return -EBUSY;
+        if (receive_lvb(ls, lkb, ms) != 0)
+                return -ENOMEM;
+        lkb->lkb_lvbseq = ms->m_lvbseq;
+        lkb->lkb_rqmode = ms->m_rqmode;
+        return 0;
+}
+/* Validate a received unlock against the lkb and receive its LVB.
+   Returns -EINVAL if the lkb is not a master copy, -ENOMEM if the LVB
+   cannot be received, 0 otherwise. */
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                               struct dlm_message *ms)
+{
+        if (!is_master_copy(lkb))
+                return -EINVAL;
+        return receive_lvb(ls, lkb, ms) ? -ENOMEM : 0;
+}
+/* Used when a reply has to be sent but there is no real lkb to send it
+   from.  We fill in the stub-lkb fields with the info that
+   send_xxxx_reply() uses to send a reply and that the remote end uses to
+   process the reply: the sender's nodeid and its lock id. */
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        struct dlm_lkb *stub = &ls->ls_stub_lkb;
+        stub->lkb_remid = ms->m_lkid;
+        stub->lkb_nodeid = ms->m_header.h_nodeid;
+}
+/* Handle a request message from another node: create a master-copy lkb
+   for it, find the rsb named in m_extra, run do_request() and send the
+   result back.  If the lkb or rsb cannot be set up, the error is sent
+   back using the lockspace's stub rsb/lkb instead. */
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        struct dlm_rsb *r;
+        struct dlm_lkb *lkb;
+        int error;
+        error = create_lkb(ls, &lkb);
+        if (error)
+                goto fail;
+        receive_flags(lkb, ms);
+        lkb->lkb_flags |= DLM_IFL_MSTCPY;
+        error = receive_request_args(ls, lkb, ms);
+        if (error)
+                goto fail_put;
+        error = find_rsb(ls, ms->m_extra, receive_extralen(ms), R_MASTER, &r);
+        if (error)
+                goto fail_put;
+        lock_rsb(r);
+        attach_lkb(r, lkb);
+        error = do_request(r, lkb);
+        send_request_reply(r, lkb, error);
+        unlock_rsb(r);
+        put_rsb(r);
+        /* the lkb stays around only if it was granted or queued */
+        if (error && error != -EINPROGRESS)
+                dlm_put_lkb(lkb);
+        return;
+ fail_put:
+        __put_lkb(ls, lkb);
+ fail:
+        setup_stub_lkb(ls, ms);
+        send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a convert message for the lkb identified by m_remid.  The
+           message's flags and convert args are applied under the rsb lock
+           before do_convert() runs.  An lkb that cannot be found is answered
+           through the stub rsb/lkb. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error, reply = 1;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error)
+                goto fail;
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        receive_flags(lkb, ms);
+        error = receive_convert_args(ls, lkb, ms);
+        if (error)
+                goto out;               /* reply is still 1: error is sent */
+        reply = !down_conversion(lkb);  /* no reply when down_conversion() */
+        error = do_convert(r, lkb);
+ out:
+        if (reply)
+                send_convert_reply(r, lkb, error);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+        return;
+ fail:
+        setup_stub_lkb(ls, ms);
+        send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle an unlock message for the lkb identified by m_remid.  Under
+           the rsb lock the message's flags and unlock args are applied and
+           do_unlock() is run; the result (or the args error) is always sent
+           back with send_unlock_reply().  An lkb that cannot be found is
+           answered through the stub rsb/lkb. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error)
+                goto fail;
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        receive_flags(lkb, ms);
+        error = receive_unlock_args(ls, lkb, ms);
+        if (error)
+                goto out;
+        error = do_unlock(r, lkb);
+ out:
+        send_unlock_reply(r, lkb, error);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+        return;
+ fail:
+        setup_stub_lkb(ls, ms);
+        send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a cancel message for the lkb identified by m_remid: apply
+           the message's flags, run do_cancel() under the rsb lock and send
+           its result with send_cancel_reply().  An lkb that cannot be found
+           is answered through the stub rsb/lkb. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error)
+                goto fail;
+        receive_flags(lkb, ms);         /* done before the rsb lock is taken */
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        error = do_cancel(r, lkb);
+        send_cancel_reply(r, lkb, error);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+        return;
+ fail:
+        setup_stub_lkb(ls, ms);
+        send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a grant message for the process-copy lkb identified by
+           m_remid: under the rsb lock, take the reply flags from the message,
+           run grant_lock_pc() and queue a cast with status 0.  A message for
+           an unknown lkb is logged and dropped (no reply is sent). */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error) {
+                log_error(ls, "receive_grant no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        receive_flags_reply(lkb, ms);
+        grant_lock_pc(r, lkb, ms);
+        queue_cast(r, lkb, 0);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+}
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a bast message for the process-copy lkb identified by
+           m_remid: queue_bast() is called with m_bastmode under the rsb lock.
+           A message for an unknown lkb is logged and dropped. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error) {
+                log_error(ls, "receive_bast no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        queue_bast(r, lkb, ms->m_bastmode);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+}
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a lookup message.  If dlm_hash2nodeid() does not map m_hash
+           to this node, reply with -EINVAL and nodeid -1.  Otherwise look the
+           name in m_extra up with dlm_dir_lookup(); if that succeeds and names
+           this node, the message is handed to receive_request() and no lookup
+           reply is sent from here.  In all other cases the result goes back
+           in a lookup reply. */
+        int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
+        from_nodeid = ms->m_header.h_nodeid;
+        our_nodeid = dlm_our_nodeid();
+        len = receive_extralen(ms);
+        dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+        if (dir_nodeid != our_nodeid) {
+                log_error(ls, "lookup dir_nodeid %d from %d",
+                          dir_nodeid, from_nodeid);
+                error = -EINVAL;
+                ret_nodeid = -1;
+                goto out;
+        }
+        error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
+        /* Optimization: we're master so treat lookup as a request */
+        if (!error && ret_nodeid == our_nodeid) {
+                receive_request(ls, ms);
+                return;
+        }
+ out:
+        send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+/* Handle a remove message: if dlm_hash2nodeid() maps m_hash to this node,
+   remove the directory entry for the name in m_extra on behalf of the
+   sender; otherwise log the mismatch and do nothing.  No reply is sent. */
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        int from_nodeid = ms->m_header.h_nodeid;
+        int len = receive_extralen(ms);
+        int dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+        if (dir_nodeid == dlm_our_nodeid()) {
+                dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
+                return;
+        }
+        log_error(ls, "remove dir entry dir_nodeid %d from %d",
+                  dir_nodeid, from_nodeid);
+}
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle the reply to a request for the process-copy lkb identified
+           by m_remid.  The lkb is taken off the waiters list and m_result
+           then selects the action in the switch below.  A reply for an
+           unknown lkb, or for one not on waiters, is logged and dropped. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error, mstype;
+        error = find_lkb(ls, ms->m_remid, &lkb);
+        if (error) {
+                log_error(ls, "receive_request_reply no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        mstype = lkb->lkb_wait_type;    /* read before remove_from_waiters() */
+        error = remove_from_waiters(lkb);
+        if (error) {
+                log_error(ls, "receive_request_reply not on waiters");
+                goto out;
+        }
+        /* this is the value returned from do_request() on the master */
+        error = ms->m_result;
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        /* Optimization: the dir node was also the master, so it took our
+           lookup as a request and sent request reply instead of lookup reply */
+        if (mstype == DLM_MSG_LOOKUP) {
+                r->res_nodeid = ms->m_header.h_nodeid;
+                lkb->lkb_nodeid = r->res_nodeid;
+        }
+        switch (error) {
+        case -EAGAIN:
+                /* request would block (be queued) on remote master;
+                   the unhold undoes the original ref from create_lkb()
+                   so it leads to the lkb being freed */
+                queue_cast(r, lkb, -EAGAIN);
+                confirm_master(r, -EAGAIN);
+                unhold_lkb(lkb);
+                break;
+        case -EINPROGRESS:
+        case 0:
+                /* request was queued or granted on remote master */
+                receive_flags_reply(lkb, ms);
+                lkb->lkb_remid = ms->m_lkid;
+                if (error)              /* -EINPROGRESS: queued, not granted */
+                        add_lkb(r, lkb, DLM_LKSTS_WAITING);
+                else {
+                        grant_lock_pc(r, lkb, ms);
+                        queue_cast(r, lkb, 0);
+                }
+                confirm_master(r, error);
+                break;
+        case -EBADR:
+        case -ENOTBLK:
+                /* find_rsb failed to find rsb or rsb wasn't master */
+                r->res_nodeid = -1;
+                lkb->lkb_nodeid = -1;
+                _request_lock(r, lkb);  /* start over with the master unset */
+                break;
+        default:
+                log_error(ls, "receive_request_reply error %d", error);
+        }
+        unlock_rsb(r);
+        put_rsb(r);
+ out:
+        dlm_put_lkb(lkb);
+}
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                                    struct dlm_message *ms)
+{       /* Apply the m_result of a convert reply to the process-copy lkb.
+           Does no locking of its own; the visible caller,
+           _receive_convert_reply(), holds and locks r around the call. */
+        int error = ms->m_result;
+        /* this is the value returned from do_convert() on the master */
+        switch (error) {
+        case -EAGAIN:
+                /* convert would block (be queued) on remote master */
+                queue_cast(r, lkb, -EAGAIN);
+                break;
+        case -EINPROGRESS:
+                /* convert was queued on remote master */
+                del_lkb(r, lkb);
+                add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+                break;
+        case 0:
+                /* convert was granted on remote master */
+                receive_flags_reply(lkb, ms);
+                grant_lock_pc(r, lkb, ms);
+                queue_cast(r, lkb, 0);
+                break;
+        default:
+                log_error(r->res_ls, "receive_convert_reply error %d", error);
+        }
+}
+/* Run __receive_convert_reply() for lkb with its rsb held and locked.  Also
+   used by recovery with a stub message to fake a reply. */
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        struct dlm_rsb *rsb;
+        rsb = lkb->lkb_resource;
+        hold_rsb(rsb);
+        lock_rsb(rsb);
+        __receive_convert_reply(rsb, lkb, ms);
+        unlock_rsb(rsb);
+        put_rsb(rsb);
+}
+/* Handle a convert reply for the process-copy lkb identified by m_remid.
+   The lkb must be on the waiters list; if it is, the reply is processed,
+   otherwise the condition is only logged.  The reference taken by find_lkb()
+   is dropped in both cases. */
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        struct dlm_lkb *lkb;
+        if (find_lkb(ls, ms->m_remid, &lkb)) {
+                log_error(ls, "receive_convert_reply no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        if (remove_from_waiters(lkb)) {
+                log_error(ls, "receive_convert_reply not on waiters");
+        } else {
+                _receive_convert_reply(lkb, ms);
+        }
+        dlm_put_lkb(lkb);
+}
+/* Apply the result of an unlock reply to the process-copy lkb with its rsb
+   held and locked.  m_result is the value returned from do_unlock() on the
+   master; only -DLM_EUNLOCK is acted on, anything else is logged. */
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        struct dlm_rsb *rsb = lkb->lkb_resource;
+        int result = ms->m_result;
+        hold_rsb(rsb);
+        lock_rsb(rsb);
+        if (result == -DLM_EUNLOCK) {
+                receive_flags_reply(lkb, ms);
+                remove_lock_pc(rsb, lkb);
+                queue_cast(rsb, lkb, -DLM_EUNLOCK);
+        } else {
+                log_error(rsb->res_ls, "receive_unlock_reply error %d", result);
+        }
+        unlock_rsb(rsb);
+        put_rsb(rsb);
+}
+/* Handle an unlock reply for the process-copy lkb identified by m_remid.
+   The lkb must be on the waiters list; if it is, the reply is processed,
+   otherwise the condition is only logged.  The reference taken by find_lkb()
+   is dropped in both cases. */
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        struct dlm_lkb *lkb;
+        if (find_lkb(ls, ms->m_remid, &lkb)) {
+                log_error(ls, "receive_unlock_reply no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        if (remove_from_waiters(lkb)) {
+                log_error(ls, "receive_unlock_reply not on waiters");
+        } else {
+                _receive_unlock_reply(lkb, ms);
+        }
+        dlm_put_lkb(lkb);
+}
+/* Apply the result of a cancel reply to the process-copy lkb with its rsb
+   held and locked.  m_result is the value returned from do_cancel() on the
+   master; only -DLM_ECANCEL is acted on, anything else is logged. */
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+        struct dlm_rsb *rsb = lkb->lkb_resource;
+        int result = ms->m_result;
+        hold_rsb(rsb);
+        lock_rsb(rsb);
+        if (result == -DLM_ECANCEL) {
+                receive_flags_reply(lkb, ms);
+                revert_lock_pc(rsb, lkb);
+                queue_cast(rsb, lkb, -DLM_ECANCEL);
+        } else {
+                log_error(rsb->res_ls, "receive_cancel_reply error %d", result);
+        }
+        unlock_rsb(rsb);
+        put_rsb(rsb);
+}
+/* Handle a cancel reply for the process-copy lkb identified by m_remid.
+   The lkb must be on the waiters list; if it is, the reply is processed,
+   otherwise the condition is only logged.  The reference taken by find_lkb()
+   is dropped in both cases. */
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        struct dlm_lkb *lkb;
+        if (find_lkb(ls, ms->m_remid, &lkb)) {
+                log_error(ls, "receive_cancel_reply no lkb");
+                return;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        if (remove_from_waiters(lkb)) {
+                log_error(ls, "receive_cancel_reply not on waiters");
+        } else {
+                _receive_cancel_reply(lkb, ms);
+        }
+        dlm_put_lkb(lkb);
+}
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{       /* Handle a lookup reply for the lkb identified by m_lkid.  After the
+           lkb is taken off waiters, the master nodeid from the reply is
+           recorded on the rsb (0 when it is this node) and _request_lock()
+           is run again; when this node is the master, process_lookup_list()
+           is run as well. */
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int error, ret_nodeid;
+        error = find_lkb(ls, ms->m_lkid, &lkb);
+        if (error) {
+                log_error(ls, "receive_lookup_reply no lkb");
+                return;
+        }
+        error = remove_from_waiters(lkb);
+        if (error) {
+                log_error(ls, "receive_lookup_reply not on waiters");
+                goto out;
+        }
+        /* this is the value returned by dlm_dir_lookup on dir node
+           FIXME: will a non-zero error ever be returned?
+           NOTE(review): the value is stored here but not examined below. */
+        error = ms->m_result;
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        ret_nodeid = ms->m_nodeid;
+        if (ret_nodeid == dlm_our_nodeid()) {
+                r->res_nodeid = 0;
+                ret_nodeid = 0;
+                r->res_first_lkid = 0;
+        } else {
+                /* set_master() will copy res_nodeid to lkb_nodeid */
+                r->res_nodeid = ret_nodeid;
+        }
+        _request_lock(r, lkb);
+        if (!ret_nodeid)                /* we are the master */
+                process_lookup_list(r);
+        unlock_rsb(r);
+        put_rsb(r);
+ out:
+        dlm_put_lkb(lkb);
+}
+/* Entry point for an incoming dlm message: dispatch it to the receive_xxx()
+   handler for its m_type.
+   hd:       the message (a struct dlm_message starting with its header)
+   nodeid:   the node the message came from
+   recovery: non-zero when the message is being replayed from the
+             requestqueue rather than arriving fresh; dlm_message_in() and
+             the requestqueue wait/save steps are skipped in that case
+   Returns -EINVAL if the lockspace is unknown, -EINTR if locking is stopped
+   (the message was not processed; a fresh message has been saved on the
+   requestqueue), and 0 once the message has been dispatched. */
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+        struct dlm_message *ms = (struct dlm_message *) hd;
+        struct dlm_ls *ls;
+        int error = 0;
+        if (!recovery)
+                dlm_message_in(ms);
+        ls = dlm_find_lockspace_global(hd->h_lockspace);
+        if (!ls) {
+                log_print("drop message %d from %d for unknown lockspace %d",
+                          ms->m_type, nodeid, hd->h_lockspace);
+                return -EINVAL;
+        }
+        /* recovery may have just ended leaving a bunch of backed-up requests
+           in the requestqueue; wait while dlm_recoverd clears them */
+        if (!recovery)
+                dlm_wait_requestqueue(ls);
+        /* recovery may have just started while there were a bunch of
+           in-flight requests -- save them in requestqueue to be processed
+           after recovery.  we can't let dlm_recvd block on the recovery
+           lock.  if dlm_recoverd is calling this function to clear the
+           requestqueue, it needs to be interrupted (-EINTR) if another
+           recovery operation is starting. */
+        while (1) {
+                if (dlm_locking_stopped(ls)) {
+                        if (!recovery)
+                                dlm_add_requestqueue(ls, nodeid, hd);
+                        error = -EINTR;
+                        goto out;
+                }
+                if (lock_recovery_try(ls))
+                        break;
+                schedule();
+        }
+        switch (ms->m_type) {
+        /* messages sent to a master node */
+        case DLM_MSG_REQUEST:
+                receive_request(ls, ms);
+                break;
+        case DLM_MSG_CONVERT:
+                receive_convert(ls, ms);
+                break;
+        case DLM_MSG_UNLOCK:
+                receive_unlock(ls, ms);
+                break;
+        case DLM_MSG_CANCEL:
+                receive_cancel(ls, ms);
+                break;
+        /* messages sent from a master node (replies to above) */
+        case DLM_MSG_REQUEST_REPLY:
+                receive_request_reply(ls, ms);
+                break;
+        case DLM_MSG_CONVERT_REPLY:
+                receive_convert_reply(ls, ms);
+                break;
+        case DLM_MSG_UNLOCK_REPLY:
+                receive_unlock_reply(ls, ms);
+                break;
+        case DLM_MSG_CANCEL_REPLY:
+                receive_cancel_reply(ls, ms);
+                break;
+        /* messages sent from a master node (only two types of async msg) */
+        case DLM_MSG_GRANT:
+                receive_grant(ls, ms);
+                break;
+        case DLM_MSG_BAST:
+                receive_bast(ls, ms);
+                break;
+        /* messages sent to a dir node */
+        case DLM_MSG_LOOKUP:
+                receive_lookup(ls, ms);
+                break;
+        case DLM_MSG_REMOVE:
+                receive_remove(ls, ms);
+                break;
+        /* messages sent from a dir node (remove has no reply) */
+        case DLM_MSG_LOOKUP_REPLY:
+                receive_lookup_reply(ls, ms);
+                break;
+        default:
+                log_error(ls, "unknown message type %d", ms->m_type);
+        }
+        unlock_recovery(ls);
+ out:
+        dlm_put_lockspace(ls);
+        dlm_astd_wake();
+        /* propagate -EINTR so a caller replaying the requestqueue can tell
+           that this message was not processed and must be kept */
+        return error;
+}
+/*
+ * Recovery related
+ */
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{       /* Recovery handling for an lkb that is waiting for a convert reply.
+           For a middle_conversion() a reply of -EINPROGRESS is faked through
+           the stub message, grmode is set to DLM_LOCK_IV and the rsb is
+           flagged RSB_RECOVER_CONVERT.  Otherwise, when rqmode >= grmode,
+           the lkb is flagged DLM_IFL_RESEND. */
+        if (middle_conversion(lkb)) {
+                hold_lkb(lkb);          /* extra ref, dropped by unhold below */
+                ls->ls_stub_ms.m_result = -EINPROGRESS;
+                _remove_from_waiters(lkb);
+                _receive_convert_reply(lkb, &ls->ls_stub_ms);
+                /* Same special case as in receive_rcom_lock_args() */
+                lkb->lkb_grmode = DLM_LOCK_IV;
+                rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+                unhold_lkb(lkb);
+        } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+                lkb->lkb_flags |= DLM_IFL_RESEND;
+        }
+        /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+           conversions are async; there's no reply from the remote master */
+}
+/* Decide whether a waiting lkb needs recovery.  It does (returns 1) when the
+   node it is waiting on has been removed, or, only in a lockspace that uses
+   no directory, when the rsb's dir nodeid no longer matches the lkb's
+   nodeid.  Returns 0 otherwise. */
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        if (dlm_is_removed(ls, lkb->lkb_nodeid))
+                return 1;
+        if (dlm_no_directory(ls) &&
+            dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
+                return 1;
+        return 0;
+}
+/* Recovery for locks that are waiting for replies from nodes that are now
+   gone.  We can just complete unlocks and cancels by faking a reply from the
+   dead node.  Requests and up-conversions we flag to be resent after
+   recovery.  Down-conversions can just be completed with a fake reply like
+   unlocks.  Conversions between PR and CW need special attention.
+   The whole walk runs under ls_waiters_mutex.  The unlock and cancel cases
+   take an extra lkb reference around the faked reply and drop it again
+   afterwards; faked replies are delivered through ls_stub_ms. */
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb, *safe;
+        mutex_lock(&ls->ls_waiters_mutex);
+        list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+                log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+                          lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+                /* all outstanding lookups, regardless of destination  will be
+                   resent after recovery is done */
+                if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+                        lkb->lkb_flags |= DLM_IFL_RESEND;
+                        continue;
+                }
+                if (!waiter_needs_recovery(ls, lkb))
+                        continue;
+                switch (lkb->lkb_wait_type) {
+                case DLM_MSG_REQUEST:
+                        lkb->lkb_flags |= DLM_IFL_RESEND;
+                        break;
+                case DLM_MSG_CONVERT:
+                        recover_convert_waiter(ls, lkb);
+                        break;
+                case DLM_MSG_UNLOCK:
+                        hold_lkb(lkb);
+                        ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+                        _remove_from_waiters(lkb);
+                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+                        dlm_put_lkb(lkb);
+                        break;
+                case DLM_MSG_CANCEL:
+                        hold_lkb(lkb);
+                        ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+                        _remove_from_waiters(lkb);
+                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+                        dlm_put_lkb(lkb);
+                        break;
+                default:
+                        log_error(ls, "invalid lkb wait_type %d",
+                                  lkb->lkb_wait_type);
+                }
+                schedule();
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+}
+/* Take the first lkb flagged DLM_IFL_RESEND off the waiters list and clear
+   the flag.  Returns the lkb's wait type and stores the lkb in *lkb_ret;
+   when the return value is 0, *lkb_ret is set to NULL. */
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+        struct dlm_lkb *lkb;
+        int wait_type = 0;
+        mutex_lock(&ls->ls_waiters_mutex);
+        list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+                if (!(lkb->lkb_flags & DLM_IFL_RESEND))
+                        continue;
+                wait_type = lkb->lkb_wait_type;
+                _remove_from_waiters(lkb);
+                lkb->lkb_flags &= ~DLM_IFL_RESEND;
+                break;
+        }
+        mutex_unlock(&ls->ls_waiters_mutex);
+        *lkb_ret = wait_type ? lkb : NULL;
+        return wait_type;
+}
+/* Second half of waiter recovery: re-drive every lkb that _pre flagged
+   DLM_IFL_RESEND.  By now this node may have become the master or dir-node
+   for the rsb, and re-driving an lkb may put it back on the waiters list.
+   Returns 0 when no flagged lkb remains, or -EINTR if locking is stopped
+   while the list is being worked through. */
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *r;
+        int mstype, rv = 0;
+        for (;;) {
+                if (dlm_locking_stopped(ls)) {
+                        log_debug(ls, "recover_waiters_post aborted");
+                        rv = -EINTR;
+                        break;
+                }
+                mstype = remove_resend_waiter(ls, &lkb);
+                if (!mstype)
+                        break;
+                r = lkb->lkb_resource;
+                log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+                          lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+                switch (mstype) {
+                case DLM_MSG_LOOKUP:
+                case DLM_MSG_REQUEST:
+                        /* lookups and requests are restarted the same way */
+                        hold_rsb(r);
+                        lock_rsb(r);
+                        _request_lock(r, lkb);
+                        if (is_master(r))
+                                confirm_master(r, 0);
+                        unlock_rsb(r);
+                        put_rsb(r);
+                        break;
+                case DLM_MSG_CONVERT:
+                        hold_rsb(r);
+                        lock_rsb(r);
+                        _convert_lock(r, lkb);
+                        unlock_rsb(r);
+                        put_rsb(r);
+                        break;
+                default:
+                        log_error(ls, "recover_waiters_post type %d", mstype);
+                }
+        }
+        return rv;
+}
+/* Remove from one of r's queues every lkb for which test() returns non-zero.
+   Each removal flags the rsb RSB_LOCKS_PURGED, deletes the lkb from the rsb
+   and drops a reference that is expected to be the last one. */
+static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
+                        int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+{
+        struct dlm_lkb *cur, *next;
+        struct dlm_ls *ls = r->res_ls;
+        list_for_each_entry_safe(cur, next, queue, lkb_statequeue) {
+                if (!test(ls, cur))
+                        continue;
+                rsb_set_flag(r, RSB_LOCKS_PURGED);
+                del_lkb(r, cur);
+                /* this put should free the lkb */
+                if (!dlm_put_lkb(cur)) {
+                        log_error(ls, "purged lkb not released");
+                }
+        }
+}
+/* purge_queue() predicate: 1 for a master-copy lkb whose node has been
+   removed, 0 otherwise. */
+static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        if (!is_master_copy(lkb))
+                return 0;
+        return dlm_is_removed(ls, lkb->lkb_nodeid) ? 1 : 0;
+}
+/* purge_queue() predicate: selects every master-copy lkb (ls is unused). */
+static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        int mstcpy = is_master_copy(lkb);
+        return mstcpy;
+}
+/* Purge, from the grant, convert and wait queues of r (in that order), the
+   lkbs selected by purge_dead_test(). */
+static void purge_dead_locks(struct dlm_rsb *r)
+{
+        struct list_head *queues[] = {
+                &r->res_grantqueue,
+                &r->res_convertqueue,
+                &r->res_waitqueue,
+        };
+        unsigned int i;
+        for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+                purge_queue(r, queues[i], purge_dead_test);
+}
+/* Purge every master-copy lkb from the grant, convert and wait queues of r
+   (in that order). */
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+        struct list_head *queues[] = {
+                &r->res_grantqueue,
+                &r->res_convertqueue,
+                &r->res_waitqueue,
+        };
+        unsigned int i;
+        for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+                purge_queue(r, queues[i], purge_mstcpy_test);
+}
+/* Get rid of locks held by nodes that are gone.
+   Walks ls_root_list with ls_root_sem held for write; each rsb is held and
+   locked while it is examined, and purge_dead_locks() is run only on rsbs
+   for which is_master() is true.  Always returns 0. */
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r;
+        log_debug(ls, "dlm_purge_locks");
+        down_write(&ls->ls_root_sem);
+        list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+                hold_rsb(r);
+                lock_rsb(r);
+                if (is_master(r))
+                        purge_dead_locks(r);
+                unlock_rsb(r);
+                unhold_rsb(r);
+                schedule();
+        }
+        up_write(&ls->ls_root_sem);
+        return 0;
+}
+/* Find the first rsb in the given hash bucket that is flagged
+   RSB_LOCKS_PURGED.  The rsb is returned held and with the flag cleared;
+   NULL is returned when the bucket has no flagged rsb. */
+static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
+{
+        struct dlm_rsb *r, *found = NULL;
+        read_lock(&ls->ls_rsbtbl[bucket].lock);
+        list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+                if (rsb_flag(r, RSB_LOCKS_PURGED)) {
+                        hold_rsb(r);
+                        rsb_clear_flag(r, RSB_LOCKS_PURGED);
+                        found = r;
+                        break;
+                }
+        }
+        read_unlock(&ls->ls_rsbtbl[bucket].lock);
+        return found;
+}
+/* Visit every rsb flagged RSB_LOCKS_PURGED, bucket by bucket, and for those
+   this node masters run grant_pending_locks() and confirm_master().  A
+   bucket is rescanned until find_purged_rsb() finds nothing more in it. */
+void dlm_grant_after_purge(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r;
+        int bucket = 0;
+        for (;;) {
+                r = find_purged_rsb(ls, bucket);
+                if (r) {
+                        lock_rsb(r);
+                        if (is_master(r)) {
+                                grant_pending_locks(r);
+                                confirm_master(r, 0);
+                        }
+                        unlock_rsb(r);
+                        put_rsb(r);
+                        schedule();
+                        continue;
+                }
+                /* bucket exhausted: move on, or stop after the last one */
+                if (bucket == ls->ls_rsbtbl_size - 1)
+                        break;
+                bucket++;
+        }
+}
+/* Scan one rsb queue for the lkb whose lkb_nodeid and lkb_remid both match.
+   Returns the first match or NULL. */
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+                                         uint32_t remid)
+{
+        struct dlm_lkb *lkb;
+        list_for_each_entry(lkb, head, lkb_statequeue) {
+                if (lkb->lkb_nodeid != nodeid)
+                        continue;
+                if (lkb->lkb_remid != remid)
+                        continue;
+                return lkb;
+        }
+        return NULL;
+}
+/* Look for an lkb with the given remote nodeid/lkid on r, trying the grant
+   queue, then the convert queue, then the wait queue.  Returns NULL when no
+   queue has a match. */
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+                                    uint32_t remid)
+{
+        struct dlm_lkb *lkb;
+        lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+        if (!lkb)
+                lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+        if (!lkb)
+                lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+        return lkb;
+}
+/* Fill a freshly created master-copy lkb from the rcom_lock carried in rc.
+   ls:  lockspace, used for lvb allocation and its lvb length limit
+   lkb: the lkb to fill in; it is flagged DLM_IFL_MSTCPY
+   r:   the rsb the lkb will be added to; may get RSB_RECOVER_CONVERT set
+   rc:  the received rcom message whose rc_buf holds a struct rcom_lock
+   Returns 0 on success, -EINVAL if the lvb length implied by the message
+   header is negative or larger than ls->ls_lvblen, -ENOMEM if the lvb
+   cannot be allocated. */
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                                  struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+        struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+        int lvblen;
+        lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+        lkb->lkb_ownpid = rl->rl_ownpid;
+        lkb->lkb_remid = rl->rl_lkid;
+        lkb->lkb_exflags = rl->rl_exflags;
+        lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+        lkb->lkb_flags |= DLM_IFL_MSTCPY;
+        lkb->lkb_lvbseq = rl->rl_lvbseq;
+        lkb->lkb_rqmode = rl->rl_rqmode;
+        lkb->lkb_grmode = rl->rl_grmode;
+        /* don't set lkb_status because add_lkb wants to itself */
+        lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+        lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+                lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+                         sizeof(struct rcom_lock);
+                /* The length comes from the remote message header, so it
+                   must be bounded before it is used as a copy size.
+                   NOTE(review): assumes allocate_lvb() sizes the buffer
+                   from ls->ls_lvblen -- confirm against memory.c. */
+                if (lvblen < 0 || lvblen > ls->ls_lvblen)
+                        return -EINVAL;
+                lkb->lkb_lvbptr = allocate_lvb(ls);
+                if (!lkb->lkb_lvbptr)
+                        return -ENOMEM;
+                memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+        }
+        /* Conversions between PR and CW (middle modes) need special handling.
+           The real granted mode of these converting locks cannot be determined
+           until all locks have been rebuilt on the rsb (recover_conversion) */
+        if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+                rl->rl_status = DLM_LKSTS_CONVERT;
+                lkb->lkb_grmode = DLM_LOCK_IV;
+                rsb_set_flag(r, RSB_RECOVER_CONVERT);
+        }
+        return 0;
+}
+/* This lkb may have been recovered in a previous aborted recovery so we need
+   to check if the rsb already has an lkb with the given remote nodeid/lkid.
+   If so we just send back a standard reply.  If not, we create a new lkb with
+   the given values and send back our lkid.  We send back our lkid by sending
+   back the rcom_lock struct we got but with the remid field filled in.
+   The return value is also stored in rl_result: 0 on success, -EEXIST when
+   a matching lkb was already present (rl_remid is still filled in),
+   -EOPNOTSUPP when rl_parent_lkid is set, or the error from find_rsb(),
+   create_lkb() or receive_rcom_lock_args(). */
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+        struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+        struct dlm_rsb *r;
+        struct dlm_lkb *lkb;
+        int error;
+        if (rl->rl_parent_lkid) {
+                error = -EOPNOTSUPP;
+                goto out;
+        }
+        error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+        if (error)
+                goto out;
+        lock_rsb(r);
+        lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+        if (lkb) {
+                error = -EEXIST;
+                goto out_remid;
+        }
+        error = create_lkb(ls, &lkb);
+        if (error)
+                goto out_unlock;
+        error = receive_rcom_lock_args(ls, lkb, r, rc);
+        if (error) {
+                __put_lkb(ls, lkb);
+                goto out_unlock;
+        }
+        attach_lkb(r, lkb);
+        add_lkb(r, lkb, rl->rl_status);
+        error = 0;
+ out_remid:
+        /* this is the new value returned to the lock holder for
+           saving in its process-copy lkb */
+        rl->rl_remid = lkb->lkb_id;
+ out_unlock:
+        unlock_rsb(r);
+        put_rsb(r);
+ out:
+        if (error)
+                log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+        rl->rl_result = error;
+        return error;
+}
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{       /* Process the rcom_lock sent back for the process-copy lkb named by
+           rl_lkid.  For rl_result 0 or -EEXIST the returned rl_remid is
+           saved in the lkb; any other result is only logged.  In every case
+           dlm_recovered_lock() is called on the rsb.  Returns the find_lkb()
+           error if the lkb is unknown, otherwise 0. */
+        struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+        struct dlm_rsb *r;
+        struct dlm_lkb *lkb;
+        int error;
+        error = find_lkb(ls, rl->rl_lkid, &lkb);
+        if (error) {
+                log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+                return error;
+        }
+        DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+        error = rl->rl_result;
+        r = lkb->lkb_resource;
+        hold_rsb(r);
+        lock_rsb(r);
+        switch (error) {
+        case -EEXIST:
+                log_debug(ls, "master copy exists %x", lkb->lkb_id);
+                /* fall through */
+        case 0:
+                lkb->lkb_remid = rl->rl_remid;
+                break;
+        default:
+                log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+                          error, lkb->lkb_id);
+        }
+        /* an ack for dlm_recover_locks() which waits for replies from
+           all the locks it sends to new masters */
+        dlm_recovered_lock(r);
+        unlock_rsb(r);
+        put_rsb(r);
+        dlm_put_lkb(lkb);
+        return 0;
+}
+/* Request a new lock on behalf of a userspace process.
+   ls:          lockspace to lock in
+   ua:          user args for the lock; ownership passes to this function
+                (it is freed here on early failure, and belongs to the lkb
+                once attached)
+   mode, flags: requested mode and DLM_LKF_ flags; DLM_LKF_VALBLK makes this
+                function allocate a DLM_USER_LVB_LEN buffer for the lksb
+   name, namelen: resource name
+   parent_lkid: passed through to set_lock_args()
+   Returns 0 when request_lock() returns 0, -EINPROGRESS or -EAGAIN (in the
+   -EAGAIN case the lkb is released here and not added to the process's lock
+   list), otherwise a negative error.  Runs with the recovery lock held. */
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+                     int mode, uint32_t flags, void *name, unsigned int namelen,
+                     uint32_t parent_lkid)
+{
+        struct dlm_lkb *lkb;
+        struct dlm_args args;
+        int error;
+        lock_recovery(ls);
+        error = create_lkb(ls, &lkb);
+        if (error) {
+                kfree(ua);
+                goto out;
+        }
+        if (flags & DLM_LKF_VALBLK) {
+                ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                if (!ua->lksb.sb_lvbptr) {
+                        kfree(ua);
+                        __put_lkb(ls, lkb);
+                        error = -ENOMEM;
+                        goto out;
+                }
+        }
+        error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
+                              DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
+        if (error) {
+                /* set_lock_args() only fills in args, so ua has not been
+                   attached to the lkb yet and putting the lkb cannot free
+                   it; release ua and the lvb allocated above here. */
+                if (flags & DLM_LKF_VALBLK)
+                        kfree(ua->lksb.sb_lvbptr);
+                kfree(ua);
+                __put_lkb(ls, lkb);
+                goto out;
+        }
+        /* After ua is attached to lkb it will be freed by free_lkb().
+           When DLM_IFL_USER is set, the dlm knows that this is a userspace
+           lock and that lkb_astparam is the dlm_user_args structure. */
+        lkb->lkb_flags |= DLM_IFL_USER;
+        ua->old_mode = DLM_LOCK_IV;
+        error = request_lock(ls, lkb, name, namelen, &args);
+        switch (error) {
+        case 0:
+                break;
+        case -EINPROGRESS:
+                error = 0;
+                break;
+        case -EAGAIN:
+                error = 0;
+                /* fall through */
+        default:
+                __put_lkb(ls, lkb);
+                goto out;
+        }
+        /* add this new lkb to the per-process list of locks */
+        spin_lock(&ua->proc->locks_spin);
+        kref_get(&lkb->lkb_ref);
+        list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+        spin_unlock(&ua->proc->locks_spin);
+ out:
+        unlock_recovery(ls);
+        return error;
+}
+/* Convert an existing userspace lock (lkid) to a new mode.
+
+   ua_tmp carries the caller's new ast parameters; they are copied into the
+   dlm_user_args already attached to the lkb and ua_tmp is always freed
+   before returning.  An lvb buffer is allocated on demand when
+   DLM_LKF_VALBLK is requested and none exists yet; lvb_in, if given, is
+   copied into it.
+
+   Returns 0 on success (-EINPROGRESS and -EAGAIN from convert_lock() are
+   reported as 0), otherwise a negative error. */
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+                     int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+        struct dlm_user_args *ua;
+        struct dlm_args args;
+        struct dlm_lkb *lkb;
+        int error;
+        lock_recovery(ls);
+        error = find_lkb(ls, lkid, &lkb);
+        if (error)
+                goto out;
+        ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        /* user can change the params on its lock when it converts it, or
+           add an lvb that didn't exist before */
+        if ((flags & DLM_LKF_VALBLK) && ua->lksb.sb_lvbptr == NULL) {
+                ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                if (ua->lksb.sb_lvbptr == NULL) {
+                        error = -ENOMEM;
+                        goto out_put;
+                }
+        }
+        if (lvb_in != NULL && ua->lksb.sb_lvbptr != NULL)
+                memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+        /* take over the ast parameters supplied with this request */
+        ua->user_lksb = ua_tmp->user_lksb;
+        ua->castaddr = ua_tmp->castaddr;
+        ua->castparam = ua_tmp->castparam;
+        ua->bastaddr = ua_tmp->bastaddr;
+        ua->bastparam = ua_tmp->bastparam;
+        ua->old_mode = lkb->lkb_grmode;
+        error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
+                              ua, DLM_FAKE_USER_AST, &args);
+        if (error)
+                goto out_put;
+        error = convert_lock(ls, lkb, &args);
+        switch (error) {
+        case -EINPROGRESS:
+        case -EAGAIN:
+                error = 0;
+                break;
+        default:
+                break;
+        }
+ out_put:
+        /* drop the reference taken by find_lkb() */
+        dlm_put_lkb(lkb);
+ out:
+        unlock_recovery(ls);
+        kfree(ua_tmp);
+        return error;
+}
+/* Unlock a userspace lock (lkid).
+
+   castparam and user_lksb from ua_tmp replace the values in the
+   dlm_user_args attached to the lkb; lvb_in, if given and the lock has an
+   lvb buffer, is copied into it.  On success the lkb is taken off the
+   owning process's locks list and the reference held for that list is
+   dropped.
+
+   Returns 0 on success (-DLM_EUNLOCK from unlock_lock() is reported as 0),
+   otherwise a negative error.
+
+   NOTE(review): unlike dlm_user_convert(), ua_tmp is not freed here;
+   confirm the caller releases it. */
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+                    uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+        struct dlm_user_args *ua;
+        struct dlm_args args;
+        struct dlm_lkb *lkb;
+        int error;
+        lock_recovery(ls);
+        error = find_lkb(ls, lkid, &lkb);
+        if (error)
+                goto out;
+        ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        if (lvb_in != NULL && ua->lksb.sb_lvbptr != NULL)
+                memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+        ua->user_lksb = ua_tmp->user_lksb;
+        ua->castparam = ua_tmp->castparam;
+        error = set_unlock_args(flags, ua, &args);
+        if (error)
+                goto out_put;
+        error = unlock_lock(ls, lkb, &args);
+        if (error && error != -DLM_EUNLOCK)
+                goto out_put;
+        /* -DLM_EUNLOCK is the normal completion status */
+        error = 0;
+        spin_lock(&ua->proc->locks_spin);
+        list_del_init(&lkb->lkb_ownqueue);
+        spin_unlock(&ua->proc->locks_spin);
+        /* this removes the reference for the proc->locks list added by
+           dlm_user_request */
+        unhold_lkb(lkb);
+ out_put:
+        /* drop the reference taken by find_lkb() */
+        dlm_put_lkb(lkb);
+ out:
+        unlock_recovery(ls);
+        return error;
+}
+/* Cancel an in-progress request on a userspace lock (lkid).
+
+   castparam and user_lksb from ua_tmp replace the values in the
+   dlm_user_args attached to the lkb.  If, after a successful cancel, the
+   lock has no granted mode (lkb_grmode == DLM_LOCK_IV) it is removed from
+   the owning process's locks list and that list's reference is dropped.
+
+   Returns 0 on success (-DLM_ECANCEL from cancel_lock() is reported as 0),
+   otherwise a negative error.
+
+   NOTE(review): unlike dlm_user_convert(), ua_tmp is not freed here;
+   confirm the caller releases it. */
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+                    uint32_t flags, uint32_t lkid)
+{
+        struct dlm_user_args *ua;
+        struct dlm_args args;
+        struct dlm_lkb *lkb;
+        int error;
+        lock_recovery(ls);
+        error = find_lkb(ls, lkid, &lkb);
+        if (error)
+                goto out;
+        ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        ua->user_lksb = ua_tmp->user_lksb;
+        ua->castparam = ua_tmp->castparam;
+        error = set_unlock_args(flags, ua, &args);
+        if (error)
+                goto out_put;
+        error = cancel_lock(ls, lkb, &args);
+        if (error && error != -DLM_ECANCEL)
+                goto out_put;
+        /* -DLM_ECANCEL is the normal completion status */
+        error = 0;
+        /* this lkb was removed from the WAITING queue */
+        if (lkb->lkb_grmode != DLM_LOCK_IV)
+                goto out_put;
+        spin_lock(&ua->proc->locks_spin);
+        list_del_init(&lkb->lkb_ownqueue);
+        spin_unlock(&ua->proc->locks_spin);
+        unhold_lkb(lkb);
+ out_put:
+        /* drop the reference taken by find_lkb() */
+        dlm_put_lkb(lkb);
+ out:
+        unlock_recovery(ls);
+        return error;
+}
+/* Strip the userspace state from a lock that outlives its owning process:
+   free the dlm_user_args attached to the lkb (and its lvb buffer, if any)
+   and clear lkb_astparam so nothing refers to the freed memory.
+   Always returns 0. */
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        /* kfree(NULL) is a no-op, so no guard is needed for a missing lvb */
+        kfree(ua->lksb.sb_lvbptr);
+        kfree(ua);
+        lkb->lkb_astparam = (long)NULL;
+        /* TODO: propagate to master if needed */
+        return 0;
+}
+/* Force-unlock a lock whose owning process is going away.  With
+   DLM_LKF_FORCEUNLOCK the unlock goes ahead even if the lkb isn't granted:
+   regardless of what rsb queue the lock is on, it's removed and freed.
+   Returns 0 on success (-DLM_EUNLOCK counts as success), otherwise the
+   error from unlock_lock(). */
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+        struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        struct dlm_args args;
+        int rv;
+        /* FIXME: we need to handle the case where the lkb is in limbo
+           while the rsb is being looked up, currently we assert in
+           _unlock_lock/is_remote because rsb nodeid is -1. */
+        set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
+        rv = unlock_lock(ls, lkb, &args);
+        return (rv == -DLM_EUNLOCK) ? 0 : rv;
+}
+/* Release every lock still on proc->locks for a process that is closing.
+   Locks requested with DLM_LKF_PERSISTENT are orphaned (user state freed,
+   DLM_IFL_ORPHAN set); all others are marked DLM_IFL_DEAD and force
+   unlocked.  Runs with recovery locked out and ls_clear_proc_locks held. */
+/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
+   1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
+   which we clear here. */
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+   list, and no more device_writes should add lkb's to proc->locks list; so we
+   shouldn't need to take asts_spin or locks_spin here.  this assumes that
+   device reads/writes/closes are serialized -- FIXME: we may need to serialize
+   them ourself. */
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+        struct dlm_lkb *lkb, *safe;
+        lock_recovery(ls);
+        mutex_lock(&ls->ls_clear_proc_locks);
+        /* the _safe iterator is required: each lkb is unlinked from
+           proc->locks inside the loop */
+        list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
+                /* a pending ast means the lkb is also on an ast queue with
+                   its own reference; take it off and drop that reference */
+                if (lkb->lkb_ast_type) {
+                        list_del(&lkb->lkb_astqueue);
+                        unhold_lkb(lkb);
+                }
+                list_del_init(&lkb->lkb_ownqueue);
+                if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
+                        lkb->lkb_flags |= DLM_IFL_ORPHAN;
+                        orphan_proc_lock(ls, lkb);
+                } else {
+                        lkb->lkb_flags |= DLM_IFL_DEAD;
+                        unlock_proc_lock(ls, lkb);
+                }
+                /* this removes the reference for the proc->locks list
+                   added by dlm_user_request, it may result in the lkb
+                   being freed */
+                dlm_put_lkb(lkb);
+        }
+        mutex_unlock(&ls->ls_clear_proc_locks);
+        unlock_recovery(ls);
+}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+void dlm_print_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+int dlm_modes_compat(int mode1, int mode2);
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+        unsigned int flags, struct dlm_rsb **r_ret);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_grant_after_purge(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+/* recovery: handle a lock sent to a new master, and the master's reply;
+   both take the lock description from rc->rc_buf */
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+/* userspace lock operations; each returns 0 or a negative error */
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+        uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+        int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+        uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
+        uint32_t flags, uint32_t lkid);
+/* release all locks still owned by a closing userspace process */
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+/* Returns 1 when r->res_nodeid is zero, otherwise 0. */
+static inline int is_master(struct dlm_rsb *r)
+{
+        return r->res_nodeid == 0;
+}
+/* Take the rsb's res_mutex; may sleep.  Pair with unlock_rsb(). */
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+        mutex_lock(&r->res_mutex);
+}
+/* Release the rsb's res_mutex taken by lock_rsb(). */
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+        mutex_unlock(&r->res_mutex);
+}
+#endif
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "ast.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+/* Per-lockspace debug file hooks.  With CONFIG_DLM_DEBUG they are provided
+   elsewhere; without it they collapse to no-op stubs so callers need no
+   #ifdefs (the create stub reports success). */
+#ifdef CONFIG_DLM_DEBUG
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+/* ls_count:    number of lockspaces created; changed under ls_lock.  The
+                shared threads are started when it is 0 and stopped when it
+                drops back to 0.
+   ls_lock:     serializes lockspace creation and the ls_count update on
+                release.
+   lslist:      all lockspaces, linked through ls_list.
+   lslist_lock: protects lslist and each lockspace's ls_count reference
+                counter.
+   scand_task:  the dlm_scand kthread, set by dlm_scand_start(). */
+static int                      ls_count;
+static struct mutex             ls_lock;
+static struct list_head         lslist;
+static spinlock_t               lslist_lock;
+static struct task_struct *     scand_task;
+/* sysfs "control" store: writing 0 stops the lockspace, writing 1 starts
+   it.  Returns len on success and -EINVAL for any other value. */
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+        int cmd = simple_strtol(buf, NULL, 0);
+        if (cmd == 0) {
+                dlm_ls_stop(ls);
+                return len;
+        }
+        if (cmd == 1) {
+                dlm_ls_start(ls);
+                return len;
+        }
+        return -EINVAL;
+}
+/* sysfs "event_done" store: record the number written as the uevent result
+   and wake the waiter in do_uevent().  Always consumes the whole write. */
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+        ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+        /* do_uevent() sleeps until this bit is set, then clears it */
+        set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+        wake_up(&ls->ls_uevent_wait);
+        return len;
+}
+/* sysfs "id" show: print ls_global_id in decimal followed by a newline. */
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
+}
+/* sysfs "id" store: parse the written number (base auto-detected) into
+   ls_global_id.  The input is not validated; always returns len. */
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+        ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+        return len;
+}
+/* sysfs "recover_status" show: print the value of dlm_recover_status() in
+   hex followed by a newline. */
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+        uint32_t status = dlm_recover_status(ls);
+        return snprintf(buf, PAGE_SIZE, "%x\n", status);
+}
+/* sysfs "recover_nodeid" show: print ls_recover_nodeid in decimal followed
+   by a newline. */
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
+}
+/* A lockspace sysfs attribute: the generic attribute plus handlers that
+   take the lockspace directly.  Either handler may be left NULL; see
+   dlm_attr_show() and dlm_attr_store() for what happens then. */
+struct dlm_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct dlm_ls *, char *);
+        ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+/* "control": write-only; 0 stops the lockspace, 1 starts it */
+static struct dlm_attr dlm_attr_control = {
+        .attr  = {.name = "control", .mode = S_IWUSR},
+        .store = dlm_control_store
+};
+/* "event_done": write-only; delivers the result awaited by do_uevent() */
+static struct dlm_attr dlm_attr_event = {
+        .attr  = {.name = "event_done", .mode = S_IWUSR},
+        .store = dlm_event_store
+};
+/* "id": readable by all, writable by owner; the lockspace global id */
+static struct dlm_attr dlm_attr_id = {
+        .attr  = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+        .show  = dlm_id_show,
+        .store = dlm_id_store
+};
+/* "recover_status": read-only */
+static struct dlm_attr dlm_attr_recover_status = {
+        .attr  = {.name = "recover_status", .mode = S_IRUGO},
+        .show  = dlm_recover_status_show
+};
+/* "recover_nodeid": read-only */
+static struct dlm_attr dlm_attr_recover_nodeid = {
+        .attr  = {.name = "recover_nodeid", .mode = S_IRUGO},
+        .show  = dlm_recover_nodeid_show
+};
+/* NULL-terminated list of attributes installed via dlm_ktype */
+static struct attribute *dlm_attrs[] = {
+        &dlm_attr_control.attr,
+        &dlm_attr_event.attr,
+        &dlm_attr_id.attr,
+        &dlm_attr_recover_status.attr,
+        &dlm_attr_recover_nodeid.attr,
+        NULL,
+};
+/* sysfs show dispatcher: recover the lockspace and the dlm_attr from the
+   generic kobject/attribute and call the attribute's show handler.
+   Returns 0 (no output) when the attribute has no show handler. */
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+                             char *buf)
+{
+        struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+        struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+        if (!a->show)
+                return 0;
+        return a->show(ls, buf);
+}
+/* sysfs store dispatcher: recover the lockspace and the dlm_attr from the
+   generic kobject/attribute and call the attribute's store handler.
+   A write to an attribute without a store handler is accepted and
+   ignored (len is returned). */
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+                              const char *buf, size_t len)
+{
+        struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+        struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+        if (!a->store)
+                return len;
+        return a->store(ls, buf, len);
+}
+/* route sysfs reads/writes through the dlm_attr dispatchers */
+static struct sysfs_ops dlm_attr_ops = {
+        .show  = dlm_attr_show,
+        .store = dlm_attr_store,
+};
+/* kobject type for a lockspace: the default attribute set plus the ops */
+static struct kobj_type dlm_ktype = {
+        .default_attrs = dlm_attrs,
+        .sysfs_ops     = &dlm_attr_ops,
+};
+/* kset named "dlm" under kernel_subsys; every lockspace kobject joins it
+   (see kobject_setup) */
+static struct kset dlm_kset = {
+        .subsys = &kernel_subsys,
+        .kobj   = {.name = "dlm",},
+        .ktype  = &dlm_ktype,
+};
+/* Prepare the lockspace's kobject for registration: name it after the
+   lockspace (copied through a DLM_LOCKSPACE_LEN byte buffer) and attach it
+   to dlm_kset/dlm_ktype.  Returns 0 or the kobject_set_name() error. */
+static int kobject_setup(struct dlm_ls *ls)
+{
+        char lsname[DLM_LOCKSPACE_LEN];
+        int rv;
+        memset(lsname, 0, sizeof(lsname));
+        snprintf(lsname, sizeof(lsname), "%s", ls->ls_name);
+        rv = kobject_set_name(&ls->ls_kobj, "%s", lsname);
+        if (rv != 0)
+                return rv;
+        ls->ls_kobj.ktype = &dlm_ktype;
+        ls->ls_kobj.kset = &dlm_kset;
+        return 0;
+}
+/* Send an online (in != 0) or offline (in == 0) uevent for the lockspace
+   and sleep until the result is written to the "event_done" attribute.
+   Returns the wait error if the sleep is interrupted, otherwise the
+   result that was written. */
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+        int rv;
+        kobject_uevent(&ls->ls_kobj, in ? KOBJ_ONLINE : KOBJ_OFFLINE);
+        /* dlm_event_store() sets LSFL_UEVENT_WAIT and wakes us */
+        rv = wait_event_interruptible(ls->ls_uevent_wait,
+                        test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+        if (rv)
+                return rv;
+        return ls->ls_uevent_result;
+}
+/* One-time setup of the lockspace bookkeeping (counter, lock, list) and
+   registration of the "dlm" kset.  Returns 0 or the kset_register()
+   error, which is also logged. */
+int dlm_lockspace_init(void)
+{
+        int rv;
+        ls_count = 0;
+        mutex_init(&ls_lock);
+        INIT_LIST_HEAD(&lslist);
+        spin_lock_init(&lslist_lock);
+        rv = kset_register(&dlm_kset);
+        if (rv != 0) {
+                printk("dlm_lockspace_init: cannot register kset %d\n", rv);
+        }
+        return rv;
+}
+/* Undo dlm_lockspace_init(): unregister the "dlm" kset. */
+void dlm_lockspace_exit(void)
+{
+        kset_unregister(&dlm_kset);
+}
+/* Body of the dlm_scand kthread: until asked to stop, call
+   dlm_scan_rsbs() on every lockspace and then sleep for
+   dlm_config.scan_secs seconds.
+
+   NOTE(review): lslist is walked here without lslist_lock, unlike the
+   other lslist walkers in this file; confirm this is safe against
+   concurrent add/remove. */
+static int dlm_scand(void *data)
+{
+        struct dlm_ls *ls;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                list_for_each_entry(ls, &lslist, ls_list) {
+                        dlm_scan_rsbs(ls);
+                }
+                schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+        }
+        return 0;
+}
+/* Start the dlm_scand kthread and remember it in scand_task.  Returns 0,
+   or the kthread_run() error (scand_task is left untouched then). */
+static int dlm_scand_start(void)
+{
+        struct task_struct *task;
+        task = kthread_run(dlm_scand, NULL, "dlm_scand");
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        scand_task = task;
+        return 0;
+}
+/* Stop the kthread recorded in scand_task by dlm_scand_start(). */
+static void dlm_scand_stop(void)
+{
+        kthread_stop(scand_task);
+}
+/* Look up a lockspace by name (namelen bytes, compared with memcmp).
+   Returns the lockspace or NULL.  Unlike the other dlm_find_lockspace_*()
+   lookups this does not take a reference on the result. */
+static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
+{
+        struct dlm_ls *ls;
+        struct dlm_ls *found = NULL;
+        spin_lock(&lslist_lock);
+        list_for_each_entry(ls, &lslist, ls_list) {
+                if (ls->ls_namelen != namelen)
+                        continue;
+                if (memcmp(ls->ls_name, name, namelen) != 0)
+                        continue;
+                found = ls;
+                break;
+        }
+        spin_unlock(&lslist_lock);
+        return found;
+}
+/* Look up a lockspace by its global id.  On a match the lockspace's
+   ls_count is incremented; release it with dlm_put_lockspace().  Returns
+   NULL when no lockspace has that id. */
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+        struct dlm_ls *ls;
+        struct dlm_ls *found = NULL;
+        spin_lock(&lslist_lock);
+        list_for_each_entry(ls, &lslist, ls_list) {
+                if (ls->ls_global_id != id)
+                        continue;
+                ls->ls_count++;
+                found = ls;
+                break;
+        }
+        spin_unlock(&lslist_lock);
+        return found;
+}
+/* Look up a lockspace by the local handle given out at creation.  On a
+   match the lockspace's ls_count is incremented; release it with
+   dlm_put_lockspace().  Returns NULL for an unknown handle. */
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+        struct dlm_ls *ls;
+        struct dlm_ls *found = NULL;
+        spin_lock(&lslist_lock);
+        list_for_each_entry(ls, &lslist, ls_list) {
+                if (ls->ls_local_handle != lockspace)
+                        continue;
+                ls->ls_count++;
+                found = ls;
+                break;
+        }
+        spin_unlock(&lslist_lock);
+        return found;
+}
+/* Look up a lockspace by the minor number of its device.  On a match the
+   lockspace's ls_count is incremented; release it with
+   dlm_put_lockspace().  Returns NULL when no lockspace uses that minor. */
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+        struct dlm_ls *ls;
+        struct dlm_ls *found = NULL;
+        spin_lock(&lslist_lock);
+        list_for_each_entry(ls, &lslist, ls_list) {
+                if (ls->ls_device.minor != minor)
+                        continue;
+                ls->ls_count++;
+                found = ls;
+                break;
+        }
+        spin_unlock(&lslist_lock);
+        return found;
+}
+/* Drop a reference taken by one of the dlm_find_lockspace_*() lookups that
+   increment ls_count.  remove_lockspace() waits for the count to reach
+   zero before unlinking the lockspace. */
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+        spin_lock(&lslist_lock);
+        ls->ls_count--;
+        spin_unlock(&lslist_lock);
+}
+/* Unlink the lockspace from lslist once nobody holds a reference to it.
+   Polls ls_count once a second, sleeping between checks, so this can
+   block for as long as a reference is outstanding. */
+static void remove_lockspace(struct dlm_ls *ls)
+{
+        int unused;
+        do {
+                spin_lock(&lslist_lock);
+                unused = (ls->ls_count == 0);
+                if (unused)
+                        list_del(&ls->ls_list);
+                spin_unlock(&lslist_lock);
+                if (!unused)
+                        ssleep(1);
+        } while (!unused);
+}
+/* Start the threads shared by all lockspaces: dlm_astd, dlm_scand and the
+   lowcomms layer, in that order.  If one fails to start, those already
+   started are stopped again in reverse order and the error is returned. */
+static int threads_start(void)
+{
+        int rv;
+        /* Thread which processes lock requests for all lockspaces */
+        rv = dlm_astd_start();
+        if (rv) {
+                log_print("cannot start dlm_astd thread %d", rv);
+                return rv;
+        }
+        rv = dlm_scand_start();
+        if (rv) {
+                log_print("cannot start dlm_scand thread %d", rv);
+                dlm_astd_stop();
+                return rv;
+        }
+        /* Thread for sending/receiving messages for all lockspaces */
+        rv = dlm_lowcomms_start();
+        if (rv) {
+                log_print("cannot start dlm lowcomms %d", rv);
+                dlm_scand_stop();
+                dlm_astd_stop();
+                return rv;
+        }
+        return 0;
+}
+/* Stop everything threads_start() started: dlm_scand, lowcomms, dlm_astd. */
+static void threads_stop(void)
+{
+        dlm_scand_stop();
+        dlm_lowcomms_stop();
+        dlm_astd_stop();
+}
+/* Allocate, initialize and register a new lockspace.
+
+   name/namelen: lockspace name; namelen may not exceed DLM_LOCKSPACE_LEN.
+   lockspace:    receives the new lockspace on success, or the existing one
+                 when -EEXIST is returned.
+   flags:        stored in ls_exflags.
+   lvblen:       must be a non-zero multiple of 8.
+
+   Holds a module reference for the lifetime of the lockspace (dropped
+   again on every failure).  Returns 0, -EINVAL for bad arguments or a
+   failed module get, -EEXIST if the name is already in use, -ENOMEM, or
+   the error from starting recoverd, setting up the kobject or the online
+   uevent. */
+static int new_lockspace(char *name, int namelen, void **lockspace,
+                         uint32_t flags, int lvblen)
+{
+        struct dlm_ls *ls;
+        int i, size, error = -ENOMEM;
+        if (namelen > DLM_LOCKSPACE_LEN)
+                return -EINVAL;
+        if (!lvblen || (lvblen % 8))
+                return -EINVAL;
+        if (!try_module_get(THIS_MODULE))
+                return -EINVAL;
+        ls = dlm_find_lockspace_name(name, namelen);
+        if (ls) {
+                *lockspace = ls;
+                module_put(THIS_MODULE);
+                return -EEXIST;
+        }
+        /* the name is stored in the same allocation as the struct */
+        ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+        if (!ls)
+                goto out;
+        memcpy(ls->ls_name, name, namelen);
+        ls->ls_namelen = namelen;
+        ls->ls_exflags = flags;
+        ls->ls_lvblen = lvblen;
+        ls->ls_count = 0;
+        ls->ls_flags = 0;
+        /* rsb hash table: per bucket an active list, a toss list and a lock */
+        size = dlm_config.rsbtbl_size;
+        ls->ls_rsbtbl_size = size;
+        ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+        if (!ls->ls_rsbtbl)
+                goto out_lsfree;
+        for (i = 0; i < size; i++) {
+                INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
+                INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+                rwlock_init(&ls->ls_rsbtbl[i].lock);
+        }
+        /* lkb id table: per bucket a list, a lock and a counter starting at 1 */
+        size = dlm_config.lkbtbl_size;
+        ls->ls_lkbtbl_size = size;
+        ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+        if (!ls->ls_lkbtbl)
+                goto out_rsbfree;
+        for (i = 0; i < size; i++) {
+                INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
+                rwlock_init(&ls->ls_lkbtbl[i].lock);
+                ls->ls_lkbtbl[i].counter = 1;
+        }
+        /* directory table */
+        size = dlm_config.dirtbl_size;
+        ls->ls_dirtbl_size = size;
+        ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+        if (!ls->ls_dirtbl)
+                goto out_lkbfree;
+        for (i = 0; i < size; i++) {
+                INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
+                rwlock_init(&ls->ls_dirtbl[i].lock);
+        }
+        INIT_LIST_HEAD(&ls->ls_waiters);
+        mutex_init(&ls->ls_waiters_mutex);
+        INIT_LIST_HEAD(&ls->ls_nodes);
+        INIT_LIST_HEAD(&ls->ls_nodes_gone);
+        ls->ls_num_nodes = 0;
+        ls->ls_low_nodeid = 0;
+        ls->ls_total_weight = 0;
+        ls->ls_node_array = NULL;
+        memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+        ls->ls_stub_rsb.res_ls = ls;
+        ls->ls_debug_rsb_dentry = NULL;
+        ls->ls_debug_waiters_dentry = NULL;
+        init_waitqueue_head(&ls->ls_uevent_wait);
+        ls->ls_uevent_result = 0;
+        ls->ls_recoverd_task = NULL;
+        mutex_init(&ls->ls_recoverd_active);
+        spin_lock_init(&ls->ls_recover_lock);
+        ls->ls_recover_status = 0;
+        ls->ls_recover_seq = 0;
+        ls->ls_recover_args = NULL;
+        init_rwsem(&ls->ls_in_recovery);
+        INIT_LIST_HEAD(&ls->ls_requestqueue);
+        mutex_init(&ls->ls_requestqueue_mutex);
+        mutex_init(&ls->ls_clear_proc_locks);
+        ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+        if (!ls->ls_recover_buf)
+                goto out_dirfree;
+        INIT_LIST_HEAD(&ls->ls_recover_list);
+        spin_lock_init(&ls->ls_recover_list_lock);
+        ls->ls_recover_list_count = 0;
+        /* the handle handed back to the caller is the ls pointer itself */
+        ls->ls_local_handle = ls;
+        init_waitqueue_head(&ls->ls_wait_general);
+        INIT_LIST_HEAD(&ls->ls_root_list);
+        init_rwsem(&ls->ls_root_sem);
+        /* the lockspace starts out with ls_in_recovery held for write */
+        down_write(&ls->ls_in_recovery);
+        spin_lock(&lslist_lock);
+        list_add(&ls->ls_list, &lslist);
+        spin_unlock(&lslist_lock);
+        /* needs to find ls in lslist */
+        error = dlm_recoverd_start(ls);
+        if (error) {
+                log_error(ls, "can't start dlm_recoverd %d", error);
+                goto out_rcomfree;
+        }
+        /* return value is not checked: the debug file is best effort */
+        dlm_create_debug_file(ls);
+        error = kobject_setup(ls);
+        if (error)
+                goto out_del;
+        error = kobject_register(&ls->ls_kobj);
+        if (error)
+                goto out_del;
+        /* announce the lockspace and wait for the "event_done" result */
+        error = do_uevent(ls, 1);
+        if (error)
+                goto out_unreg;
+        *lockspace = ls;
+        return 0;
+        /* error unwind: each label undoes one more step, newest first */
+ out_unreg:
+        kobject_unregister(&ls->ls_kobj);
+ out_del:
+        dlm_delete_debug_file(ls);
+        dlm_recoverd_stop(ls);
+ out_rcomfree:
+        spin_lock(&lslist_lock);
+        list_del(&ls->ls_list);
+        spin_unlock(&lslist_lock);
+        kfree(ls->ls_recover_buf);
+ out_dirfree:
+        kfree(ls->ls_dirtbl);
+ out_lkbfree:
+        kfree(ls->ls_lkbtbl);
+ out_rsbfree:
+        kfree(ls->ls_rsbtbl);
+ out_lsfree:
+        kfree(ls);
+ out:
+        module_put(THIS_MODULE);
+        return error;
+}
+/* Create a lockspace, starting the shared dlm threads first if this is
+   the first one.
+
+   name/namelen, flags and lvblen are passed to new_lockspace(); *lockspace
+   receives the handle on success (and the existing lockspace when
+   -EEXIST is returned).  Serialized by ls_lock.
+
+   Returns 0 on success, otherwise the error from threads_start() or
+   new_lockspace().  If the threads were started for this call and the
+   lockspace could not be created, they are stopped again so that a later
+   call does not start a second set. */
+int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+                      uint32_t flags, int lvblen)
+{
+        int error = 0;
+        mutex_lock(&ls_lock);
+        if (!ls_count)
+                error = threads_start();
+        if (error)
+                goto out;
+        error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+        if (!error)
+                ls_count++;
+        else if (!ls_count)
+                /* no lockspace exists, so nothing needs the threads */
+                threads_stop();
+ out:
+        mutex_unlock(&ls_lock);
+        return error;
+}
+/* Report whether the lockspace still has lkbs.
+ * Returns 0 if there are none,
+ *         1 if there are lkbs but all have a non-zero lkb_nodeid (remote),
+ *         2 as soon as an lkb with lkb_nodeid == 0 (local) is found.
+ */
+static int lockspace_busy(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb;
+        int bucket;
+        int any_lkb = 0;
+        /* NOTE: We check the lockidtbl here rather than the resource table.
+           This is because there may be LKBs queued as ASTs that have been
+           unlinked from their RSBs and are pending deletion once the AST has
+           been delivered */
+        for (bucket = 0; bucket < ls->ls_lkbtbl_size; bucket++) {
+                read_lock(&ls->ls_lkbtbl[bucket].lock);
+                list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list,
+                                    lkb_idtbl_list) {
+                        any_lkb = 1;
+                        if (lkb->lkb_nodeid)
+                                continue;
+                        read_unlock(&ls->ls_lkbtbl[bucket].lock);
+                        return 2;
+                }
+                read_unlock(&ls->ls_lkbtbl[bucket].lock);
+        }
+        return any_lkb;
+}
+/* Tear down a lockspace and free everything it owns.
+
+   force is compared with the lockspace_busy() level: the release is
+   refused with -EBUSY when the lockspace is busier than force allows.
+   With force < 3 an offline uevent is sent (and waited for) first.
+   Otherwise returns 0; ls is freed and must not be used afterwards.  When
+   the last lockspace goes away the shared threads are stopped, and the
+   module reference taken at creation is dropped. */
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+        struct dlm_lkb *lkb;
+        struct dlm_rsb *rsb;
+        struct list_head *head;
+        int i;
+        int busy = lockspace_busy(ls);
+        if (busy > force)
+                return -EBUSY;
+        /* the result of the offline uevent is ignored */
+        if (force < 3)
+                do_uevent(ls, 0);
+        dlm_recoverd_stop(ls);
+        /* blocks until no lookup reference is outstanding */
+        remove_lockspace(ls);
+        dlm_delete_debug_file(ls);
+        /* astd stays suspended while the lkbs are freed below */
+        dlm_astd_suspend();
+        kfree(ls->ls_recover_buf);
+        /*
+         * Free direntry structs.
+         */
+        dlm_dir_clear(ls);
+        kfree(ls->ls_dirtbl);
+        /*
+         * Free all lkb's on lkbtbl[] lists.
+         */
+        for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+                head = &ls->ls_lkbtbl[i].list;
+                while (!list_empty(head)) {
+                        lkb = list_entry(head->next, struct dlm_lkb,
+                                         lkb_idtbl_list);
+                        list_del(&lkb->lkb_idtbl_list);
+                        dlm_del_ast(lkb);
+                        /* only master-copy lkbs have their lvb freed here */
+                        if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+                                free_lvb(lkb->lkb_lvbptr);
+                        free_lkb(lkb);
+                }
+        }
+        dlm_astd_resume();
+        kfree(ls->ls_lkbtbl);
+        /*
+         * Free all rsb's on rsbtbl[] lists
+         */
+        for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+                head = &ls->ls_rsbtbl[i].list;
+                while (!list_empty(head)) {
+                        rsb = list_entry(head->next, struct dlm_rsb,
+                                         res_hashchain);
+                        list_del(&rsb->res_hashchain);
+                        free_rsb(rsb);
+                }
+                head = &ls->ls_rsbtbl[i].toss;
+                while (!list_empty(head)) {
+                        rsb = list_entry(head->next, struct dlm_rsb,
+                                         res_hashchain);
+                        list_del(&rsb->res_hashchain);
+                        free_rsb(rsb);
+                }
+        }
+        kfree(ls->ls_rsbtbl);
+        /*
+         * Free structures on any other lists
+         */
+        kfree(ls->ls_recover_args);
+        dlm_clear_free_entries(ls);
+        dlm_clear_members(ls);
+        dlm_clear_members_gone(ls);
+        kfree(ls->ls_node_array);
+        kobject_unregister(&ls->ls_kobj);
+        kfree(ls);
+        /* ls_count below is the file-scope lockspace counter, not ls->ls_count */
+        mutex_lock(&ls_lock);
+        ls_count--;
+        if (!ls_count)
+                threads_stop();
+        mutex_unlock(&ls_lock);
+        module_put(THIS_MODULE);
+        return 0;
+}
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer.  We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died.  The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy lockspace if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+int dlm_release_lockspace(void *lockspace, int force)
+{
+        struct dlm_ls *ls = dlm_find_lockspace_local(lockspace);
+        if (ls == NULL)
+                return -EINVAL;
+        /* the lookup was only to validate the handle; give its reference
+           back right away so release_lockspace() is not kept waiting */
+        dlm_put_lockspace(ls);
+        return release_lockspace(ls, force);
+}
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id); /* lookup keyed by a 32-bit id */
+struct dlm_ls *dlm_find_lockspace_local(void *id); /* lookup keyed by the opaque handle given to dlm_release_lockspace(); callers treat NULL as "not found" */
+struct dlm_ls *dlm_find_lockspace_device(int minor); /* lookup keyed by a minor number */
+void dlm_put_lockspace(struct dlm_ls *ls); /* called after a successful find; presumably drops the reference it took - confirm */
+#endif                          /* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..23f5ce12080b
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is its
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; /* this node's own addresses, filled in by init_local() */
+static int                      dlm_local_count; /* number of entries stored in dlm_local_addr[] */
+static int                      dlm_local_nodeid; /* our nodeid, read from dlm_our_nodeid() in init_local() */
+/* One of these per connected node */
+#define NI_INIT_PENDING 1 /* bit number in nodeinfo.flags: an association setup has been started and not yet resolved */
+#define NI_WRITE_PENDING 2 /* bit number in nodeinfo.flags: node has been queued on write_nodes */
+struct nodeinfo {
+        spinlock_t              lock; /* held around some assoc_id accesses (e.g. process_sctp_notification()) */
+        sctp_assoc_t            assoc_id; /* SCTP association id; reset to 0 when the association is lost */
+        unsigned long           flags; /* NI_* bits, manipulated with the atomic bitops */
+        struct list_head        write_list; /* nodes with pending writes */
+        struct list_head        writequeue; /* outgoing writequeue_entries */
+        spinlock_t              writequeue_lock; /* taken while walking or changing writequeue */
+        int                     nodeid; /* the id this entry is stored under in nodeinfo_idr */
+};
+static DEFINE_IDR(nodeinfo_idr); /* maps nodeid -> struct nodeinfo */
+static struct rw_semaphore      nodeinfo_lock; /* read-held for idr lookups, write-held for insertion */
+static int                      max_nodeid; /* largest nodeid inserted so far; bounds the 1..max_nodeid scans */
+struct cbuf { /* circular-buffer indices; driven only through the CBUF_* macros */
+        unsigned                base; /* offset of the first unconsumed byte */
+        unsigned                len; /* number of unconsumed bytes */
+        unsigned                mask; /* buffer size minus one, used to wrap offsets with '&' */
+};
+/* Just the one of these, now. But this struct keeps
+   the connection-specific variables together */
+#define CF_READ_PENDING 1 /* bit number in connection.flags, set by lowcomms_data_ready() */
+struct connection {
+        struct socket          *sock; /* the SCTP socket; NULL when not open */
+        unsigned long           flags; /* CF_* bits */
+        struct page            *rx_page; /* receive buffer page, allocated on first use in receive_from_sock() */
+        atomic_t                waiting_requests; /* bumped once per data-ready callback */
+        struct cbuf             cb; /* read/write positions within rx_page */
+        int                     eagain_flag; /* NOTE(review): not referenced in the code visible here */
+};
+/* An entry waiting to be sent */
+struct writequeue_entry {
+        struct list_head        list; /* link on nodeinfo.writequeue */
+        struct page            *page; /* page holding the outgoing bytes */
+        int                     offset; /* start of the data within the page */
+        int                     len; /* committed length, set to end - offset on the last commit */
+        int                     end; /* next free offset; advanced by each dlm_lowcomms_get_buffer() */
+        int                     users; /* buffers handed out and not yet committed */
+        struct nodeinfo        *ni; /* destination node */
+};
+/*
+ * Helpers for struct cbuf (base/len/mask).  The buffer size handed to
+ * CBUF_INIT is turned into a wrap mask (size - 1), so offsets are wrapped
+ * with '&'; this only works for power-of-two sizes.
+ *   CBUF_ADD     - account for n freshly written bytes
+ *   CBUF_EMPTY   - true when nothing is buffered
+ *   CBUF_MAY_ADD - true when n more bytes still leave the buffer short of full
+ *   CBUF_DATA    - offset at which the next byte will be written
+ *   CBUF_EAT     - consume n bytes from the front
+ * Every macro argument is parenthesized so expressions can be passed safely.
+ */
+#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+#define CBUF_INIT(cb, size) \
+do { \
+        (cb)->base = (cb)->len = 0; \
+        (cb)->mask = ((size)-1); \
+} while(0)
+#define CBUF_EAT(cb, n) \
+do { \
+        (cb)->len  -= (n); \
+        (cb)->base += (n); \
+        (cb)->base &= (cb)->mask; \
+} while(0)
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock; /* guards write_nodes; always taken with spin_lock_bh() */
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task; /* woken after a node is queued on write_nodes */
+static wait_queue_head_t lowcomms_recv_wait; /* woken by lowcomms_data_ready() */
+static atomic_t accepting; /* while zero, dlm_lowcomms_get_buffer()/commit_buffer() return without doing work */
+/* The SCTP connection */
+static struct connection sctp_con;
+/*
+ * Look up the address configured for @nodeid and copy only its IP address
+ * part into @retaddr; family and port of @retaddr are left untouched.
+ * Returns 0 on success, -1 if no local address has been recorded yet, or
+ * whatever error dlm_nodeid_to_addr() reported.
+ */
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+        struct sockaddr_storage found;
+        int rv;
+        if (dlm_local_count == 0)
+                return -1;
+        rv = dlm_nodeid_to_addr(nodeid, &found);
+        if (rv)
+                return rv;
+        /* The family of our first local address selects the layout used. */
+        if (dlm_local_addr[0]->ss_family != AF_INET) {
+                struct sockaddr_in6 *src6 = (struct sockaddr_in6 *) &found;
+                struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *) retaddr;
+                memcpy(&dst6->sin6_addr, &src6->sin6_addr,
+                       sizeof(src6->sin6_addr));
+        } else {
+                struct sockaddr_in *src4 = (struct sockaddr_in *) &found;
+                struct sockaddr_in *dst4 = (struct sockaddr_in *) retaddr;
+                dst4->sin_addr.s_addr = src4->sin_addr.s_addr;
+        }
+        return 0;
+}
+/*
+ * Return the nodeinfo stored for @nodeid, or NULL.
+ *
+ * @alloc doubles as "create if missing" switch and allocation flags: when it
+ * is zero a missing entry just yields NULL; otherwise it is passed to
+ * idr_pre_get()/kmalloc() to create, initialise and register a new entry.
+ * NULL is also returned if creation fails or the idr hands back an id other
+ * than @nodeid.
+ */
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
+{
+        struct nodeinfo *ni;
+        int got_id;
+        /* Fast path: plain lookup under the read lock. */
+        down_read(&nodeinfo_lock);
+        ni = idr_find(&nodeinfo_idr, nodeid);
+        up_read(&nodeinfo_lock);
+        if (ni || !alloc)
+                return ni;
+        down_write(&nodeinfo_lock);
+        /* Somebody may have added it while we were not holding the lock. */
+        ni = idr_find(&nodeinfo_idr, nodeid);
+        if (ni)
+                goto unlock;
+        if (!idr_pre_get(&nodeinfo_idr, alloc))
+                goto unlock;
+        ni = kmalloc(sizeof(struct nodeinfo), alloc);
+        if (!ni)
+                goto unlock;
+        if (idr_get_new_above(&nodeinfo_idr, ni, nodeid, &got_id))
+                goto discard;
+        if (got_id != nodeid) {
+                /* The exact slot was not available; back the insertion out. */
+                idr_remove(&nodeinfo_idr, got_id);
+                goto discard;
+        }
+        memset(ni, 0, sizeof(struct nodeinfo));
+        spin_lock_init(&ni->lock);
+        INIT_LIST_HEAD(&ni->writequeue);
+        spin_lock_init(&ni->writequeue_lock);
+        ni->nodeid = nodeid;
+        if (max_nodeid < nodeid)
+                max_nodeid = nodeid;
+        goto unlock;
+ discard:
+        kfree(ni);
+        ni = NULL;
+ unlock:
+        up_write(&nodeinfo_lock);
+        return ni;
+}
+/*
+ * Map an SCTP association id back to its nodeinfo by trying every nodeid
+ * from 1 to max_nodeid.  This is a linear scan, so don't call it too often.
+ * Returns NULL when no known node records that association.
+ */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+        int nodeid = 1;
+        while (nodeid <= max_nodeid) {
+                struct nodeinfo *candidate = nodeid2nodeinfo(nodeid, 0);
+                if (candidate != NULL && candidate->assoc_id == assoc)
+                        return candidate;
+                nodeid++;
+        }
+        return NULL;
+}
+/*
+ * Socket callback: data or a notification is available on the socket.
+ * Counts the event and, unless a read was already flagged as pending,
+ * wakes whoever sleeps on lowcomms_recv_wait.
+ */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+        atomic_inc(&sctp_con.waiting_requests);
+        /* Only the transition from clear to set needs a wakeup. */
+        if (!test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+                wake_up_interruptible(&lowcomms_recv_wait);
+}
+/*
+ * Stamp family and port onto an IPv4 or IPv6 sockaddr and report the
+ * address length through @addr_len.  A @port of zero means "use the port
+ * stored in our first local address".  The unused tail of the storage is
+ * padded out with zeros to make comparisons meaningful.
+ * If no local address is known the function returns without touching
+ * @saddr or @addr_len.
+ */
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+                          int *addr_len)
+{
+        int is_ipv4;
+        if (!dlm_local_count)
+                return;
+        is_ipv4 = (dlm_local_addr[0]->ss_family == AF_INET);
+        if (!port) {
+                if (is_ipv4) {
+                        struct sockaddr_in *own4 =
+                                (struct sockaddr_in *)dlm_local_addr[0];
+                        port = be16_to_cpu(own4->sin_port);
+                } else {
+                        struct sockaddr_in6 *own6 =
+                                (struct sockaddr_in6 *)dlm_local_addr[0];
+                        port = be16_to_cpu(own6->sin6_port);
+                }
+        }
+        saddr->ss_family = dlm_local_addr[0]->ss_family;
+        if (!is_ipv4) {
+                struct sockaddr_in6 *out6 = (struct sockaddr_in6 *)saddr;
+                out6->sin6_port = cpu_to_be16(port);
+                /* Clear everything in the storage beyond the IPv6 sockaddr. */
+                memset(out6+1, 0, sizeof(struct sockaddr_storage) -
+                                  sizeof(struct sockaddr_in6));
+                *addr_len = sizeof(struct sockaddr_in6);
+        } else {
+                struct sockaddr_in *out4 = (struct sockaddr_in *)saddr;
+                out4->sin_port = cpu_to_be16(port);
+                /* Clear the in-struct padding and the storage beyond it. */
+                memset(&out4->sin_zero, 0, sizeof(out4->sin_zero));
+                memset(out4+1, 0, sizeof(struct sockaddr_storage) -
+                                  sizeof(struct sockaddr_in));
+                *addr_len = sizeof(struct sockaddr_in);
+        }
+}
+/*
+ * Release the SCTP socket and the receive page, if present, and clear the
+ * pointers in sctp_con so the function may safely be called again.
+ */
+static void close_connection(void)
+{
+        if (sctp_con.sock != NULL) {
+                struct socket *old_sock = sctp_con.sock;
+                sock_release(old_sock);
+                sctp_con.sock = NULL;
+        }
+        if (sctp_con.rx_page != NULL) {
+                struct page *old_page = sctp_con.rx_page;
+                __free_page(old_page);
+                sctp_con.rx_page = NULL;
+        }
+}
+/*
+ * Ask SCTP to shut down association @associd by sending a data-less message
+ * whose sndrcvinfo carries MSG_EOF.  We only send shutdown messages to nodes
+ * that are not part of the cluster.  A non-zero result from the send is
+ * logged; nothing is returned to the caller.
+ */
+static void send_shutdown(sctp_assoc_t associd)
+{
+        /* static: single control buffer shared by all calls */
+        static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+        struct msghdr eof_msg;
+        struct cmsghdr *hdr;
+        struct sctp_sndrcvinfo *info;
+        int rv;
+        eof_msg.msg_name = NULL;
+        eof_msg.msg_namelen = 0;
+        eof_msg.msg_flags = MSG_EOR;
+        eof_msg.msg_control = outcmsg;
+        eof_msg.msg_controllen = sizeof(outcmsg);
+        /* One SCTP_SNDRCV control message carrying the sndrcvinfo. */
+        hdr = CMSG_FIRSTHDR(&eof_msg);
+        hdr->cmsg_level = IPPROTO_SCTP;
+        hdr->cmsg_type = SCTP_SNDRCV;
+        hdr->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+        eof_msg.msg_controllen = hdr->cmsg_len;
+        info = (struct sctp_sndrcvinfo *)CMSG_DATA(hdr);
+        memset(info, 0x00, sizeof(struct sctp_sndrcvinfo));
+        info->sinfo_flags |= MSG_EOF;
+        info->sinfo_assoc_id = associd;
+        rv = kernel_sendmsg(sctp_con.sock, &eof_msg, NULL, 0, 0);
+        if (rv != 0)
+                log_print("send EOF to node failed: %d", rv);
+}
+/*
+ * An INIT failed but we don't know for which node, so restart INIT on all
+ * of them: every node that had NI_INIT_PENDING set gets its association id
+ * cleared and is queued on write_nodes (unless already queued).  The send
+ * thread is woken afterwards in any case.
+ */
+static void init_failed(void)
+{
+        struct nodeinfo *ni;
+        int nodeid;
+        for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+                ni = nodeid2nodeinfo(nodeid, 0);
+                if (!ni || !test_and_clear_bit(NI_INIT_PENDING, &ni->flags))
+                        continue;
+                ni->assoc_id = 0;
+                /* Already on write_nodes?  Then there is nothing to add. */
+                if (test_and_set_bit(NI_WRITE_PENDING, &ni->flags))
+                        continue;
+                spin_lock_bh(&write_nodes_lock);
+                list_add_tail(&ni->write_list, &write_nodes);
+                spin_unlock_bh(&write_nodes_lock);
+        }
+        wake_up_process(send_task);
+}
+/*
+ * Something happened to an association.
+ *
+ * @msg is the message header the notification arrived with (not examined
+ * here); @buf points at the notification itself.  Only SCTP_ASSOC_CHANGE
+ * notifications are acted on:
+ *  - COMM_UP / RESTART: work out which node the peer is, remember the
+ *    association id in its nodeinfo and kick the send thread.  Peers with
+ *    an unknown address are sent a shutdown.
+ *  - COMM_LOST / SHUTDOWN_COMP: forget the association id.
+ *  - CANT_STR_ASSOC: restart INIT on every node that had one pending.
+ */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+        union sctp_notification *sn = (union sctp_notification *)buf;
+        if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+                switch (sn->sn_assoc_change.sac_state) {
+                case SCTP_COMM_UP:
+                case SCTP_RESTART:
+                {
+                        /* Check that the new node is in the lockspace */
+                        struct sctp_prim prim;
+                        mm_segment_t fs;
+                        int nodeid;
+                        int prim_len, ret;
+                        int addr_len;
+                        struct nodeinfo *ni;
+                        /* This seems to happen when we received a connection
+                         * too early... or something...  anyway, it happens but
+                         * we always seem to get a real message too, see
+                         * receive_from_sock */
+                        if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+                                log_print("COMM_UP for invalid assoc ID %d",
+                                         (int)sn->sn_assoc_change.sac_assoc_id);
+                                init_failed();
+                                return;
+                        }
+                        /* Ask the socket for the peer's primary address; the
+                         * option buffer lives in kernel space, hence the
+                         * temporary address-limit switch around the call. */
+                        memset(&prim, 0, sizeof(struct sctp_prim));
+                        prim_len = sizeof(struct sctp_prim);
+                        prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+                        fs = get_fs();
+                        set_fs(get_ds());
+                        ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+                                                IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+                                                (char*)&prim, &prim_len);
+                        set_fs(fs);
+                        if (ret < 0) {
+                                log_print("getsockopt/sctp_primary_addr on "
+                                          "new assoc %d failed : %d",
+                                    (int)sn->sn_assoc_change.sac_assoc_id, ret);
+                                /* Retry INIT later */
+                                ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+                                if (ni)
+                                        clear_bit(NI_INIT_PENDING, &ni->flags);
+                                return;
+                        }
+                        /* Normalise port and padding so the address can be
+                         * compared against the configured ones. */
+                        make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+                        if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+                                log_print("reject connect from unknown addr");
+                                send_shutdown(prim.ssp_assoc_id);
+                                return;
+                        }
+                        ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+                        if (!ni)
+                                return;
+                        /* Save the assoc ID */
+                        spin_lock(&ni->lock);
+                        ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+                        spin_unlock(&ni->lock);
+                        log_print("got new/restarted association %d nodeid %d",
+                               (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+                        /* Send any pending writes */
+                        clear_bit(NI_INIT_PENDING, &ni->flags);
+                        if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                                spin_lock_bh(&write_nodes_lock);
+                                list_add_tail(&ni->write_list, &write_nodes);
+                                spin_unlock_bh(&write_nodes_lock);
+                        }
+                        wake_up_process(send_task);
+                }
+                break;
+                case SCTP_COMM_LOST:
+                case SCTP_SHUTDOWN_COMP:
+                {
+                        struct nodeinfo *ni;
+                        ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+                        if (ni) {
+                                spin_lock(&ni->lock);
+                                ni->assoc_id = 0;
+                                spin_unlock(&ni->lock);
+                        }
+                }
+                break;
+                /* We don't know which INIT failed, so clear the PENDING flags
+                 * on them all.  if assoc_id is zero then it will then try
+                 * again */
+                case SCTP_CANT_STR_ASSOC:
+                {
+                        log_print("Can't start SCTP association - retrying");
+                        init_failed();
+                }
+                break;
+                default:
+                        log_print("unexpected SCTP assoc change id=%d state=%d",
+                                  (int)sn->sn_assoc_change.sac_assoc_id,
+                                  sn->sn_assoc_change.sac_state);
+                }
+        }
+}
+/*
+ * Data received from remote end.
+ *
+ * Performs one non-blocking receive from the SCTP socket into the circular
+ * receive buffer.  Notifications are handed to process_sctp_notification();
+ * ordinary data is accounted in the buffer and passed to
+ * dlm_process_incoming_buffer(), whose return value says how many bytes
+ * were consumed.
+ *
+ * Returns 0 when there is no socket, when the receive page could not be
+ * allocated (a retry is flagged first), after a notification or a 1-byte
+ * INIT probe, and after data has been processed.  Otherwise returns the
+ * receive result (<= 0) or the negative result of
+ * dlm_process_incoming_buffer(); anything other than -EAGAIN is logged.
+ */
+static int receive_from_sock(void)
+{
+        int ret = 0;
+        struct msghdr msg;
+        struct kvec iov[2];
+        unsigned len;
+        int r;
+        int nvec = 1;
+        char *rx_start;
+        struct sctp_sndrcvinfo *sinfo;
+        struct cmsghdr *cmsg;
+        struct nodeinfo *ni;
+        /* These two are marginally too big for stack allocation, but this
+         * function is (currently) only called by dlm_recvd so static should be
+         * OK.
+         */
+        static struct sockaddr_storage msgname;
+        static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+        if (sctp_con.sock == NULL)
+                goto out;
+        if (sctp_con.rx_page == NULL) {
+                /*
+                 * This doesn't need to be atomic, but I think it should
+                 * improve performance if it is.
+                 */
+                sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+                if (sctp_con.rx_page == NULL)
+                        goto out_resched;
+                CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+        }
+        memset(&msgname, 0, sizeof(msgname));
+        memset(incmsg, 0, sizeof(incmsg));
+        msg.msg_name = &msgname;
+        msg.msg_namelen = sizeof(msgname);
+        msg.msg_flags = 0;
+        msg.msg_control = incmsg;
+        msg.msg_controllen = sizeof(incmsg);
+        /* I don't see why this circular buffer stuff is necessary for SCTP
+         * which is a packet-based protocol, but the whole thing breaks under
+         * load without it! The overhead is minimal (and is in the TCP lowcomms
+         * anyway, of course) so I'll leave it in until I can figure out what's
+         * really happening.
+         */
+        /*
+         * Remember where this receive starts writing: the kvec contents
+         * should not be relied on once the receive call has been made.
+         */
+        rx_start = page_address(sctp_con.rx_page) + CBUF_DATA(&sctp_con.cb);
+        /*
+         * iov[0] is the bit of the circular buffer between the current end
+         * point (cb.base + cb.len) and the end of the buffer.
+         */
+        iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+        iov[0].iov_base = rx_start;
+        iov[1].iov_len = 0;
+        /*
+         * iov[1] is the bit of the circular buffer between the start of the
+         * buffer and the start of the currently used section (cb.base)
+         */
+        if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+                iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+                iov[1].iov_len = sctp_con.cb.base;
+                iov[1].iov_base = page_address(sctp_con.rx_page);
+                /* Both segments are in play: tell the receive call so. */
+                nvec = 2;
+        }
+        len = iov[0].iov_len + iov[1].iov_len;
+        r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, nvec, len,
+                                 MSG_NOSIGNAL | MSG_DONTWAIT);
+        if (ret <= 0)
+                goto out_close;
+        /* Re-point at the control buffer before parsing the sndrcvinfo. */
+        msg.msg_control = incmsg;
+        msg.msg_controllen = sizeof(incmsg);
+        cmsg = CMSG_FIRSTHDR(&msg);
+        sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+        if (msg.msg_flags & MSG_NOTIFICATION) {
+                /* Parse the notification from where it was received, which
+                 * is not the start of the page once cb.base has advanced.
+                 * NOTE(review): a notification that wraps around the end of
+                 * the page is still not reassembled here. */
+                process_sctp_notification(&msg, rx_start);
+                return 0;
+        }
+        /* Is this a new association ? */
+        ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+        if (ni) {
+                ni->assoc_id = sinfo->sinfo_assoc_id;
+                if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+                        if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                                spin_lock_bh(&write_nodes_lock);
+                                list_add_tail(&ni->write_list, &write_nodes);
+                                spin_unlock_bh(&write_nodes_lock);
+                        }
+                        wake_up_process(send_task);
+                }
+        }
+        /* INIT sends a message with length of 1 - ignore it */
+        if (r == 1)
+                return 0;
+        CBUF_ADD(&sctp_con.cb, ret);
+        /* The ppid carries the sender's nodeid in little-endian form (see
+         * the cpu_to_le32() on the send side), so convert from the wire. */
+        ret = dlm_process_incoming_buffer(le32_to_cpu(sinfo->sinfo_ppid),
+                                          page_address(sctp_con.rx_page),
+                                          sctp_con.cb.base, sctp_con.cb.len,
+                                          PAGE_CACHE_SIZE);
+        if (ret < 0)
+                goto out_close;
+        CBUF_EAT(&sctp_con.cb, ret);
+      out:
+        ret = 0;
+        goto out_ret;
+      out_resched:
+        /* No page available: flag the read as still pending and yield. */
+        lowcomms_data_ready(sctp_con.sock->sk, 0);
+        ret = 0;
+        schedule();
+        goto out_ret;
+      out_close:
+        if (ret != -EAGAIN)
+                log_print("error reading from sctp socket: %d", ret);
+      out_ret:
+        return ret;
+}
+/*
+ * Bind the SCTP socket to one more local address.  SCTP allows multiple
+ * addresses so it can do multi-homing: the first address (@num == 1) goes
+ * through a plain bind, every further one is added with
+ * SCTP_SOCKOPT_BINDX_ADD.  Returns the result of the underlying call; a
+ * negative result is logged.
+ */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+        struct socket *sock = sctp_con.sock;
+        mm_segment_t saved_fs = get_fs();
+        int rv;
+        /* @addr is a kernel pointer, so widen the address limit meanwhile. */
+        set_fs(get_ds());
+        if (num != 1)
+                rv = sock->ops->setsockopt(sock, SOL_SCTP,
+                                SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+        else
+                rv = sock->ops->bind(sock, (struct sockaddr *) addr,
+                                     addr_len);
+        set_fs(saved_fs);
+        if (rv < 0)
+                log_print("Can't bind to port %d addr number %d",
+                          dlm_config.tcp_port, num);
+        return rv;
+}
+/*
+ * Record our own nodeid and take private copies of our configured
+ * addresses in dlm_local_addr[].  Stops at the first index for which
+ * dlm_our_addr() reports a non-zero result, on allocation failure, or after
+ * DLM_MAX_ADDR_COUNT - 1 addresses.
+ */
+static void init_local(void)
+{
+        struct sockaddr_storage sas;
+        struct sockaddr_storage *copy;
+        int idx = 0;
+        dlm_local_nodeid = dlm_our_nodeid();
+        while (idx < DLM_MAX_ADDR_COUNT - 1) {
+                if (dlm_our_addr(&sas, idx))
+                        break;
+                copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+                if (copy == NULL)
+                        break;
+                memcpy(copy, &sas, sizeof(*copy));
+                dlm_local_addr[dlm_local_count] = copy;
+                dlm_local_count++;
+                idx++;
+        }
+}
+/*
+ * Initialise the SCTP socket: create it, subscribe to the events we care
+ * about, hook up the data-ready callback, bind to every local address and
+ * start listening.  Returns 0 on success.  On failure the socket (if it was
+ * created) is released again and the error of the failing step is returned;
+ * -EINVAL is returned when no local address is configured.
+ */
+static int init_sock(void)
+{
+        mm_segment_t saved_fs;
+        struct socket *sock = NULL;
+        struct sockaddr_storage bind_addr;
+        struct sctp_event_subscribe events;
+        int rv = -EINVAL, idx, bind_len;
+        /* Collect our addresses on first use. */
+        if (!dlm_local_count)
+                init_local();
+        if (!dlm_local_count) {
+                log_print("no local IP address has been set");
+                goto out;
+        }
+        rv = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+                              IPPROTO_SCTP, &sock);
+        if (rv < 0) {
+                log_print("Can't create comms socket, check SCTP is loaded");
+                goto out;
+        }
+        /* Listen for events */
+        memset(&events, 0, sizeof(events));
+        events.sctp_data_io_event = 1;
+        events.sctp_association_event = 1;
+        events.sctp_send_failure_event = 1;
+        events.sctp_shutdown_event = 1;
+        events.sctp_partial_delivery_event = 1;
+        saved_fs = get_fs();
+        set_fs(get_ds());
+        rv = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+                                   (char *)&events, sizeof(events));
+        set_fs(saved_fs);
+        if (rv < 0) {
+                log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+                          rv);
+                goto release;
+        }
+        /* Init con struct */
+        sock->sk->sk_user_data = &sctp_con;
+        sctp_con.sock = sock;
+        sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+        /* Bind to all interfaces; addresses are numbered from 1. */
+        for (idx = 0; idx < dlm_local_count; idx++) {
+                memcpy(&bind_addr, dlm_local_addr[idx], sizeof(bind_addr));
+                make_sockaddr(&bind_addr, dlm_config.tcp_port, &bind_len);
+                rv = add_bind_addr(&bind_addr, bind_len, idx + 1);
+                if (rv)
+                        goto release;
+        }
+        rv = sock->ops->listen(sock, 5);
+        if (rv < 0) {
+                log_print("Can't set socket listening");
+                goto release;
+        }
+        return 0;
+ release:
+        sock_release(sock);
+        sctp_con.sock = NULL;
+ out:
+        return rv;
+}
+/*
+ * Allocate a writequeue entry together with its data page, using
+ * @allocation as the allocation flags for both.  The offset/len/end/users
+ * counters start at zero; the list linkage and the ni pointer are left for
+ * the caller to fill in.  Returns NULL if either allocation fails.
+ */
+static struct writequeue_entry *new_writequeue_entry(int allocation)
+{
+        struct writequeue_entry *e;
+        e = kmalloc(sizeof(struct writequeue_entry), allocation);
+        if (e == NULL)
+                return NULL;
+        e->page = alloc_page(allocation);
+        if (e->page == NULL) {
+                kfree(e);
+                return NULL;
+        }
+        e->users = 0;
+        e->end = 0;
+        e->len = 0;
+        e->offset = 0;
+        return e;
+}
+/*
+ * Reserve @len bytes of outgoing buffer space for @nodeid.
+ *
+ * Space is taken from the last entry on the node's writequeue when it still
+ * has room; otherwise a fresh entry is allocated with @allocation and
+ * appended.  On success *@ppc points at the reserved bytes and the returned
+ * opaque handle is what dlm_lowcomms_commit_buffer() expects.  Returns NULL
+ * when lowcomms is not accepting requests or an allocation fails.
+ */
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
+{
+        struct writequeue_entry *e = NULL;
+        struct nodeinfo *ni;
+        int offset = 0;
+        int users = 0;
+        if (!atomic_read(&accepting))
+                return NULL;
+        ni = nodeid2nodeinfo(nodeid, allocation);
+        if (!ni)
+                return NULL;
+        /* First choice: append to the newest queued entry. */
+        spin_lock(&ni->writequeue_lock);
+        if (!list_empty(&ni->writequeue)) {
+                e = list_entry(ni->writequeue.prev, struct writequeue_entry,
+                               list);
+                if (PAGE_CACHE_SIZE - e->end < len) {
+                        e = NULL;
+                } else {
+                        offset = e->end;
+                        e->end += len;
+                        users = e->users++;
+                }
+        }
+        spin_unlock(&ni->writequeue_lock);
+        if (!e) {
+                /* Nothing usable queued: start a new entry. */
+                e = new_writequeue_entry(allocation);
+                if (!e)
+                        return NULL;
+                spin_lock(&ni->writequeue_lock);
+                offset = e->end;
+                e->end += len;
+                e->ni = ni;
+                users = e->users++;
+                list_add_tail(&e->list, &ni->writequeue);
+                spin_unlock(&ni->writequeue_lock);
+        }
+        /* The first user of an entry maps its page. */
+        if (users == 0)
+                kmap(e->page);
+        *ppc = page_address(e->page) + offset;
+        return e;
+}
+/*
+ * Declare the space handed out by dlm_lowcomms_get_buffer() as filled in.
+ * @arg is the handle that function returned.  When the last outstanding
+ * user of the entry commits, the entry's length is fixed, its page is
+ * unmapped and the node is queued for the send thread (unless it already
+ * is).  Does nothing while lowcomms is not accepting requests.
+ */
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+        struct writequeue_entry *e = (struct writequeue_entry *) arg;
+        struct nodeinfo *ni = e->ni;
+        int last_user;
+        if (!atomic_read(&accepting))
+                return;
+        spin_lock(&ni->writequeue_lock);
+        last_user = (--e->users == 0);
+        if (last_user) {
+                e->len = e->end - e->offset;
+                kunmap(e->page);
+        }
+        spin_unlock(&ni->writequeue_lock);
+        /* Others are still filling in their part of the page. */
+        if (!last_user)
+                return;
+        if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+                spin_lock_bh(&write_nodes_lock);
+                list_add_tail(&ni->write_list, &write_nodes);
+                spin_unlock_bh(&write_nodes_lock);
+                wake_up_process(send_task);
+        }
+}
+/* Give back a writequeue entry: first its data page, then the entry. */
+static void free_entry(struct writequeue_entry *e)
+{
+        struct page *data_page = e->page;
+        __free_page(data_page);
+        kfree(e);
+}
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+   the first IP address and it should work, but this allows us to set up the
+   association before sending any valuable data that we can't afford to lose.
+   It also keeps the send path clean as it can now always use the association ID.
+
+   The association is triggered by sending a single byte, tagged with our
+   nodeid in the ppid field, to the address configured for @nodeid.  If the
+   send fails, NI_INIT_PENDING is cleared again so that a later attempt can
+   be made.  Nothing is returned; failures are only logged. */
+static void initiate_association(int nodeid)
+{
+        struct sockaddr_storage rem_addr;
+        static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+        struct msghdr outmessage;
+        struct cmsghdr *cmsg;
+        struct sctp_sndrcvinfo *sinfo;
+        int ret;
+        int addrlen;
+        /* Zeroed so that no stale stack byte is put on the wire */
+        char buf[1] = { 0 };
+        struct kvec iov[1];
+        struct nodeinfo *ni;
+        log_print("Initiating association with node %d", nodeid);
+        ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+        if (!ni)
+                return;
+        if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+                log_print("no address for nodeid %d", nodeid);
+                return;
+        }
+        /* nodeid_to_addr() only filled in the IP; add family and port */
+        make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+        outmessage.msg_name = &rem_addr;
+        outmessage.msg_namelen = addrlen;
+        outmessage.msg_control = outcmsg;
+        outmessage.msg_controllen = sizeof(outcmsg);
+        outmessage.msg_flags = MSG_EOR;
+        iov[0].iov_base = buf;
+        iov[0].iov_len = 1;
+        /* Real INIT messages seem to cause trouble. Just send a 1 byte message
+           we can afford to lose */
+        cmsg = CMSG_FIRSTHDR(&outmessage);
+        cmsg->cmsg_level = IPPROTO_SCTP;
+        cmsg->cmsg_type = SCTP_SNDRCV;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+        sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+        memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+        /* The receiver reads our nodeid back out of the ppid */
+        sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+        outmessage.msg_controllen = cmsg->cmsg_len;
+        ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+        if (ret < 0) {
+                log_print("send INIT to node failed: %d", ret);
+                /* Try again later */
+                clear_bit(NI_INIT_PENDING, &ni->flags);
+        }
+}
+/* Send a message
+ *
+ * Drains ni->writequeue onto the shared SCTP socket.  If the node has no
+ * association yet (and no INIT is pending) an association is initiated
+ * instead and 0 is returned without sending anything.
+ *
+ * Returns the result of the last kernel_sendmsg() call (0 if nothing was
+ * sent).  On -EAGAIN sctp_con.eagain_flag is set and the function returns
+ * early; on any other send error the association is reset and re-initiated
+ * unless an INIT is already pending.
+ */
+static int send_to_sock(struct nodeinfo *ni)
+{
+        int ret = 0;
+        struct writequeue_entry *e;
+        int len, offset;
+        struct msghdr outmsg;
+        /* NOTE(review): static, so shared by every call; presumably only one
+           thread ever runs this function - confirm */
+        static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+        struct cmsghdr *cmsg;
+        struct sctp_sndrcvinfo *sinfo;
+        struct kvec iov;
+        /* See if we need to init an association before we start
+           sending precious messages */
+        spin_lock(&ni->lock);
+        if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+                spin_unlock(&ni->lock);
+                initiate_association(ni->nodeid);
+                return 0;
+        }
+        spin_unlock(&ni->lock);
+        outmsg.msg_name = NULL; /* We use assoc_id */
+        outmsg.msg_namelen = 0;
+        outmsg.msg_control = outcmsg;
+        outmsg.msg_controllen = sizeof(outcmsg);
+        outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+        /* Build the SCTP_SNDRCV control message once; it is reused for every
+           entry sent in the loop below */
+        cmsg = CMSG_FIRSTHDR(&outmsg);
+        cmsg->cmsg_level = IPPROTO_SCTP;
+        cmsg->cmsg_type = SCTP_SNDRCV;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+        sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+        memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+        sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+        /* Read without ni->lock held */
+        sinfo->sinfo_assoc_id = ni->assoc_id;
+        outmsg.msg_controllen = cmsg->cmsg_len;
+        spin_lock(&ni->writequeue_lock);
+        for (;;) {
+                if (list_empty(&ni->writequeue))
+                        break;
+                e = list_entry(ni->writequeue.next, struct writequeue_entry,
+                               list);
+                /* Snapshot the pending range while the lock is held */
+                len = e->len;
+                offset = e->offset;
+                BUG_ON(len == 0 && e->users == 0);
+                spin_unlock(&ni->writequeue_lock);
+                /* NOTE(review): no matching kunmap() is visible in this
+                   function - confirm where the mapping is released */
+                kmap(e->page);
+                ret = 0;
+                if (len) {
+                        iov.iov_base = page_address(e->page)+offset;
+                        iov.iov_len = len;
+                        ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+                                             len);
+                        if (ret == -EAGAIN) {
+                                /* Socket is full: flag it and give up for now */
+                                sctp_con.eagain_flag = 1;
+                                goto out;
+                        } else if (ret < 0)
+                                goto send_error;
+                } else {
+                        /* Don't starve people filling buffers */
+                        schedule();
+                }
+                spin_lock(&ni->writequeue_lock);
+                /* Account for what was sent (ret is 0 if nothing was) */
+                e->offset += ret;
+                e->len -= ret;
+                /* Free the entry only once it is drained and has no users */
+                if (e->len == 0 && e->users == 0) {
+                        list_del(&e->list);
+                        free_entry(e);
+                        continue;
+                }
+        }
+        spin_unlock(&ni->writequeue_lock);
+ out:
+        return ret;
+ send_error:
+        log_print("Error sending to node %d %d", ni->nodeid, ret);
+        /* Forget the association and start a new one, unless an INIT is
+           already in flight */
+        spin_lock(&ni->lock);
+        if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+                ni->assoc_id = 0;
+                spin_unlock(&ni->lock);
+                initiate_association(ni->nodeid);
+        } else
+                spin_unlock(&ni->lock);
+        return ret;
+}
+/* Try to send any messages that are pending
+ *
+ * Walks the write_nodes list; each node is unlinked, has NI_WRITE_PENDING
+ * cleared, and is passed to send_to_sock().  write_nodes_lock is dropped
+ * around the send itself and re-taken before moving to the next node.
+ * The return value of send_to_sock() is ignored here.
+ */
+static void process_output_queue(void)
+{
+        struct list_head *list;
+        struct list_head *temp;
+        spin_lock_bh(&write_nodes_lock);
+        list_for_each_safe(list, temp, &write_nodes) {
+                struct nodeinfo *ni =
+                    list_entry(list, struct nodeinfo, write_list);
+                /* Clear the flag before sending so the node can be queued
+                   again while we are busy with it */
+                clear_bit(NI_WRITE_PENDING, &ni->flags);
+                list_del(&ni->write_list);
+                spin_unlock_bh(&write_nodes_lock);
+                send_to_sock(ni);
+                spin_lock_bh(&write_nodes_lock);
+        }
+        spin_unlock_bh(&write_nodes_lock);
+}
+/* Called after we've had -EAGAIN and been woken up
+ *
+ * Puts every known node that is not already marked NI_WRITE_PENDING back
+ * on the write_nodes list so its queue gets another send attempt.
+ */
+static void refill_write_queue(void)
+{
+        int nodeid;
+        for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+                struct nodeinfo *ni = nodeid2nodeinfo(nodeid, 0);
+                /* Skip unknown ids and nodes that are already queued */
+                if (!ni || test_and_set_bit(NI_WRITE_PENDING, &ni->flags))
+                        continue;
+                spin_lock_bh(&write_nodes_lock);
+                list_add_tail(&ni->write_list, &write_nodes);
+                spin_unlock_bh(&write_nodes_lock);
+        }
+}
+/* Discard every queued write entry for one node, sent or not */
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+        spin_lock(&ni->writequeue_lock);
+        /* Pop from the head until the queue is empty */
+        while (!list_empty(&ni->writequeue)) {
+                struct writequeue_entry *e =
+                        list_entry(ni->writequeue.next,
+                                   struct writequeue_entry, list);
+                list_del(&e->list);
+                free_entry(e);
+        }
+        spin_unlock(&ni->writequeue_lock);
+}
+/* Empty the write queue of every node id in 1..max_nodeid that has a nodeinfo */
+static void clean_writequeues(void)
+{
+        int nodeid;
+        for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+                struct nodeinfo *ni = nodeid2nodeinfo(nodeid, 0);
+                if (!ni)
+                        continue;
+                clean_one_writequeue(ni);
+        }
+}
+/* Remove every nodeinfo in 1..max_nodeid from nodeinfo_idr and free it */
+static void dealloc_nodeinfo(void)
+{
+        int nodeid;
+        for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+                struct nodeinfo *ni = nodeid2nodeinfo(nodeid, 0);
+                if (!ni)
+                        continue;
+                idr_remove(&nodeinfo_idr, nodeid);
+                kfree(ni);
+        }
+}
+/* Forget the association with a node and drop everything queued for it.
+ * Returns 0 on success, -1 if the nodeid has no nodeinfo. */
+int dlm_lowcomms_close(int nodeid)
+{
+        struct nodeinfo *ni = nodeid2nodeinfo(nodeid, 0);
+        if (!ni)
+                return -1;
+        spin_lock(&ni->lock);
+        if (ni->assoc_id) {
+                /* Don't send shutdown here, sctp will just queue it
+                   till the node comes back up! */
+                ni->assoc_id = 0;
+        }
+        spin_unlock(&ni->lock);
+        clean_one_writequeue(ni);
+        /* Allow a fresh association attempt on the next send */
+        clear_bit(NI_INIT_PENDING, &ni->flags);
+        return 0;
+}
+/* Return non-zero if no node is waiting on write_nodes (checked under the lock) */
+static int write_list_empty(void)
+{
+        int empty;
+        spin_lock_bh(&write_nodes_lock);
+        empty = list_empty(&write_nodes);
+        spin_unlock_bh(&write_nodes_lock);
+        return empty;
+}
+/* Receive daemon thread body.
+ *
+ * Sleeps on lowcomms_recv_wait until CF_READ_PENDING is set on sctp_con,
+ * then calls receive_from_sock() repeatedly until it returns a negative
+ * value or the thread is asked to stop.  Always returns 0.
+ */
+static int dlm_recvd(void *data)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        while (!kthread_should_stop()) {
+                int count = 0;
+                /* Mark ourselves sleeping before testing the flag so a
+                   wakeup that races with the test is not lost */
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue(&lowcomms_recv_wait, &wait);
+                if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+                        schedule();
+                remove_wait_queue(&lowcomms_recv_wait, &wait);
+                set_current_state(TASK_RUNNING);
+                if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+                        int ret;
+                        do {
+                                ret = receive_from_sock();
+                                /* Don't starve out everyone else */
+                                if (++count >= MAX_RX_MSG_COUNT) {
+                                        schedule();
+                                        count = 0;
+                                }
+                        } while (!kthread_should_stop() && ret >=0);
+                }
+                /* Called in TASK_RUNNING state, so this only yields the CPU */
+                schedule();
+        }
+        return 0;
+}
+/* Send daemon thread body.
+ *
+ * Stays registered on the socket's sk_sleep wait queue for its whole
+ * lifetime.  Each pass sleeps while the write_nodes list is empty, re-queues
+ * all nodes if an earlier send hit -EAGAIN, then drains the output queue.
+ * Always returns 0.
+ */
+static int dlm_sendd(void *data)
+{
+        DECLARE_WAITQUEUE(wait, current);
+        add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+        while (!kthread_should_stop()) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                if (write_list_empty())
+                        schedule();
+                set_current_state(TASK_RUNNING);
+                /* eagain_flag is set by send_to_sock() on -EAGAIN */
+                if (sctp_con.eagain_flag) {
+                        sctp_con.eagain_flag = 0;
+                        refill_write_queue();
+                }
+                process_output_queue();
+        }
+        remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+        return 0;
+}
+/* Stop the receive daemon, then the send daemon; kthread_stop() is called
+ * unconditionally on both task pointers */
+static void daemons_stop(void)
+{
+        kthread_stop(recv_task);
+        kthread_stop(send_task);
+}
+/* Start the receive and send daemons.
+ *
+ * Returns 0 on success, or the negative errno reported by kthread_run().
+ * If the send daemon cannot be started the already running receive daemon
+ * is stopped again.
+ */
+static int daemons_start(void)
+{
+        struct task_struct *p;
+        int error;
+        p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+        if (IS_ERR(p)) {
+                /* IS_ERR() only says whether it failed; PTR_ERR() recovers
+                   the actual errno to log and return */
+                error = PTR_ERR(p);
+                log_print("can't start dlm_recvd %d", error);
+                return error;
+        }
+        recv_task = p;
+        p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+        if (IS_ERR(p)) {
+                error = PTR_ERR(p);
+                log_print("can't start dlm_sendd %d", error);
+                kthread_stop(recv_task);
+                return error;
+        }
+        send_task = p;
+        return 0;
+}
+/*
+ * Bring the comms layer up: create the socket, start the daemons, then
+ * begin accepting.  Returns 0 or the error from the step that failed.
+ * This is quite likely to sleep...
+ */
+int dlm_lowcomms_start(void)
+{
+        int error = init_sock();
+        if (!error)
+                error = daemons_start();
+        if (error) {
+                /* Whichever step failed, tear the connection down again */
+                close_connection();
+                return error;
+        }
+        atomic_set(&accepting, 1);
+        return 0;
+}
+/* Set all the activity flags to prevent any socket activity.
+ *
+ * Then stops the daemons, discards all queued writes, closes the
+ * connection and frees every nodeinfo.
+ */
+void dlm_lowcomms_stop(void)
+{
+        atomic_set(&accepting, 0);
+        /* NOTE(review): 0x7 presumably covers the three lowest CF_* flag
+           bits - confirm against the flag definitions */
+        sctp_con.flags = 0x7;
+        daemons_stop();
+        clean_writequeues();
+        close_connection();
+        dealloc_nodeinfo();
+        max_nodeid = 0;
+}
+/* One-time initialisation of the file-level locks, list and wait queue.
+ * Cannot fail; always returns 0. */
+int dlm_lowcomms_init(void)
+{
+        /* list of nodes with pending writes, and its lock */
+        INIT_LIST_HEAD(&write_nodes);
+        spin_lock_init(&write_nodes_lock);
+        init_rwsem(&nodeinfo_lock);
+        /* wait queue the receive daemon sleeps on */
+        init_waitqueue_head(&lowcomms_recv_wait);
+        return 0;
+}
+/* Free the recorded local addresses and reset the local node identity */
+void dlm_lowcomms_exit(void)
+{
+        int i = 0;
+        while (i < dlm_local_count) {
+                kfree(dlm_local_addr[i]);
+                i++;
+        }
+        dlm_local_count = 0;
+        dlm_local_nodeid = 0;
+}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+/* One-time setup / teardown of the comms layer's global state */
+int dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+/* Create the socket and start the daemons / stop them and free everything */
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+/* Drop the association and queued writes for one node; -1 if it is unknown */
+int dlm_lowcomms_close(int nodeid);
+/* NOTE(review): presumably get_buffer reserves len bytes of send buffer for
+   nodeid (returned through ppc) and commit_buffer queues it for sending,
+   taking the handle get_buffer returned - confirm against lowcomms.c */
+void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
+#endif                          /* __LOWCOMMS_DOT_H__ */
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+/* 8x8 constant lookup table, defined elsewhere.
+   NOTE(review): presumably indexed by a pair of lock modes - confirm */
+extern const int dlm_lvb_operations[8][8];
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "config.h"
+/* debugfs hooks: real functions when CONFIG_DLM_DEBUG is set, otherwise
+   no-op stubs so init_dlm()/exit_dlm() can call them unconditionally */
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+#endif
+/* Module entry point.
+ *
+ * Initialises the subsystems in order: memory, lockspace, config, debugfs,
+ * lowcomms, user.  If a step fails, the steps that already succeeded are
+ * undone in reverse order through the label chain below and the failing
+ * step's error is returned.  Returns 0 on success.
+ */
+static int __init init_dlm(void)
+{
+        int error;
+        error = dlm_memory_init();
+        if (error)
+                goto out;
+        error = dlm_lockspace_init();
+        if (error)
+                goto out_mem;
+        error = dlm_config_init();
+        if (error)
+                goto out_lockspace;
+        error = dlm_register_debugfs();
+        if (error)
+                goto out_config;
+        error = dlm_lowcomms_init();
+        if (error)
+                goto out_debug;
+        error = dlm_user_init();
+        if (error)
+                goto out_lowcomms;
+        printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+        return 0;
+        /* Each label undoes one step and falls through to the earlier ones */
+ out_lowcomms:
+        dlm_lowcomms_exit();
+ out_debug:
+        dlm_unregister_debugfs();
+ out_config:
+        dlm_config_exit();
+ out_lockspace:
+        dlm_lockspace_exit();
+ out_mem:
+        dlm_memory_exit();
+ out:
+        return error;
+}
+/* Module exit point: tear down every subsystem set up by init_dlm().
+ * NOTE(review): the order here is not the exact reverse of the init order
+ * (memory is released before lockspace, debugfs last) - confirm that no
+ * exit routine depends on one that has already run. */
+static void __exit exit_dlm(void)
+{
+        dlm_user_exit();
+        dlm_lowcomms_exit();
+        dlm_config_exit();
+        dlm_memory_exit();
+        dlm_lockspace_exit();
+        dlm_unregister_debugfs();
+}
+/* Module registration and metadata */
+module_init(init_dlm);
+module_exit(exit_dlm);
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+/* Locking API exported to other GPL modules */
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+/*
+ * Following called by dlm_recoverd thread
+ */
+/* Insert a new member into ls->ls_nodes, keeping the list sorted by
+ * ascending nodeid. */
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+        struct list_head *head = &ls->ls_nodes;
+        struct list_head *pos;
+        /* Stop at the first member with a larger nodeid.  If there is none
+           (or the list is empty) pos ends up back at the list head. */
+        list_for_each(pos, head) {
+                struct dlm_member *memb =
+                        list_entry(pos, struct dlm_member, list);
+                if (new->nodeid < memb->nodeid)
+                        break;
+        }
+        /* list_add_tail() links the entry immediately before pos, which
+           appends to the list when pos is the head */
+        list_add_tail(&new->list, pos);
+}
+/* Allocate a member for nodeid, look up its weight and add it to the
+ * lockspace's ordered node list.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the negative
+ * value returned by dlm_node_weight().
+ */
+static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+{
+        struct dlm_member *memb;
+        int w;
+        memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+        if (!memb)
+                return -ENOMEM;
+        w = dlm_node_weight(ls->ls_name, nodeid);
+        if (w < 0) {
+                /* memb is not on any list yet; free it or it leaks */
+                kfree(memb);
+                return w;
+        }
+        memb->nodeid = nodeid;
+        memb->weight = w;
+        add_ordered_member(ls, memb);
+        ls->ls_num_nodes++;
+        return 0;
+}
+/* Take a member out of the active set: drop the node count and move the
+ * entry onto ls_nodes_gone (it is not freed here). */
+static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
+{
+        ls->ls_num_nodes--;
+        list_move(&memb->list, &ls->ls_nodes_gone);
+}
+/* Return 1 if nodeid is on the lockspace's current member list, else 0 */
+static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+        struct dlm_member *memb;
+        int found = 0;
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                if (memb->nodeid != nodeid)
+                        continue;
+                found = 1;
+                break;
+        }
+        return found;
+}
+/* Return 1 if nodeid is on the lockspace's list of departed members, else 0 */
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+        struct dlm_member *memb;
+        int found = 0;
+        list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+                if (memb->nodeid != nodeid)
+                        continue;
+                found = 1;
+                break;
+        }
+        return found;
+}
+/* Unlink and free every dlm_member on the given list */
+static void clear_memb_list(struct list_head *head)
+{
+        struct dlm_member *memb, *next;
+        list_for_each_entry_safe(memb, next, head, list) {
+                list_del(&memb->list);
+                kfree(memb);
+        }
+}
+/* Free all current members of the lockspace and reset the node count */
+void dlm_clear_members(struct dlm_ls *ls)
+{
+        struct list_head *members = &ls->ls_nodes;
+        clear_memb_list(members);
+        ls->ls_num_nodes = 0;
+}
+/* Free all entries on the lockspace's list of departed members */
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+        clear_memb_list(&ls->ls_nodes_gone);
+}
+/* Rebuild ls->ls_node_array: each member's nodeid appears once per unit of
+ * weight, in list order.  ls_total_weight is set to the array length.  If
+ * the allocation fails, ls_node_array is left NULL. */
+static void make_member_array(struct dlm_ls *ls)
+{
+        struct dlm_member *memb;
+        int *array;
+        int i, w, x = 0, total = 0, all_zero;
+        kfree(ls->ls_node_array);
+        ls->ls_node_array = NULL;
+        list_for_each_entry(memb, &ls->ls_nodes, list)
+                total += memb->weight;
+        /* all nodes revert to weight of 1 if all have weight 0 */
+        all_zero = !total;
+        if (all_zero)
+                total = ls->ls_num_nodes;
+        ls->ls_total_weight = total;
+        array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+        if (!array)
+                return;
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                w = all_zero ? 1 : memb->weight;
+                /* zero-weight members get no slots */
+                if (!w)
+                        continue;
+                DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+                for (i = 0; i < w; i++)
+                        array[x++] = memb->nodeid;
+        }
+        ls->ls_node_array = array;
+}
+/* send a status request to all members just to establish comms connections;
+   stops at the first error or as soon as recovery has been stopped, and
+   returns that error (0 if every member was pinged) */
+static int ping_members(struct dlm_ls *ls)
+{
+        struct dlm_member *memb;
+        int error = 0;
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                error = dlm_recovery_stopped(ls);
+                if (!error)
+                        error = dlm_rcom_status(ls, memb->nodeid);
+                if (error)
+                        break;
+        }
+        if (!error)
+                return 0;
+        log_debug(ls, "ping_members aborted %d last nodeid %d",
+                  error, ls->ls_recover_nodeid);
+        return error;
+}
+/* Bring the lockspace's member list in line with the node ids in rv.
+ *
+ * Members missing from rv->nodeids are moved to ls_nodes_gone, new ids are
+ * added, the lowest nodeid and the weighted node array are recomputed, and
+ * all members are pinged before waiting for them.
+ *
+ * *neg_out receives the number of members removed.  Returns 0 on success,
+ * or the error from adding a member, pinging or waiting.
+ */
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+        struct dlm_member *memb, *safe;
+        int i, error, found, neg = 0, low = -1;
+        /* move departed members from ls_nodes to ls_nodes_gone */
+        list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+                found = 0;
+                for (i = 0; i < rv->node_count; i++) {
+                        if (memb->nodeid == rv->nodeids[i]) {
+                                found = 1;
+                                break;
+                        }
+                }
+                if (!found) {
+                        neg++;
+                        dlm_remove_member(ls, memb);
+                        log_debug(ls, "remove member %d", memb->nodeid);
+                }
+        }
+        /* add new members to ls_nodes */
+        for (i = 0; i < rv->node_count; i++) {
+                if (dlm_is_member(ls, rv->nodeids[i]))
+                        continue;
+                error = dlm_add_member(ls, rv->nodeids[i]);
+                if (error) {
+                        /* Carrying on with an incomplete member list would
+                           go unnoticed; report the failure instead */
+                        log_debug(ls, "add member %d failed %d",
+                                  rv->nodeids[i], error);
+                        *neg_out = neg;
+                        goto out;
+                }
+                log_debug(ls, "add member %d", rv->nodeids[i]);
+        }
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                if (low == -1 || memb->nodeid < low)
+                        low = memb->nodeid;
+        }
+        ls->ls_low_nodeid = low;
+        make_member_array(ls);
+        dlm_set_recover_status(ls, DLM_RS_NODES);
+        *neg_out = neg;
+        error = ping_members(ls);
+        if (error)
+                goto out;
+        error = dlm_recover_members_wait(ls);
+ out:
+        log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+        return error;
+}
+/*
+ * Following called from lockspace.c
+ */
+/* Stop locking activity on a lockspace ahead of a new recovery.
+ * Always returns 0. */
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+        int new;
+        /*
+         * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
+         * dlm_recovery_stopped()) and prevents any new locks from being
+         * processed (see RUNNING, dlm_locking_stopped()).
+         */
+        spin_lock(&ls->ls_recover_lock);
+        set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+        /* new is non-zero only if the lockspace was running until now */
+        new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+        ls->ls_recover_seq++;
+        spin_unlock(&ls->ls_recover_lock);
+        /*
+         * This in_recovery lock does two things:
+         *
+         * 1) Keeps this function from returning until all threads are out
+         *    of locking routines and locking is truly stopped.
+         * 2) Keeps any new requests from being processed until it's unlocked
+         *    when recovery is complete.
+         */
+        if (new)
+                down_write(&ls->ls_in_recovery);
+        /*
+         * The recoverd suspend/resume makes sure that dlm_recoverd (if
+         * running) has noticed the clearing of RUNNING above and quit
+         * processing the previous recovery.  This will be true for all nodes
+         * before any nodes start the new recovery.
+         */
+        dlm_recoverd_suspend(ls);
+        ls->ls_recover_status = 0;
+        dlm_recoverd_resume(ls);
+        return 0;
+}
+/* Queue a new recovery for a stopped lockspace.
+ *
+ * Fetches the current node id list, stores it as the lockspace's recovery
+ * arguments (freeing any previous, unconsumed arguments) and kicks
+ * dlm_recoverd.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, -EINVAL if the
+ * lockspace is still running, or the non-positive value returned by
+ * dlm_nodeid_list().
+ */
+int dlm_ls_start(struct dlm_ls *ls)
+{
+        struct dlm_recover *rv = NULL, *rv_old;
+        int *ids = NULL;
+        int error, count;
+        rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+        if (!rv)
+                return -ENOMEM;
+        error = count = dlm_nodeid_list(ls->ls_name, &ids);
+        /* NOTE(review): a count of 0 takes the fail path and returns 0, so
+           the caller sees success although no recovery was queued */
+        if (error <= 0)
+                goto fail;
+        spin_lock(&ls->ls_recover_lock);
+        /* the lockspace needs to be stopped before it can be started */
+        if (!dlm_locking_stopped(ls)) {
+                spin_unlock(&ls->ls_recover_lock);
+                log_error(ls, "start ignored: lockspace running");
+                error = -EINVAL;
+                goto fail;
+        }
+        /* rv takes ownership of ids from here on */
+        rv->nodeids = ids;
+        rv->node_count = count;
+        rv->seq = ++ls->ls_recover_seq;
+        rv_old = ls->ls_recover_args;
+        ls->ls_recover_args = rv;
+        spin_unlock(&ls->ls_recover_lock);
+        /* Earlier arguments that were never picked up are dropped */
+        if (rv_old) {
+                kfree(rv_old->nodeids);
+                kfree(rv_old);
+        }
+        dlm_recoverd_kick(ls);
+        return 0;
+ fail:
+        kfree(rv);
+        kfree(ids);
+        return error;
+}
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+/* Stop locking on a lockspace / queue a new recovery for a stopped one */
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+/* Free the current member list / the list of departed members */
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+/* Sync the member list with rv; *neg_out gets the number of members removed */
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+/* Returns 1 if nodeid is on the departed-members list, else 0 */
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+#endif                          /* __MEMBER_DOT_H__ */
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+static kmem_cache_t *lkb_cache;
+/* Create the slab cache used for dlm_lkb structures.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created. */
+int dlm_memory_init(void)
+{
+        lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+                                __alignof__(struct dlm_lkb), 0, NULL, NULL);
+        return lkb_cache ? 0 : -ENOMEM;
+}
+/* Destroy the lkb slab cache, if it was ever created */
+void dlm_memory_exit(void)
+{
+        if (!lkb_cache)
+                return;
+        kmem_cache_destroy(lkb_cache);
+}
+/* Allocate a zero-filled lock value block of the lockspace's lvb length.
+ * Returns NULL if the allocation fails. */
+char *allocate_lvb(struct dlm_ls *ls)
+{
+        return kzalloc(ls->ls_lvblen, GFP_KERNEL);
+}
+/* Release a buffer obtained from allocate_lvb() */
+void free_lvb(char *p)
+{
+        kfree(p);
+}
+/* FIXME: have some minimal space built-in to rsb for the name and
+   kmalloc a separate name if needed, like dentries are done */
+/* Allocate a zero-filled rsb with namelen extra bytes after the struct for
+   the resource name.  Returns NULL if the allocation fails. */
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+{
+        DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+        return kzalloc(sizeof(struct dlm_rsb) + namelen, GFP_KERNEL);
+}
+/* Free an rsb together with its lock value block, if it has one */
+void free_rsb(struct dlm_rsb *r)
+{
+        char *lvb = r->res_lvbptr;
+        if (lvb)
+                free_lvb(lvb);
+        kfree(r);
+}
+/* Allocate a zero-filled lkb from the slab cache.
+ * Returns NULL if the allocation fails. */
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+{
+        struct dlm_lkb *lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
+        if (!lkb)
+                return NULL;
+        memset(lkb, 0, sizeof(*lkb));
+        return lkb;
+}
+/* Return an lkb to the slab cache.  For lkbs flagged DLM_IFL_USER the
+ * dlm_user_args stored in lkb_astparam, and its lvb buffer, are freed too. */
+void free_lkb(struct dlm_lkb *lkb)
+{
+        struct dlm_user_args *ua = NULL;
+        if (lkb->lkb_flags & DLM_IFL_USER)
+                ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        if (ua) {
+                if (ua->lksb.sb_lvbptr)
+                        kfree(ua->lksb.sb_lvbptr);
+                kfree(ua);
+        }
+        kmem_cache_free(lkb_cache, lkb);
+}
+/* Allocate a zero-filled directory entry with namelen extra bytes after the
+ * struct for the name.  Returns NULL if the allocation fails. */
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
+{
+        DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
+                   printk("namelen = %d\n", namelen););
+        return kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
+}
+/* Release a directory entry obtained from allocate_direntry() */
+void free_direntry(struct dlm_direntry *de)
+{
+        kfree(de);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+/* Create / destroy the slab cache that backs lkb allocations */
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+/* The allocate_* helpers return zero-filled memory, or NULL on failure */
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+/* Also frees the rsb's lvb, if it has one */
+void free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+/* Also frees the user args attached to DLM_IFL_USER lkbs */
+void free_lkb(struct dlm_lkb *l);
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+void free_direntry(struct dlm_direntry *de);
+/* Buffer size is taken from the lockspace's ls_lvblen */
+char *allocate_lvb(struct dlm_ls *ls);
+void free_lvb(char *l);
+#endif          /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take packets from the "real" comms layer,
+ * split them up into packets and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "rcom.h"
+#include "lock.h"
+#include "midcomms.h"
+/*
+ * Copy len bytes out of a circular buffer of limit bytes that begins at
+ * base, starting at offset.  Whatever does not fit between offset and the
+ * end of the buffer is taken from the start of the buffer instead.
+ */
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+                         unsigned len, unsigned limit)
+{
+        /* bytes available before the end of the buffer is reached */
+        unsigned head = ((len + offset) > limit) ? (limit - offset) : len;
+        /* bytes that wrapped around to the start of the buffer */
+        unsigned tail = len - head;
+        memcpy(dst, base + offset, head);
+        if (tail != 0)
+                memcpy(dst + head, base, tail);
+}
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+/*
+ * nodeid: sender; passed on to the message handlers and used in log output.
+ * base/limit: start and size of the circular receive buffer.
+ * offset/len: where the unread data starts and how many bytes are available.
+ *
+ * Returns -EINVAL when a header claims a length smaller than the header,
+ * -E2BIG when it claims more than dlm_config.buffer_size, and otherwise the
+ * number of bytes belonging to the complete messages that were dispatched.
+ * An error return replaces the byte count even if earlier messages in the
+ * same call were already dispatched.
+ */
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+                                unsigned offset, unsigned len, unsigned limit)
+{
+        unsigned char __tmp[DLM_INBUF_LEN];
+        struct dlm_header *msg = (struct dlm_header *) __tmp;
+        int ret = 0;
+        int err = 0;
+        uint16_t msglen;
+        uint32_t lockspace;
+        while (len > sizeof(struct dlm_header)) {
+                /* Copy just the header to check the total length.  The
+                   message may wrap around the end of the buffer back to the
+                   start, so we need to use a temp buffer and copy_from_cb. */
+                copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
+                             limit);
+                msglen = le16_to_cpu(msg->h_length);
+                lockspace = msg->h_lockspace;
+                /* err is set before each check so that a break leaves the
+                   matching error code behind */
+                err = -EINVAL;
+                if (msglen < sizeof(struct dlm_header))
+                        break;
+                err = -E2BIG;
+                if (msglen > dlm_config.buffer_size) {
+                        log_print("message size %d from %d too big, buf len %d",
+                                  msglen, nodeid, len);
+                        break;
+                }
+                err = 0;
+                /* If only part of the full message is contained in this
+                   buffer, then stop and wait for lowcomms to call us again
+                   later with more data.  The return value then counts only
+                   the complete messages already handled in this call (0 if
+                   there were none). */
+                if (msglen > len)
+                        break;
+                /* Allocate a larger temp buffer if the full message won't fit
+                   in the buffer on the stack (which should work for most
+                   ordinary messages).  Once allocated, the larger buffer is
+                   reused for the remaining messages and freed on exit. */
+                if (msglen > sizeof(__tmp) &&
+                    msg == (struct dlm_header *) __tmp) {
+                        msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+                        /* on allocation failure report what was consumed so
+                           far; the current message is left in the buffer */
+                        if (msg == NULL)
+                                return ret;
+                }
+                copy_from_cb(msg, base, offset, msglen, limit);
+                /* the header is copied a second time here; it must not have
+                   changed since the first copy */
+                BUG_ON(lockspace != msg->h_lockspace);
+                ret += msglen;
+                offset += msglen;
+                /* wrap the read position; NOTE(review): the mask assumes
+                   limit is a power of two -- confirm with the caller */
+                offset &= (limit - 1);
+                len -= msglen;
+                switch (msg->h_cmd) {
+                case DLM_MSG:
+                        dlm_receive_message(msg, nodeid, 0);
+                        break;
+                case DLM_RCOM:
+                        dlm_receive_rcom(msg, nodeid);
+                        break;
+                default:
+                        /* unknown commands are logged and still counted as
+                           consumed */
+                        log_print("unknown msg type %x from %u: %u %u %u %u",
+                                  msg->h_cmd, nodeid, msglen, len, offset, ret);
+                }
+        }
+        if (msg != (struct dlm_header *) __tmp)
+                kfree(msg);
+        return err ? err : ret;
+}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+/* Dispatch the complete DLM messages found in len bytes of a circular buffer
+   of limit bytes at base, starting at offset.  Returns the number of bytes
+   consumed, or a negative errno for a malformed length field. */
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+                                unsigned len, unsigned limit);
+#endif                          /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+/* Wait predicate handed to dlm_wait_function(): non-zero once
+   LSFL_RCOM_READY is set in ls_flags, which receive_sync_reply() does when
+   the awaited reply has been copied into ls_recover_buf. */
+static int rcom_response(struct dlm_ls *ls)
+{
+        return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+/*
+ * Obtain a send buffer for a recovery message of the given type to
+ * to_nodeid with len payload bytes after struct dlm_rcom, zero it and fill
+ * in the header.  On success stores the message and its lowcomms handle
+ * through rc_ret / mh_ret and returns 0; returns -ENOBUFS when no buffer
+ * could be obtained.
+ */
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+                       struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+        int total = sizeof(struct dlm_rcom) + len;
+        struct dlm_mhandle *handle;
+        struct dlm_rcom *out;
+        char *buf;
+        handle = dlm_lowcomms_get_buffer(to_nodeid, total, GFP_KERNEL, &buf);
+        if (handle == NULL) {
+                log_print("create_rcom to %d type %d len %d ENOBUFS",
+                          to_nodeid, type, len);
+                return -ENOBUFS;
+        }
+        memset(buf, 0, total);
+        out = (struct dlm_rcom *) buf;
+        out->rc_type = type;
+        out->rc_header.h_cmd = DLM_RCOM;
+        out->rc_header.h_length = total;
+        out->rc_header.h_nodeid = dlm_our_nodeid();
+        out->rc_header.h_lockspace = ls->ls_global_id;
+        out->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+        *rc_ret = out;
+        *mh_ret = handle;
+        return 0;
+}
+/* Finish and queue a message built by create_rcom().  dlm_rcom_out() is
+   applied first (NOTE(review): presumably the host-to-wire conversion --
+   confirm in util.c), then the buffer is committed to lowcomms.  The ls
+   argument is not used here. */
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+                      struct dlm_rcom *rc)
+{
+        dlm_rcom_out(rc);
+        dlm_lowcomms_commit_buffer(mh);
+}
+/* When replying to a status request, a node also sends back its
+   configuration values.  The requesting node then checks that the remote
+   node is configured the same way as itself. */
+/* Fill rf with this lockspace's lvb length and exflags; check_config()
+   compares the same two fields on the receiving side. */
+static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+{
+        rf->rf_lvblen = ls->ls_lvblen;
+        rf->rf_lsflags = ls->ls_exflags;
+}
+/*
+ * Compare the configuration reported by nodeid (rf) with our own lvb length
+ * and exflags.  Returns 0 when both match, otherwise logs the two sets of
+ * values and returns -EINVAL.
+ */
+static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+{
+        if (rf->rf_lvblen == ls->ls_lvblen &&
+            rf->rf_lsflags == ls->ls_exflags)
+                return 0;
+        log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+                  ls->ls_lvblen, ls->ls_exflags,
+                  nodeid, rf->rf_lvblen, rf->rf_lsflags);
+        return -EINVAL;
+}
+/*
+ * Obtain the recovery status of nodeid in ls_recover_buf (as rc_result of
+ * the struct dlm_rcom stored there).  For the local node the status is
+ * filled in directly; otherwise a DLM_RCOM_STATUS request is sent and the
+ * reply is waited for.  Returns 0 on success, the error from create_rcom()
+ * or dlm_wait_function(), or the result of check_config() on the reply.
+ */
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+{
+        struct dlm_rcom *rc;
+        struct dlm_mhandle *mh;
+        int error;
+        memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+        ls->ls_recover_nodeid = nodeid;
+        if (nodeid == dlm_our_nodeid()) {
+                /* asking ourselves: no message is needed */
+                rc = (struct dlm_rcom *) ls->ls_recover_buf;
+                rc->rc_result = dlm_recover_status(ls);
+                return 0;
+        }
+        error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+        if (error)
+                return error;
+        rc->rc_id = ++ls->ls_rcom_seq;
+        send_rcom(ls, mh, rc);
+        error = dlm_wait_function(ls, &rcom_response);
+        clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+        if (error)
+                return error;
+        /* the reply has been copied into ls_recover_buf */
+        rc = (struct dlm_rcom *) ls->ls_recover_buf;
+        if (rc->rc_result != -ESRCH)
+                return check_config(ls, (struct rcom_config *) rc->rc_buf,
+                                    nodeid);
+        /* we pretend the remote lockspace exists with 0 status */
+        log_debug(ls, "remote node %d not ready", nodeid);
+        rc->rc_result = 0;
+        /* the caller looks at rc_result for the remote recovery status */
+        return 0;
+}
+/*
+ * Answer a DLM_RCOM_STATUS request: reply to the sender with our recovery
+ * status in rc_result and our configuration in rc_buf, echoing the request's
+ * rc_id.  Nothing is sent if the reply buffer cannot be created.
+ */
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_rcom *reply;
+        int sender = rc_in->rc_header.h_nodeid;
+        if (create_rcom(ls, sender, DLM_RCOM_STATUS_REPLY,
+                        sizeof(struct rcom_config), &reply, &mh))
+                return;
+        reply->rc_id = rc_in->rc_id;
+        reply->rc_result = dlm_recover_status(ls);
+        make_config(ls, (struct rcom_config *) reply->rc_buf);
+        send_rcom(ls, mh, reply);
+}
+/*
+ * Deliver the reply to a synchronous request (status or names).  A reply
+ * whose rc_id differs from the current ls_rcom_seq belongs to an earlier
+ * request and is dropped with a debug message.  Otherwise the whole message
+ * (h_length bytes) is copied into ls_recover_buf, LSFL_RCOM_READY is set and
+ * waiters on ls_wait_general are woken.
+ */
+static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        if (rc_in->rc_id != ls->ls_rcom_seq) {
+                /* %llx needs an unsigned long long argument; cast so the
+                   call is correct whatever the underlying 64-bit type is */
+                log_debug(ls, "reject old reply %d got %llx wanted %llx",
+                          rc_in->rc_type,
+                          (unsigned long long) rc_in->rc_id,
+                          (unsigned long long) ls->ls_rcom_seq);
+                return;
+        }
+        memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+        set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+        wake_up(&ls->ls_wait_general);
+}
+/* A status reply is handled as a synchronous reply: see
+   receive_sync_reply(). */
+static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        receive_sync_reply(ls, rc_in);
+}
+/*
+ * Collect master names from nodeid into ls_recover_buf, continuing after
+ * last_name (last_len bytes).  For the local node the names are copied
+ * directly after the space reserved for a struct dlm_rcom; otherwise a
+ * DLM_RCOM_NAMES request carrying last_name is sent and the reply is waited
+ * for.  Returns 0, or the error from create_rcom() or dlm_wait_function().
+ */
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+        struct dlm_rcom *rc;
+        struct dlm_mhandle *mh;
+        int hdr_len = sizeof(struct dlm_rcom);
+        int error;
+        memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+        ls->ls_recover_nodeid = nodeid;
+        if (nodeid == dlm_our_nodeid()) {
+                dlm_copy_master_names(ls, last_name, last_len,
+                                      ls->ls_recover_buf + hdr_len,
+                                      dlm_config.buffer_size - hdr_len, nodeid);
+                return 0;
+        }
+        error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+        if (error)
+                return error;
+        memcpy(rc->rc_buf, last_name, last_len);
+        rc->rc_id = ++ls->ls_rcom_seq;
+        send_rcom(ls, mh, rc);
+        error = dlm_wait_function(ls, &rcom_response);
+        clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+        return error;
+}
+/*
+ * Answer a DLM_RCOM_NAMES request.  The payload of the request (everything
+ * after struct dlm_rcom) is handed to dlm_copy_master_names() together with
+ * a reply buffer sized to fill a whole dlm_config.buffer_size message.  The
+ * request is ignored unless our recovery status has DLM_RS_NODES set, and
+ * nothing is sent if the reply buffer cannot be created.
+ */
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        struct dlm_rcom *rc;
+        struct dlm_mhandle *mh;
+        int error, inlen, outlen;
+        int nodeid = rc_in->rc_header.h_nodeid;
+        uint32_t status = dlm_recover_status(ls);
+        /*
+         * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
+         * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
+         * It could only happen in rare cases where we get a late NAMES
+         * message from a previous instance of recovery.
+         */
+        if (!(status & DLM_RS_NODES)) {
+                log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
+                return;
+        }
+        /* nodeid already holds the sender; no need to read the header again */
+        inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+        outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+        error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+        if (error)
+                return;
+        rc->rc_id = rc_in->rc_id;
+        dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+                              nodeid);
+        send_rcom(ls, mh, rc);
+}
+/* A names reply is handled as a synchronous reply: see
+   receive_sync_reply(). */
+static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        receive_sync_reply(ls, rc_in);
+}
+/*
+ * Send a DLM_RCOM_LOOKUP for the resource name of r to dir_nodeid.  The
+ * address of r is used as rc_id so the reply can be matched to the rsb.
+ * Returns 0 or the error from create_rcom().
+ */
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+        struct dlm_ls *ls = r->res_ls;
+        struct dlm_mhandle *mh;
+        struct dlm_rcom *rc;
+        int error;
+        error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+                            &rc, &mh);
+        if (error)
+                return error;
+        memcpy(rc->rc_buf, r->res_name, r->res_length);
+        rc->rc_id = (unsigned long) r;
+        send_rcom(ls, mh, rc);
+        return 0;
+}
+/*
+ * Answer a DLM_RCOM_LOOKUP request: look the received name up with
+ * dlm_dir_lookup() and reply with the resulting nodeid in rc_result, or with
+ * the lookup's error code if it failed.  The request's rc_id is echoed.
+ * The reply buffer is created before the lookup; if that fails nothing is
+ * looked up or sent.
+ */
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_rcom *reply;
+        int sender = rc_in->rc_header.h_nodeid;
+        int namelen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+        int master, error;
+        error = create_rcom(ls, sender, DLM_RCOM_LOOKUP_REPLY, 0, &reply, &mh);
+        if (error)
+                return;
+        error = dlm_dir_lookup(ls, sender, rc_in->rc_buf, namelen, &master);
+        if (error)
+                master = error;
+        reply->rc_result = master;
+        reply->rc_id = rc_in->rc_id;
+        send_rcom(ls, mh, reply);
+}
+/* Hand a lookup reply to dlm_recover_master_reply(). */
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        dlm_recover_master_reply(ls, rc_in);
+}
+/*
+ * Fill rl with a description of lkb on resource r: rl is zeroed first, then
+ * the lock's ids, flags, modes, status and wait type are copied, rl_asts
+ * records which of the bast/ast callbacks are set, and the resource name
+ * and (if present) the lvb are appended.
+ */
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+                           struct rcom_lock *rl)
+{
+        memset(rl, 0, sizeof(*rl));
+        /* resource name */
+        rl->rl_namelen = r->res_length;
+        memcpy(rl->rl_name, r->res_name, r->res_length);
+        /* identity and flags */
+        rl->rl_lkid = lkb->lkb_id;
+        rl->rl_ownpid = lkb->lkb_ownpid;
+        rl->rl_flags = lkb->lkb_flags;
+        rl->rl_exflags = lkb->lkb_exflags;
+        rl->rl_lvbseq = lkb->lkb_lvbseq;
+        /* lock state */
+        rl->rl_grmode = lkb->lkb_grmode;
+        rl->rl_rqmode = lkb->lkb_rqmode;
+        rl->rl_status = lkb->lkb_status;
+        rl->rl_wait_type = lkb->lkb_wait_type;
+        /* which callbacks the lock has */
+        if (lkb->lkb_astaddr)
+                rl->rl_asts |= AST_COMP;
+        if (lkb->lkb_bastaddr)
+                rl->rl_asts |= AST_BAST;
+        /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+           If so, receive_rcom_lock_args() won't take this copy. */
+        if (lkb->lkb_lvbptr)
+                memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+/*
+ * Send lkb to the master of r (r->res_nodeid) in a DLM_RCOM_LOCK message.
+ * The payload is a struct rcom_lock, extended by ls_lvblen bytes when the
+ * lock has an lvb.  The address of r is used as rc_id.  Returns 0 or the
+ * error from create_rcom().
+ */
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+        struct dlm_ls *ls = r->res_ls;
+        struct dlm_mhandle *mh;
+        struct dlm_rcom *rc;
+        int payload = sizeof(struct rcom_lock);
+        int error;
+        if (lkb->lkb_lvbptr)
+                payload += ls->ls_lvblen;
+        error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, payload,
+                            &rc, &mh);
+        if (error)
+                return error;
+        pack_rcom_lock(r, lkb, (struct rcom_lock *) rc->rc_buf);
+        rc->rc_id = (unsigned long) r;
+        send_rcom(ls, mh, rc);
+        return 0;
+}
+/*
+ * Handle a DLM_RCOM_LOCK message: pass it to dlm_recover_master_copy() and
+ * then send the (updated) rcom_lock back to the sender in a
+ * DLM_RCOM_LOCK_REPLY with the same rc_id.  The master copy is processed
+ * even if the reply buffer then cannot be created.
+ */
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        struct dlm_mhandle *mh;
+        struct dlm_rcom *reply;
+        int sender = rc_in->rc_header.h_nodeid;
+        dlm_recover_master_copy(ls, rc_in);
+        if (create_rcom(ls, sender, DLM_RCOM_LOCK_REPLY,
+                        sizeof(struct rcom_lock), &reply, &mh))
+                return;
+        /* We send back the same rcom_lock struct we received, but
+           dlm_recover_master_copy() has filled in rl_remid and rl_result */
+        memcpy(reply->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+        reply->rc_id = rc_in->rc_id;
+        send_rcom(ls, mh, reply);
+}
+/*
+ * Handle a DLM_RCOM_LOCK_REPLY: pass it to dlm_recover_process_copy(), but
+ * only while our recovery status has DLM_RS_DIR set; otherwise the reply is
+ * logged and ignored.
+ */
+static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+        if (dlm_recover_status(ls) & DLM_RS_DIR) {
+                dlm_recover_process_copy(ls, rc_in);
+                return;
+        }
+        log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
+                  rc_in->rc_header.h_nodeid);
+}
+/*
+ * Reply to rc_in with a DLM_RCOM_STATUS_REPLY whose rc_result is -ESRCH.
+ * Used when no local lockspace matches, so the header is built by hand from
+ * the lockspace id and rc_id of the incoming message.  Returns 0, or
+ * -ENOBUFS when no send buffer could be obtained.
+ */
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+        int total = sizeof(struct dlm_rcom);
+        struct dlm_mhandle *handle;
+        struct dlm_rcom *reply;
+        char *buf;
+        handle = dlm_lowcomms_get_buffer(nodeid, total, GFP_KERNEL, &buf);
+        if (handle == NULL)
+                return -ENOBUFS;
+        memset(buf, 0, total);
+        reply = (struct dlm_rcom *) buf;
+        reply->rc_type = DLM_RCOM_STATUS_REPLY;
+        reply->rc_id = rc_in->rc_id;
+        reply->rc_result = -ESRCH;
+        reply->rc_header.h_cmd = DLM_RCOM;
+        reply->rc_header.h_length = total;
+        reply->rc_header.h_nodeid = dlm_our_nodeid();
+        reply->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+        reply->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+        dlm_rcom_out(reply);
+        dlm_lowcomms_commit_buffer(handle);
+        return 0;
+}
+/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+   recovery-only comms are sent through here. */
+/*
+ * hd:     the received message; treated as a struct dlm_rcom.
+ * nodeid: the node the message arrived from, checked against the nodeid
+ *         stored in the message header.
+ *
+ * Messages for an unknown lockspace get a "not ready" status reply.
+ * Messages other than DLM_RCOM_STATUS are dropped while recovery is
+ * stopped, and any message whose header nodeid differs from the connection
+ * nodeid is dropped.
+ */
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+{
+        struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+        struct dlm_ls *ls;
+        /* NOTE(review): presumably the wire-to-host conversion; it runs
+           before any field is examined -- confirm in util.c */
+        dlm_rcom_in(rc);
+        /* If the lockspace doesn't exist then still send a status message
+           back; it's possible that it just doesn't have its global_id yet. */
+        ls = dlm_find_lockspace_global(hd->h_lockspace);
+        if (!ls) {
+                log_print("lockspace %x from %d not found",
+                          hd->h_lockspace, nodeid);
+                /* NOTE(review): the status reply is sent whatever rc_type
+                   the incoming message has, and its result is not checked */
+                send_ls_not_ready(nodeid, rc);
+                return;
+        }
+        /* from here on every exit goes through out: to drop the lockspace */
+        if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
+                log_error(ls, "ignoring recovery message %x from %d",
+                          rc->rc_type, nodeid);
+                goto out;
+        }
+        if (nodeid != rc->rc_header.h_nodeid) {
+                log_error(ls, "bad rcom nodeid %d from %d",
+                          rc->rc_header.h_nodeid, nodeid);
+                goto out;
+        }
+        /* requests first, then the matching replies */
+        switch (rc->rc_type) {
+        case DLM_RCOM_STATUS:
+                receive_rcom_status(ls, rc);
+                break;
+        case DLM_RCOM_NAMES:
+                receive_rcom_names(ls, rc);
+                break;
+        case DLM_RCOM_LOOKUP:
+                receive_rcom_lookup(ls, rc);
+                break;
+        case DLM_RCOM_LOCK:
+                receive_rcom_lock(ls, rc);
+                break;
+        case DLM_RCOM_STATUS_REPLY:
+                receive_rcom_status_reply(ls, rc);
+                break;
+        case DLM_RCOM_NAMES_REPLY:
+                receive_rcom_names_reply(ls, rc);
+                break;
+        case DLM_RCOM_LOOKUP_REPLY:
+                receive_rcom_lookup_reply(ls, rc);
+                break;
+        case DLM_RCOM_LOCK_REPLY:
+                receive_rcom_lock_reply(ls, rc);
+                break;
+        default:
+                /* NOTE(review): rc_type comes from a received message, so
+                   this assertion can be triggered by a remote node */
+                DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+        }
+ out:
+        dlm_put_lockspace(ls);
+}
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+/* Synchronous requests: both wait for the reply, which is left in
+   ls_recover_buf, and return 0 or an error. */
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+/* Asynchronous requests: both send a message tagged with the address of r
+   and return without waiting for the reply. */
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+/* Entry point for received DLM_RCOM messages. */
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+#endif
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status.  They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timer to detect the result.  A timer wakes us up periodically while waiting
+ * to see if we should abort due to a node failure.  This should only be called
+ * by the dlm_recoverd thread.
+ */
+/* Timer callback for dlm_wait_function(): data is the struct dlm_ls
+   pointer.  Re-arms ls_timer to fire again after dlm_config.recover_timer
+   seconds and wakes ls_wait_general so the waiter re-checks its
+   condition. */
+static void dlm_wait_timer_fn(unsigned long data)
+{
+        struct dlm_ls *ls = (struct dlm_ls *) data;
+        mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+        wake_up(&ls->ls_wait_general);
+}
+/*
+ * Sleep on ls_wait_general until testfn(ls) returns non-zero or recovery is
+ * stopped.  ls_timer is armed for the duration of the wait so the condition
+ * is re-checked every dlm_config.recover_timer seconds.  Returns 0, or
+ * -EINTR if recovery is found stopped once the wait has ended.
+ */
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+        init_timer(&ls->ls_timer);
+        ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+        ls->ls_timer.data = (long) ls;
+        ls->ls_timer.function = dlm_wait_timer_fn;
+        add_timer(&ls->ls_timer);
+        wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
+        del_timer_sync(&ls->ls_timer);
+        if (!dlm_recovery_stopped(ls))
+                return 0;
+        log_debug(ls, "dlm_wait_function aborted");
+        return -EINTR;
+}
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status.  The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low).  When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+/* Return a snapshot of ls_recover_status, read under ls_recover_lock. */
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+        uint32_t status;
+        spin_lock(&ls->ls_recover_lock);
+        status = ls->ls_recover_status;
+        spin_unlock(&ls->ls_recover_lock);
+        return status;
+}
+/* OR the given flag(s) into ls_recover_status under ls_recover_lock.
+   Existing flags are kept; nothing is cleared here. */
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+        spin_lock(&ls->ls_recover_lock);
+        ls->ls_recover_status |= status;
+        spin_unlock(&ls->ls_recover_lock);
+}
+/*
+ * Poll every member of ls_nodes in turn until each reports wait_status in
+ * its recovery status.  Between polls of the same node the sleep grows by
+ * 20ms per attempt until it has reached 1000ms.  Returns 0 when all nodes
+ * have reported the status, -EINTR when recovery is stopped, or the error
+ * from dlm_rcom_status().
+ */
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+{
+        struct dlm_rcom *reply = (struct dlm_rcom *) ls->ls_recover_buf;
+        struct dlm_member *memb;
+        int error, backoff;
+        list_for_each_entry(memb, &ls->ls_nodes, list) {
+                backoff = 0;
+                for (;;) {
+                        if (dlm_recovery_stopped(ls))
+                                return -EINTR;
+                        error = dlm_rcom_status(ls, memb->nodeid);
+                        if (error)
+                                return error;
+                        /* this node is done, move on to the next one */
+                        if (reply->rc_result & wait_status)
+                                break;
+                        if (backoff < 1000)
+                                backoff += 20;
+                        msleep(backoff);
+                }
+        }
+        return 0;
+}
+/*
+ * Poll the node with the lowest nodeid (ls_low_nodeid) until it reports
+ * wait_status in its recovery status.  The sleep between polls grows by
+ * 20ms per attempt until it has reached 1000ms.  Returns 0 once the status
+ * is seen, -EINTR when recovery is stopped, or the error from
+ * dlm_rcom_status().
+ */
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+{
+        struct dlm_rcom *reply = (struct dlm_rcom *) ls->ls_recover_buf;
+        int low = ls->ls_low_nodeid;
+        int backoff = 0;
+        int error;
+        for (;;) {
+                if (dlm_recovery_stopped(ls))
+                        return -EINTR;
+                error = dlm_rcom_status(ls, low);
+                /* stop on failure, or on success once the flag is seen */
+                if (error || (reply->rc_result & wait_status))
+                        return error;
+                if (backoff < 1000)
+                        backoff += 20;
+                msleep(backoff);
+        }
+}
+/*
+ * Wait for every node to reach the given recovery status.  The "all nodes"
+ * flag is the bit directly above status (status << 1).  The node with the
+ * lowest nodeid polls all members for status and then sets the "all" flag
+ * on itself; every other node polls the low node for the "all" flag.
+ */
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+        uint32_t status_all = status << 1;
+        int error;
+        if (ls->ls_low_nodeid != dlm_our_nodeid())
+                return wait_status_low(ls, status_all);
+        error = wait_status_all(ls, status);
+        if (error)
+                return error;
+        dlm_set_recover_status(ls, status_all);
+        return 0;
+}
+/* Wait until all nodes have reported DLM_RS_NODES; see wait_status(). */
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+        return wait_status(ls, DLM_RS_NODES);
+}
+/* Wait until all nodes have reported DLM_RS_DIR; see wait_status(). */
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+        return wait_status(ls, DLM_RS_DIR);
+}
+/* Wait until all nodes have reported DLM_RS_LOCKS; see wait_status(). */
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+        return wait_status(ls, DLM_RS_LOCKS);
+}
+/* Wait until all nodes have reported DLM_RS_DONE; see wait_status(). */
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+        return wait_status(ls, DLM_RS_DONE);
+}
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid.  As replies are returned from the resource directories the
+ * rsb's are removed from the list.  When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+/* Return non-zero when ls_recover_list is empty, checked under
+   ls_recover_list_lock. */
+static int recover_list_empty(struct dlm_ls *ls)
+{
+        int empty;
+        spin_lock(&ls->ls_recover_list_lock);
+        empty = list_empty(&ls->ls_recover_list);
+        spin_unlock(&ls->ls_recover_list_lock);
+        return empty;
+}
+/* Put r on its lockspace's recover list, unless its res_recover_list node
+   is already linked somewhere.  Adding bumps ls_recover_list_count and
+   takes a hold on the rsb for as long as it stays on the list. */
+static void recover_list_add(struct dlm_rsb *r)
+{
+        struct dlm_ls *ls = r->res_ls;
+        spin_lock(&ls->ls_recover_list_lock);
+        if (list_empty(&r->res_recover_list)) {
+                list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+                ls->ls_recover_list_count++;
+                dlm_hold_rsb(r);
+        }
+        spin_unlock(&ls->ls_recover_list_lock);
+}
+/* Take r off the recover list and drop the hold taken by
+   recover_list_add().  NOTE(review): the count is decremented and the rsb
+   put without checking that r was actually on the list -- callers are
+   presumably expected to guarantee that. */
+static void recover_list_del(struct dlm_rsb *r)
+{
+        struct dlm_ls *ls = r->res_ls;
+        spin_lock(&ls->ls_recover_list_lock);
+        list_del_init(&r->res_recover_list);
+        ls->ls_recover_list_count--;
+        spin_unlock(&ls->ls_recover_list_lock);
+        /* the put happens after the list lock has been released */
+        dlm_put_rsb(r);
+}
+/*
+ * Look for the rsb on ls_recover_list whose address equals id (the value
+ * that was sent as rc_id).  Returns the rsb, or NULL if no entry matches.
+ * The list is walked under ls_recover_list_lock.
+ */
+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
+{
+        struct dlm_rsb *found = NULL;
+        struct dlm_rsb *cur;
+        spin_lock(&ls->ls_recover_list_lock);
+        list_for_each_entry(cur, &ls->ls_recover_list, res_recover_list) {
+                if (id == (unsigned long) cur) {
+                        found = cur;
+                        break;
+                }
+        }
+        spin_unlock(&ls->ls_recover_list_lock);
+        return found;
+}
+/* Empty the recover list: every rsb is unlinked, its hold is dropped and
+   the count is decremented, all under ls_recover_list_lock.  If the count
+   is not zero afterwards it is logged and reset to zero. */
+static void recover_list_clear(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r, *s;
+        spin_lock(&ls->ls_recover_list_lock);
+        /* the _safe variant is needed because entries are removed while
+           the list is walked */
+        list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+                list_del_init(&r->res_recover_list);
+                dlm_put_rsb(r);
+                ls->ls_recover_list_count--;
+        }
+        if (ls->ls_recover_list_count != 0) {
+                log_error(ls, "warning: recover_list_count %d",
+                          ls->ls_recover_list_count);
+                ls->ls_recover_list_count = 0;
+        }
+        spin_unlock(&ls->ls_recover_list_lock);
+}
+/* Master recovery: find new master node for rsb's that were
+   mastered on nodes that have been removed.
+   dlm_recover_masters
+   recover_master
+   dlm_send_rcom_lookup            ->  receive_rcom_lookup
+                                       dlm_dir_lookup
+   receive_rcom_lookup_reply       <-
+   dlm_recover_master_reply
+   set_new_master
+   set_master_lkbs
+   set_lock_master
+*/
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+/* Store nodeid in every lkb on queue, skipping lkbs that carry
+   DLM_IFL_MSTCPY. */
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+        struct dlm_lkb *cur;
+        list_for_each_entry(cur, queue, lkb_statequeue) {
+                if (cur->lkb_flags & DLM_IFL_MSTCPY)
+                        continue;
+                cur->lkb_nodeid = nodeid;
+        }
+}
+/* Apply r->res_nodeid to the lkbs on all three queues of r (grant, convert
+   and wait); see set_lock_master(). */
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+        set_lock_master(&r->res_grantqueue, r->res_nodeid);
+        set_lock_master(&r->res_convertqueue, r->res_nodeid);
+        set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+/*
+ * Propagate the new master nodeid to locks.  Done with the rsb locked.
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
+ * rsb's to consider.
+ */
+static void set_new_master(struct dlm_rsb *r, int nodeid)
+{
+        lock_rsb(r);
+        /* res_nodeid is updated first: set_master_lkbs() copies it to lkbs */
+        r->res_nodeid = nodeid;
+        set_master_lkbs(r);
+        rsb_set_flag(r, RSB_NEW_MASTER2);
+        rsb_set_flag(r, RSB_NEW_MASTER);
+        unlock_rsb(r);
+}
+/*
+ * Find the new master for one rsb.  If this node is the directory node for
+ * the resource, the directory is looked up directly and the result applied
+ * at once.  Otherwise the lookup is asynchronous: the rsb is kept on the
+ * recover_list while it waits for the lookup reply, and a lookup request is
+ * sent to the directory node.
+ *
+ * Returns the result of dlm_dir_lookup() or of dlm_send_rcom_lookup().
+ */
+static int recover_master(struct dlm_rsb *r)
+{
+        struct dlm_ls *ls = r->res_ls;
+        int our_nodeid = dlm_our_nodeid();
+        int dir_nodeid = dlm_dir_nodeid(r);
+        int ret_nodeid, error;
+        if (dir_nodeid != our_nodeid) {
+                /* remote directory node: the reply arrives later */
+                recover_list_add(r);
+                return dlm_send_rcom_lookup(r, dir_nodeid);
+        }
+        error = dlm_dir_lookup(ls, our_nodeid, r->res_name, r->res_length,
+                               &ret_nodeid);
+        if (error)
+                log_error(ls, "recover dir lookup error %d", error);
+        /*
+         * NOTE(review): ret_nodeid is consumed even when the lookup reported
+         * an error; confirm dlm_dir_lookup() stores a value on every path.
+         */
+        /* our own nodeid is recorded as 0 */
+        if (ret_nodeid == our_nodeid)
+                ret_nodeid = 0;
+        set_new_master(r, ret_nodeid);
+        return error;
+}
+/*
+ * When not using a directory, most resource names will hash to a new static
+ * master nodeid and the resource will need to be remastered.
+ *
+ * Returns 1 if the rsb was given a new master, 0 if its master is unchanged.
+ */
+static int recover_master_static(struct dlm_rsb *r)
+{
+        int new_master = dlm_dir_nodeid(r);
+        /* our own nodeid is recorded as 0 */
+        if (new_master == dlm_our_nodeid())
+                new_master = 0;
+        if (r->res_nodeid == new_master)
+                return 0;
+        /* we are the outgoing master: purge via dlm_purge_mstcpy_locks() */
+        if (is_master(r))
+                dlm_purge_mstcpy_locks(r);
+        set_new_master(r, new_master);
+        return 1;
+}
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory.  The dir will
+ * assign mastery to the first node to look up the new master.  That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node, then wait until the recover list has drained.
+ *
+ * Returns 0 on success, -EINTR if recovery was stopped during the scan, or
+ * the error from dlm_wait_function().  The recover list is cleared on error.
+ */
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r;
+        int error = 0, count = 0;
+        log_debug(ls, "dlm_recover_masters");
+        down_read(&ls->ls_root_sem);
+        list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+                if (dlm_recovery_stopped(ls)) {
+                        error = -EINTR;
+                        break;
+                }
+                if (dlm_no_directory(ls)) {
+                        count += recover_master_static(r);
+                } else if (!is_master(r) &&
+                           dlm_is_removed(ls, r->res_nodeid)) {
+                        /*
+                         * NOTE(review): the return value of recover_master()
+                         * is not checked here.
+                         */
+                        recover_master(r);
+                        count++;
+                }
+                schedule();
+        }
+        up_read(&ls->ls_root_sem);
+        if (error)
+                goto out;
+        log_debug(ls, "dlm_recover_masters %d resources", count);
+        error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+        if (error)
+                recover_list_clear(ls);
+        return error;
+}
+/*
+ * Handle the reply to a master lookup.  rc->rc_id identifies the rsb on the
+ * recover list and rc->rc_result carries the new master nodeid (our own
+ * nodeid is recorded as 0).  The rsb is given its new master and removed
+ * from the recover list; when that empties the list, waiters on
+ * ls_wait_general are woken.  A reply with an unknown id is only logged.
+ *
+ * Always returns 0.
+ */
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+        struct dlm_rsb *r;
+        int new_master;
+        r = recover_list_find(ls, rc->rc_id);
+        if (!r) {
+                log_error(ls, "dlm_recover_master_reply no id %llx",
+                          (unsigned long long)rc->rc_id);
+                return 0;
+        }
+        new_master = rc->rc_result;
+        if (new_master == dlm_our_nodeid())
+                new_master = 0;
+        set_new_master(r, new_master);
+        recover_list_del(r);
+        if (recover_list_empty(ls))
+                wake_up(&ls->ls_wait_general);
+        return 0;
+}
+/* Lock recovery: rebuild the process-copy locks we hold on a
+   remastered rsb on the new rsb master.
+   dlm_recover_locks
+   recover_locks
+   recover_locks_queue
+   dlm_send_rcom_lock              ->  receive_rcom_lock
+                                       dlm_recover_master_copy
+   receive_rcom_lock_reply         <-
+   dlm_recover_process_copy
+*/
+/*
+ * Send every lkb on 'head' to the new master with dlm_send_rcom_lock().
+ * We keep a count of the number of lkb's sent in res_recover_locks_count;
+ * when an equal number of replies has arrived, recovery for the rsb is done.
+ *
+ * Stops at the first send failure and returns its error, otherwise 0.
+ */
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+        struct dlm_lkb *lkb;
+        int rv;
+        list_for_each_entry(lkb, head, lkb_statequeue) {
+                rv = dlm_send_rcom_lock(r, lkb);
+                if (rv)
+                        return rv;
+                r->res_recover_locks_count++;
+        }
+        return 0;
+}
+/*
+ * Send the locks on all three queues of one rsb to its new master, with the
+ * rsb locked.  If any lock was sent, the rsb goes on the recover list;
+ * if none was, RSB_NEW_MASTER is cleared straight away.
+ *
+ * Returns 0, or the first error from recover_locks_queue(); on error the
+ * remaining queues are skipped.
+ */
+static int recover_locks(struct dlm_rsb *r)
+{
+        int error;
+        lock_rsb(r);
+        DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+        error = recover_locks_queue(r, &r->res_grantqueue);
+        if (!error)
+                error = recover_locks_queue(r, &r->res_convertqueue);
+        if (!error)
+                error = recover_locks_queue(r, &r->res_waitqueue);
+        if (!error) {
+                if (r->res_recover_locks_count)
+                        recover_list_add(r);
+                else
+                        rsb_clear_flag(r, RSB_NEW_MASTER);
+        }
+        unlock_rsb(r);
+        return error;
+}
+/*
+ * Walk the root list and, for every rsb flagged RSB_NEW_MASTER that we do not
+ * master ourselves, send our locks to the new master.  Rsb's we master just
+ * have RSB_NEW_MASTER cleared.  Afterwards wait until the recover list is
+ * empty.
+ *
+ * Returns 0 on success (and sets recover status DLM_RS_LOCKS), -EINTR if
+ * recovery was stopped, or the error from recover_locks() or
+ * dlm_wait_function().  The recover list is cleared on error.
+ */
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r;
+        int error = 0, count = 0;
+        log_debug(ls, "dlm_recover_locks");
+        down_read(&ls->ls_root_sem);
+        list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+                if (is_master(r)) {
+                        rsb_clear_flag(r, RSB_NEW_MASTER);
+                        continue;
+                }
+                if (!rsb_flag(r, RSB_NEW_MASTER))
+                        continue;
+                if (dlm_recovery_stopped(ls)) {
+                        error = -EINTR;
+                        break;
+                }
+                error = recover_locks(r);
+                if (error)
+                        break;
+                count += r->res_recover_locks_count;
+        }
+        up_read(&ls->ls_root_sem);
+        if (error)
+                goto out;
+        log_debug(ls, "dlm_recover_locks %d locks", count);
+        error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+        if (error)
+                recover_list_clear(ls);
+        else
+                dlm_set_recover_status(ls, DLM_RS_LOCKS);
+        return error;
+}
+/*
+ * Account for one recovered lock on rsb 'r', which must be flagged
+ * RSB_NEW_MASTER.  When res_recover_locks_count drops to zero the flag is
+ * cleared and the rsb is taken off the recover list.  If the recover list is
+ * empty afterwards, waiters on ls_wait_general are woken.
+ */
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+        struct dlm_ls *ls = r->res_ls;
+        DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+        r->res_recover_locks_count--;
+        if (r->res_recover_locks_count == 0) {
+                rsb_clear_flag(r, RSB_NEW_MASTER);
+                recover_list_del(r);
+        }
+        if (recover_list_empty(ls))
+                wake_up(&ls->ls_wait_general);
+}
+/*
+ * Scan one lock queue on behalf of recover_lvb().
+ *
+ * Only lkb's with DLM_LKF_VALBLK set are considered; *lock_lvb_exists is set
+ * to 1 as soon as one is seen.  The scan stops at, and returns, the first
+ * such lkb whose granted mode is above CR.  Until then *high_lkb/*high_seq
+ * track the lkb with the largest lvb sequence number seen so far.
+ *
+ * Returns NULL if the queue holds no lkb with a mode above CR.
+ */
+static struct dlm_lkb *recover_lvb_scan(struct list_head *queue,
+                                        int *lock_lvb_exists,
+                                        struct dlm_lkb **high_lkb,
+                                        uint32_t *high_seq)
+{
+        struct dlm_lkb *lkb;
+        list_for_each_entry(lkb, queue, lkb_statequeue) {
+                if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+                        continue;
+                *lock_lvb_exists = 1;
+                if (lkb->lkb_grmode > DLM_LOCK_CR)
+                        return lkb;
+                /*
+                 * Wrap-around sequence comparison.  The subtraction is done
+                 * on unsigned values and only the result is viewed as
+                 * signed, so it cannot overflow a signed int.
+                 */
+                if ((int)((uint32_t)lkb->lkb_lvbseq - *high_seq) >= 0) {
+                        *high_lkb = lkb;
+                        *high_seq = lkb->lkb_lvbseq;
+                }
+        }
+        return NULL;
+}
+/*
+ * The lvb needs to be recovered on all master rsb's.  This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
+ * was already set prior to recovery, it's not cleared, regardless of locks.
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
+ * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ *
+ * The grant queue is scanned first; the convert queue is scanned only if the
+ * grant queue held no lkb with a mode above CR.
+ */
+static void recover_lvb(struct dlm_rsb *r)
+{
+        struct dlm_lkb *big_lkb, *high_lkb = NULL;
+        uint32_t high_seq = 0;
+        int lock_lvb_exists = 0;
+        int lvblen = r->res_ls->ls_lvblen;
+        big_lkb = recover_lvb_scan(&r->res_grantqueue, &lock_lvb_exists,
+                                   &high_lkb, &high_seq);
+        if (!big_lkb)
+                big_lkb = recover_lvb_scan(&r->res_convertqueue,
+                                           &lock_lvb_exists,
+                                           &high_lkb, &high_seq);
+        /* no lock on the rsb uses an lvb: nothing to recover */
+        if (!lock_lvb_exists)
+                return;
+        if (!big_lkb)
+                rsb_set_flag(r, RSB_VALNOTVALID);
+        /* don't mess with the lvb unless we're the new master */
+        if (!rsb_flag(r, RSB_NEW_MASTER2))
+                return;
+        if (!r->res_lvbptr) {
+                r->res_lvbptr = allocate_lvb(r->res_ls);
+                /* allocation failed: leave the rsb without an lvb */
+                if (!r->res_lvbptr)
+                        return;
+        }
+        if (big_lkb) {
+                r->res_lvbseq = big_lkb->lkb_lvbseq;
+                memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
+        } else if (high_lkb) {
+                r->res_lvbseq = high_lkb->lkb_lvbseq;
+                memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+        } else {
+                r->res_lvbseq = 0;
+                memset(r->res_lvbptr, 0, lvblen);
+        }
+}
+/*
+ * All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
+ * converting PR->CW or CW->PR need to have their lkb_grmode set.
+ *
+ * Every lkb on the convert queue whose granted mode is DLM_LOCK_IV is given
+ * the granted mode of the first PR or CW lock found on the grant queue, or
+ * its own requested mode when the grant queue has no such lock.
+ */
+static void recover_conversion(struct dlm_rsb *r)
+{
+        struct dlm_lkb *lkb;
+        int granted = -1;
+        list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+                if (lkb->lkb_grmode != DLM_LOCK_PR &&
+                    lkb->lkb_grmode != DLM_LOCK_CW)
+                        continue;
+                granted = lkb->lkb_grmode;
+                break;
+        }
+        list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+                if (lkb->lkb_grmode != DLM_LOCK_IV)
+                        continue;
+                lkb->lkb_grmode = (granted == -1) ? lkb->lkb_rqmode : granted;
+        }
+}
+/*
+ * We've become the new master for this rsb and waiting/converting locks may
+ * need to be granted in dlm_grant_after_purge() due to locks that may have
+ * existed from a removed node.  RSB_LOCKS_PURGED is set only when the wait
+ * queue or the convert queue is non-empty.
+ */
+static void set_locks_purged(struct dlm_rsb *r)
+{
+        if (list_empty(&r->res_waitqueue) && list_empty(&r->res_convertqueue))
+                return;
+        rsb_set_flag(r, RSB_LOCKS_PURGED);
+}
+/*
+ * Final pass over the root list.  For each rsb we master (with the rsb
+ * locked): resolve conversions if RSB_RECOVER_CONVERT is set, flag purged
+ * locks if RSB_NEW_MASTER2 is set, and recover the lvb.  Both flags are then
+ * cleared on every rsb, whether we master it or not.
+ */
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+        struct dlm_rsb *rsb;
+        int nr_master = 0;
+        log_debug(ls, "dlm_recover_rsbs");
+        down_read(&ls->ls_root_sem);
+        list_for_each_entry(rsb, &ls->ls_root_list, res_root_list) {
+                lock_rsb(rsb);
+                if (is_master(rsb)) {
+                        if (rsb_flag(rsb, RSB_RECOVER_CONVERT))
+                                recover_conversion(rsb);
+                        if (rsb_flag(rsb, RSB_NEW_MASTER2))
+                                set_locks_purged(rsb);
+                        recover_lvb(rsb);
+                        nr_master++;
+                }
+                /* the flags are only meaningful for this recovery pass */
+                rsb_clear_flag(rsb, RSB_RECOVER_CONVERT);
+                rsb_clear_flag(rsb, RSB_NEW_MASTER2);
+                unlock_rsb(rsb);
+        }
+        up_read(&ls->ls_root_sem);
+        log_debug(ls, "dlm_recover_rsbs %d rsbs", nr_master);
+}
+/*
+ * Create a single list of all root rsb's to be used during recovery.
+ * Every rsb on each hash bucket's list is added to ls_root_list and
+ * dlm_hold_rsb() is called on it.  ls_root_sem is held for writing.
+ *
+ * Returns 0, or -EINVAL (after logging) if the root list is not empty.
+ */
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+        struct dlm_rsb *r;
+        int bucket, error = -EINVAL;
+        down_write(&ls->ls_root_sem);
+        if (!list_empty(&ls->ls_root_list)) {
+                log_error(ls, "root list not empty");
+                goto out;
+        }
+        for (bucket = 0; bucket < ls->ls_rsbtbl_size; bucket++) {
+                read_lock(&ls->ls_rsbtbl[bucket].lock);
+                list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
+                                    res_hashchain) {
+                        list_add(&r->res_root_list, &ls->ls_root_list);
+                        dlm_hold_rsb(r);
+                }
+                read_unlock(&ls->ls_rsbtbl[bucket].lock);
+        }
+        error = 0;
+ out:
+        up_write(&ls->ls_root_sem);
+        return error;
+}
+/*
+ * Undo dlm_create_root_list(): with ls_root_sem held for writing, unlink
+ * every rsb from ls_root_list and call dlm_put_rsb() on it.
+ */
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+        struct dlm_rsb *rsb, *next;
+        down_write(&ls->ls_root_sem);
+        list_for_each_entry_safe(rsb, next, &ls->ls_root_list,
+                                 res_root_list) {
+                list_del_init(&rsb->res_root_list);
+                dlm_put_rsb(rsb);
+        }
+        up_write(&ls->ls_root_sem);
+}
+/*
+ * Free every rsb on the toss list of every hash bucket.  Each bucket is
+ * processed with its lock held for writing.
+ */
+void dlm_clear_toss_list(struct dlm_ls *ls)
+{
+        struct dlm_rsb *rsb, *next;
+        int bucket;
+        for (bucket = 0; bucket < ls->ls_rsbtbl_size; bucket++) {
+                write_lock(&ls->ls_rsbtbl[bucket].lock);
+                list_for_each_entry_safe(rsb, next,
+                                         &ls->ls_rsbtbl[bucket].toss,
+                                         res_hashchain) {
+                        list_del(&rsb->res_hashchain);
+                        free_rsb(rsb);
+                }
+                write_unlock(&ls->ls_rsbtbl[bucket].lock);
+        }
+}
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+int dlm_recover_masters(struct dlm_ls *ls); /* look up new masters for rsb's; 0 or error */
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); /* handle a master lookup reply; always 0 */
+int dlm_recover_locks(struct dlm_ls *ls); /* send our locks on remastered rsb's; 0 or error */
+void dlm_recovered_lock(struct dlm_rsb *r); /* account one recovered lock on r */
+int dlm_create_root_list(struct dlm_ls *ls); /* fill ls_root_list; -EINVAL if not empty */
+void dlm_release_root_list(struct dlm_ls *ls); /* empty ls_root_list */
+void dlm_clear_toss_list(struct dlm_ls *ls); /* free all rsb's on the toss lists */
+void dlm_recover_rsbs(struct dlm_ls *ls); /* finalize master rsb's after lock recovery */
+#endif                          /* __RECOVER_DOT_H__ */
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+/*
+ * If the start for which we're re-enabling locking (seq) has been superseded
+ * by a newer stop (ls_recover_seq), we need to leave locking disabled.
+ *
+ * Under ls_recover_lock: when seq still matches ls_recover_seq, LSFL_RUNNING
+ * is set, ls_in_recovery is released and 0 is returned; otherwise nothing is
+ * changed and -EINTR is returned.
+ */
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+        int error = 0;
+        spin_lock(&ls->ls_recover_lock);
+        if (ls->ls_recover_seq != seq) {
+                error = -EINTR;
+        } else {
+                set_bit(LSFL_RUNNING, &ls->ls_flags);
+                up_write(&ls->ls_in_recovery);
+        }
+        spin_unlock(&ls->ls_recover_lock);
+        return error;
+}
+/*
+ * Run one complete recovery pass for lockspace 'ls' using the arguments in
+ * 'rv'.  ls_recoverd_active is held for the whole pass.
+ *
+ * The steps run strictly in order; the first one to fail aborts the pass.
+ * Master and lock recovery (purge, remaster, resend locks, finalize rsb's)
+ * only runs when dlm_recover_members() reported 'neg' or the lockspace has
+ * no directory.
+ *
+ * Returns 0 on success, otherwise the error of the failing step.  On failure
+ * the root list is released before returning.
+ */
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+        unsigned long start;
+        int error, neg = 0;
+        /* rv->seq is cast for %llx, as is done for rc_id elsewhere */
+        log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
+        mutex_lock(&ls->ls_recoverd_active);
+        /*
+         * Suspending and resuming dlm_astd ensures that no lkb's from this ls
+         * will be processed by dlm_astd during recovery.
+         */
+        dlm_astd_suspend();
+        dlm_astd_resume();
+        /*
+         * This list of root rsb's will be the basis of most of the recovery
+         * routines.
+         * NOTE(review): the return value of dlm_create_root_list() is not
+         * checked here.
+         */
+        dlm_create_root_list(ls);
+        /*
+         * Free all the tossed rsb's so we don't have to recover them.
+         */
+        dlm_clear_toss_list(ls);
+        /*
+         * Add or remove nodes from the lockspace's ls_nodes list.
+         * Also waits for all nodes to complete dlm_recover_members.
+         */
+        error = dlm_recover_members(ls, rv, &neg);
+        if (error) {
+                log_error(ls, "recover_members failed %d", error);
+                goto fail;
+        }
+        /* 'start' is only read by the success log message at the end */
+        start = jiffies;
+        /*
+         * Rebuild our own share of the directory by collecting from all other
+         * nodes their master rsb names that hash to us.
+         */
+        error = dlm_recover_directory(ls);
+        if (error) {
+                log_error(ls, "recover_directory failed %d", error);
+                goto fail;
+        }
+        /*
+         * Purge directory-related requests that are saved in requestqueue.
+         * All dir requests from before recovery are invalid now due to the dir
+         * rebuild and will be resent by the requesting nodes.
+         */
+        dlm_purge_requestqueue(ls);
+        /*
+         * Wait for all nodes to complete directory rebuild.
+         */
+        error = dlm_recover_directory_wait(ls);
+        if (error) {
+                log_error(ls, "recover_directory_wait failed %d", error);
+                goto fail;
+        }
+        /*
+         * We may have outstanding operations that are waiting for a reply from
+         * a failed node.  Mark these to be resent after recovery.  Unlock and
+         * cancel ops can just be completed.
+         */
+        dlm_recover_waiters_pre(ls);
+        /* a non-zero result from dlm_recovery_stopped() aborts the pass */
+        error = dlm_recovery_stopped(ls);
+        if (error)
+                goto fail;
+        if (neg || dlm_no_directory(ls)) {
+                /*
+                 * Clear lkb's for departed nodes.
+                 */
+                dlm_purge_locks(ls);
+                /*
+                 * Get new master nodeid's for rsb's that were mastered on
+                 * departed nodes.
+                 */
+                error = dlm_recover_masters(ls);
+                if (error) {
+                        log_error(ls, "recover_masters failed %d", error);
+                        goto fail;
+                }
+                /*
+                 * Send our locks on remastered rsb's to the new masters.
+                 */
+                error = dlm_recover_locks(ls);
+                if (error) {
+                        log_error(ls, "recover_locks failed %d", error);
+                        goto fail;
+                }
+                error = dlm_recover_locks_wait(ls);
+                if (error) {
+                        log_error(ls, "recover_locks_wait failed %d", error);
+                        goto fail;
+                }
+                /*
+                 * Finalize state in master rsb's now that all locks can be
+                 * checked.  This includes conversion resolution and lvb
+                 * settings.
+                 */
+                dlm_recover_rsbs(ls);
+        }
+        dlm_release_root_list(ls);
+        dlm_set_recover_status(ls, DLM_RS_DONE);
+        error = dlm_recover_done_wait(ls);
+        if (error) {
+                log_error(ls, "recover_done_wait failed %d", error);
+                goto fail;
+        }
+        dlm_clear_members_gone(ls);
+        error = enable_locking(ls, rv->seq);
+        if (error) {
+                log_error(ls, "enable_locking failed %d", error);
+                goto fail;
+        }
+        error = dlm_process_requestqueue(ls);
+        if (error) {
+                log_error(ls, "process_requestqueue failed %d", error);
+                goto fail;
+        }
+        error = dlm_recover_waiters_post(ls);
+        if (error) {
+                log_error(ls, "recover_waiters_post failed %d", error);
+                goto fail;
+        }
+        dlm_grant_after_purge(ls);
+        dlm_astd_wake();
+        log_debug(ls, "recover %llx done: %u ms",
+                  (unsigned long long)rv->seq,
+                  jiffies_to_msecs(jiffies - start));
+        mutex_unlock(&ls->ls_recoverd_active);
+        return 0;
+ fail:
+        /* the root list may already have been released on this path */
+        dlm_release_root_list(ls);
+        log_debug(ls, "recover %llx error %d",
+                  (unsigned long long)rv->seq, error);
+        mutex_unlock(&ls->ls_recoverd_active);
+        return error;
+}
+/*
+ * Take the pending recovery arguments off the lockspace (under
+ * ls_recover_lock, also clearing LSFL_RECOVERY_STOP) and, if there were any,
+ * run ls_recover() with them.  The arguments and their nodeids array are
+ * freed afterwards; the result of ls_recover() is not used.
+ */
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+        struct dlm_recover *args;
+        spin_lock(&ls->ls_recover_lock);
+        args = ls->ls_recover_args;
+        ls->ls_recover_args = NULL;
+        clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+        spin_unlock(&ls->ls_recover_lock);
+        if (!args)
+                return;
+        ls_recover(ls, args);
+        kfree(args->nodeids);
+        kfree(args);
+}
+/*
+ * Main function of the recovery kthread.  'arg' is passed to
+ * dlm_find_lockspace_local() to obtain the lockspace; if none is found the
+ * thread logs that and exits with -1.
+ *
+ * Until asked to stop, the thread sleeps interruptibly unless LSFL_WORK is
+ * already set, and runs do_ls_recovery() each time it finds (and clears)
+ * LSFL_WORK.  Returns 0 after calling dlm_put_lockspace().
+ */
+static int dlm_recoverd(void *arg)
+{
+        struct dlm_ls *ls = dlm_find_lockspace_local(arg);
+        if (!ls) {
+                log_print("dlm_recoverd: no lockspace %p", arg);
+                return -1;
+        }
+        while (!kthread_should_stop()) {
+                /* state is set before the test so a wakeup is not lost */
+                set_current_state(TASK_INTERRUPTIBLE);
+                if (!test_bit(LSFL_WORK, &ls->ls_flags))
+                        schedule();
+                set_current_state(TASK_RUNNING);
+                if (!test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+                        continue;
+                do_ls_recovery(ls);
+        }
+        dlm_put_lockspace(ls);
+        return 0;
+}
+/*
+ * Ask the recovery thread to run: set LSFL_WORK, then wake ls_recoverd_task
+ * so that dlm_recoverd() sees the flag.
+ */
+void dlm_recoverd_kick(struct dlm_ls *ls)
+{
+        struct task_struct *task = ls->ls_recoverd_task;
+        set_bit(LSFL_WORK, &ls->ls_flags);
+        wake_up_process(task);
+}
+/*
+ * Start the recovery kthread for 'ls' and remember it in ls_recoverd_task.
+ *
+ * Returns 0 on success or the error encoded in kthread_run()'s result, in
+ * which case ls_recoverd_task is left untouched.
+ */
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+        struct task_struct *task;
+        task = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        ls->ls_recoverd_task = task;
+        return 0;
+}
+/* Stop the recovery kthread recorded in ls_recoverd_task via kthread_stop(). */
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+        struct task_struct *task = ls->ls_recoverd_task;
+        kthread_stop(task);
+}
+/*
+ * Keep recovery from running: wake anything sleeping on ls_wait_general,
+ * then take ls_recoverd_active.  ls_recover() holds that mutex for a whole
+ * recovery pass, so this blocks until a pass in progress has returned.
+ * Must be paired with dlm_recoverd_resume().
+ */
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+        wake_up(&ls->ls_wait_general);
+        mutex_lock(&ls->ls_recoverd_active);
+        return;
+}
+/* Release ls_recoverd_active, which dlm_recoverd_suspend() acquired. */
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+        mutex_unlock(&ls->ls_recoverd_active);
+        return;
+}
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+void dlm_recoverd_kick(struct dlm_ls *ls); /* set LSFL_WORK and wake the recovery thread */
+void dlm_recoverd_stop(struct dlm_ls *ls); /* kthread_stop() the recovery thread */
+int dlm_recoverd_start(struct dlm_ls *ls); /* start the recovery thread; 0 or error */
+void dlm_recoverd_suspend(struct dlm_ls *ls); /* takes ls_recoverd_active */
+void dlm_recoverd_resume(struct dlm_ls *ls); /* releases ls_recoverd_active */
+#endif                          /* __RECOVERD_DOT_H__ */
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+struct rq_entry { /* one message saved on ls_requestqueue */
+        struct list_head list; /* link on ls->ls_requestqueue */
+        int nodeid; /* nodeid given to dlm_add_requestqueue() */
+        char request[1]; /* copy of the message; allocated with h_length extra bytes */
+};
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete.  This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ *
+ * hd->h_length bytes starting at 'hd' are copied into a new entry that is
+ * appended to ls_requestqueue under ls_requestqueue_mutex.  Messages from a
+ * node for which dlm_is_removed() is true are ignored.  If the entry cannot
+ * be allocated, the message is dropped after logging; the caller is not
+ * told.
+ */
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+{
+        struct rq_entry *e;
+        int length = hd->h_length;
+        if (dlm_is_removed(ls, nodeid))
+                return;
+        e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+        if (!e) {
+                /* no trailing newline, like every other log call here */
+                log_print("dlm_add_requestqueue: out of memory");
+                return;
+        }
+        e->nodeid = nodeid;
+        memcpy(e->request, hd, length);
+        mutex_lock(&ls->ls_requestqueue_mutex);
+        list_add_tail(&e->list, &ls->ls_requestqueue);
+        mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+/*
+ * Process the saved requests in queue order.  The mutex is dropped while a
+ * message is handed to dlm_receive_message() and retaken to unlink and free
+ * the entry.
+ *
+ * Returns 0 once the queue is empty.  Returns -EINTR if
+ * dlm_receive_message() returned -EINTR (that entry stays on the queue) or
+ * if dlm_locking_stopped() is true after an entry has been processed.
+ * ls_requestqueue_mutex is not held on return.
+ */
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+        struct rq_entry *entry;
+        int error;
+        mutex_lock(&ls->ls_requestqueue_mutex);
+        while (!list_empty(&ls->ls_requestqueue)) {
+                entry = list_entry(ls->ls_requestqueue.next, struct rq_entry,
+                                   list);
+                mutex_unlock(&ls->ls_requestqueue_mutex);
+                error = dlm_receive_message((struct dlm_header *) entry->request,
+                                            entry->nodeid, 1);
+                if (error == -EINTR) {
+                        /* entry is left on requestqueue */
+                        log_debug(ls, "process_requestqueue abort eintr");
+                        return -EINTR;
+                }
+                mutex_lock(&ls->ls_requestqueue_mutex);
+                list_del(&entry->list);
+                kfree(entry);
+                if (dlm_locking_stopped(ls)) {
+                        log_debug(ls, "process_requestqueue abort running");
+                        mutex_unlock(&ls->ls_requestqueue_mutex);
+                        return -EINTR;
+                }
+                schedule();
+        }
+        mutex_unlock(&ls->ls_requestqueue_mutex);
+        return 0;
+}
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recvd.  At
+ * the same time, dlm_recvd will start receiving new requests from remote
+ * nodes.  We want to delay dlm_recvd processing new requests until
+ * dlm_recoverd has finished processing the old saved requests.
+ *
+ * Returns once the request queue is empty or dlm_locking_stopped() is true.
+ * Both are checked under ls_requestqueue_mutex, which is dropped around each
+ * schedule().
+ */
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+        mutex_lock(&ls->ls_requestqueue_mutex);
+        while (!list_empty(&ls->ls_requestqueue) &&
+               !dlm_locking_stopped(ls)) {
+                mutex_unlock(&ls->ls_requestqueue_mutex);
+                schedule();
+                mutex_lock(&ls->ls_requestqueue_mutex);
+        }
+        mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+/*
+ * Decide whether a saved message must be discarded during recovery.
+ * 'nodeid' is the nodeid stored with the saved message.
+ *
+ * Returns 1 if the message should be purged, 0 if it should be kept.
+ */
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+        uint32_t msg_type = ms->m_type;
+        if (dlm_is_removed(ls, nodeid))
+                return 1;
+        /* directory operations are always purged because the directory is
+           always rebuilt during recovery and the lookups resent */
+        switch (msg_type) {
+        case DLM_MSG_REMOVE:
+        case DLM_MSG_LOOKUP:
+        case DLM_MSG_LOOKUP_REPLY:
+                return 1;
+        default:
+                break;
+        }
+        if (!dlm_no_directory(ls))
+                return 0;
+        /* with no directory, the master is likely to change as a part of
+           recovery; requests to/from the defunct master need to be purged */
+        switch (msg_type) {
+        case DLM_MSG_REQUEST:
+        case DLM_MSG_CONVERT:
+        case DLM_MSG_UNLOCK:
+        case DLM_MSG_CANCEL:
+                /* we're no longer the master of this resource, the sender
+                   will resend to the new master (see waiter_needs_recovery) */
+                return dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid();
+        case DLM_MSG_REQUEST_REPLY:
+        case DLM_MSG_CONVERT_REPLY:
+        case DLM_MSG_UNLOCK_REPLY:
+        case DLM_MSG_CANCEL_REPLY:
+        case DLM_MSG_GRANT:
+                /* this reply is from the former master of the resource,
+                   we'll resend to the new master if needed */
+                return dlm_hash2nodeid(ls, ms->m_hash) != nodeid;
+        default:
+                return 0;
+        }
+}
+/*
+ * Remove and free every saved request for which purge_request() returns
+ * non-zero.  The whole walk runs under ls_requestqueue_mutex.
+ */
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+        struct rq_entry *entry, *next;
+        mutex_lock(&ls->ls_requestqueue_mutex);
+        list_for_each_entry_safe(entry, next, &ls->ls_requestqueue, list) {
+                if (!purge_request(ls, (struct dlm_message *) entry->request,
+                                   entry->nodeid))
+                        continue;
+                list_del(&entry->list);
+                kfree(entry);
+        }
+        mutex_unlock(&ls->ls_requestqueue_mutex);
+}
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+/* Interface to the per-lockspace queue of saved messages
+   (ls->ls_requestqueue). */
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+/* frees the queued entries that purge_request() selects */
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+#endif /* __REQUESTQUEUE_DOT_H__ */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+/* prefix of per-lockspace misc device names, which are built as
+   "<name_prefix>_<lockspace name>" in device_create_lockspace() */
+static const char *name_prefix="dlm";
+/* the control device, registered in dlm_user_init() */
+static struct miscdevice ctl_device;
+/* forward declaration; the initializer follows the handlers further down */
+static struct file_operations device_fops;
+#ifdef CONFIG_COMPAT
+/* Lock request parameters as written by a 32-bit caller.  The fields that
+   compat_input() widens to pointers (castparam, castaddr, bastparam,
+   bastaddr, lksb) are carried here as __u32.
+   NOTE(review): presumably mirrors struct dlm_lock_params field for field;
+   confirm against linux/dlm_device.h. */
+struct dlm_lock_params32 {
+        __u8 mode;
+        __u8 namelen;
+        __u16 flags;
+        __u32 lkid;
+        __u32 parent;
+        __u32 castparam;
+        __u32 castaddr;
+        __u32 bastparam;
+        __u32 bastaddr;
+        __u32 lksb;
+        char lvb[DLM_USER_LVB_LEN];
+        /* resource name bytes follow the fixed part; namelen gives the count */
+        char name[0];
+};
+/* A write() request as laid out by a 32-bit caller.  cmd selects which
+   member of the union is meaningful; see compat_input(). */
+struct dlm_write_request32 {
+        __u32 version[3];
+        __u8 cmd;
+        /* zero for 32-bit callers; device_write() converts when it is zero */
+        __u8 is64bit;
+        __u8 unused[2];
+        union  {
+                struct dlm_lock_params32 lock;
+                /* lockspace params are used unconverted for both ABIs */
+                struct dlm_lspace_params lspace;
+        } i;
+};
+/* 32-bit form of the lock status block returned to userspace; sb_lvbptr
+   is a user pointer narrowed to __u32 by compat_output().
+   NOTE(review): there is likely padding between sb_flags and sb_lvbptr on
+   common ABIs, so an instance must be zeroed before it is copied to
+   userspace. */
+struct dlm_lksb32 {
+        __u32 sb_status;
+        __u32 sb_lkid;
+        __u8 sb_flags;
+        __u32 sb_lvbptr;
+};
+/* 32-bit form of the ast result that device_read() hands back; filled in
+   from a struct dlm_lock_result by compat_output(). */
+struct dlm_lock_result32 {
+        __u32 length;
+        __u32 user_astaddr;
+        __u32 user_astparam;
+        __u32 user_lksb;
+        struct dlm_lksb32 lksb;
+        __u8 bast_mode;
+        __u8 unused[3];
+        /* Offsets may be zero if no data is present */
+        __u32 lvb_offset;
+};
+/* Expand a request written by a 32-bit caller (kb32) into the native
+   layout (kb).  The version, cmd and is64bit fields are copied as they
+   are; the union is converted according to cmd: lockspace commands copy
+   the lspace params, everything else is treated as lock params with the
+   __u32 user pointers widened to void *.
+   NOTE(review): nothing here bounds the two variable-length copies.  The
+   strcpy() depends on kb32's lockspace name being NUL terminated inside
+   its buffer, and the name memcpy() trusts the caller-supplied namelen.
+   Both buffers must have been sized and validated by the caller. */
+static void compat_input(struct dlm_write_request *kb,
+                         struct dlm_write_request32 *kb32)
+{
+        kb->version[0] = kb32->version[0];
+        kb->version[1] = kb32->version[1];
+        kb->version[2] = kb32->version[2];
+        kb->cmd = kb32->cmd;
+        kb->is64bit = kb32->is64bit;
+        if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
+            kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
+                kb->i.lspace.flags = kb32->i.lspace.flags;
+                kb->i.lspace.minor = kb32->i.lspace.minor;
+                strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+        } else {
+                kb->i.lock.mode = kb32->i.lock.mode;
+                kb->i.lock.namelen = kb32->i.lock.namelen;
+                kb->i.lock.flags = kb32->i.lock.flags;
+                kb->i.lock.lkid = kb32->i.lock.lkid;
+                kb->i.lock.parent = kb32->i.lock.parent;
+                /* go through long so the 32-bit value is widened before it
+                   becomes a pointer */
+                kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
+                kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
+                kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
+                kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
+                kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+                memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+                memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+        }
+}
+/* Build the 32-bit form (res32) of an ast result from the native one (res).
+   User pointers are narrowed to __u32; all other fields are copied as
+   they are.  res32 is fully initialized on return. */
+static void compat_output(struct dlm_lock_result *res,
+                          struct dlm_lock_result32 *res32)
+{
+        /* the caller copies the whole struct to userspace from an
+           uninitialized local, so clear it first: unused[] and any
+           padding must not leak kernel stack contents */
+        memset(res32, 0, sizeof(*res32));
+        res32->user_astaddr = (__u32)(long)res->user_astaddr;
+        res32->user_astparam = (__u32)(long)res->user_astparam;
+        res32->user_lksb = (__u32)(long)res->user_lksb;
+        res32->bast_mode = res->bast_mode;
+        /* copy_result_to_user() already computes length and lvb_offset
+           against the 32-bit struct size for compat callers, so both are
+           passed through without adjustment */
+        res32->lvb_offset = res->lvb_offset;
+        res32->length = res->length;
+        res32->lksb.sb_status = res->lksb.sb_status;
+        res32->lksb.sb_flags = res->lksb.sb_flags;
+        res32->lksb.sb_lkid = res->lksb.sb_lkid;
+        res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+/* Queue an ast of the given type (AST_COMP or AST_BAST) for delivery to
+   the user process that owns lkb.  The lkb is added to the owning proc's
+   asts list (taking a reference for the list) unless it is already
+   queued, and any reader sleeping in device_read() is woken.  Nothing is
+   done if the lkb is orphaned/dead, or if a blocking ast is requested but
+   the user registered no bast address. */
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+{
+        struct dlm_ls *ls;
+        struct dlm_user_args *ua;
+        struct dlm_user_proc *proc;
+        int remove_ownqueue = 0;
+        int ast_type;
+        /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
+           lkb before dealing with it.  We need to check this
+           flag before taking ls_clear_proc_locks mutex because if
+           it's set, dlm_clear_proc_locks() holds the mutex. */
+        if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+                /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+                return;
+        }
+        ls = lkb->lkb_resource->res_ls;
+        mutex_lock(&ls->ls_clear_proc_locks);
+        /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+           can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
+           lkb->ua so we can't try to use it. */
+        if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+                /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+                goto out;
+        }
+        DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
+        ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        proc = ua->proc;
+        if (type == AST_BAST && ua->bastaddr == NULL)
+                goto out;
+        spin_lock(&proc->asts_spin);
+        /* Always record the new ast type.  Only the queueing (and the
+           reference held for the list) is skipped when the lkb is already
+           on proc->asts for another pending ast; otherwise a bast arriving
+           while a cast is still unread (or vice versa) would be lost. */
+        ast_type = lkb->lkb_ast_type;
+        lkb->lkb_ast_type |= type;
+        if (!(ast_type & (AST_COMP | AST_BAST))) {
+                kref_get(&lkb->lkb_ref);
+                list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+                wake_up_interruptible(&proc->wait);
+        }
+        /* noqueue requests that fail may need to be removed from the
+           proc's locks list, there should be a better way of detecting
+           this situation than checking all these things... */
+        if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
+            ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
+                remove_ownqueue = 1;
+        /* We want to copy the lvb to userspace when the completion
+           ast is read if the status is 0, the lock has an lvb and
+           lvb_ops says we should.  We could probably have set_lvb_lock()
+           set update_user_lvb instead and not need old_mode */
+        if ((lkb->lkb_ast_type & AST_COMP) &&
+            (lkb->lkb_lksb->sb_status == 0) &&
+            lkb->lkb_lksb->sb_lvbptr &&
+            dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
+                ua->update_user_lvb = 1;
+        else
+                ua->update_user_lvb = 0;
+        spin_unlock(&proc->asts_spin);
+        if (remove_ownqueue) {
+                /* locks_spin is taken only after asts_spin is dropped */
+                spin_lock(&ua->proc->locks_spin);
+                list_del_init(&lkb->lkb_ownqueue);
+                spin_unlock(&ua->proc->locks_spin);
+                dlm_put_lkb(lkb);
+        }
+ out:
+        mutex_unlock(&ls->ls_clear_proc_locks);
+}
+/* Handle a DLM_USER_LOCK write: either a new request or, when
+   DLM_LKF_CONVERT is set, a conversion of an existing lock.  Returns the
+   new lock id for a successful request, the result of dlm_user_convert()
+   for a conversion, or a negative errno (-ENOENT no lockspace, -EINVAL
+   missing castaddr/lksb, -ENOMEM). */
+static int device_user_lock(struct dlm_user_proc *proc,
+                            struct dlm_lock_params *params)
+{
+        struct dlm_user_args *ua;
+        struct dlm_ls *ls;
+        int error;
+        ls = dlm_find_lockspace_local(proc->lockspace);
+        if (!ls)
+                return -ENOENT;
+        /* a completion ast address and an lksb are mandatory */
+        error = -EINVAL;
+        if (!params->castaddr || !params->lksb)
+                goto out;
+        error = -ENOMEM;
+        ua = kzalloc(sizeof(*ua), GFP_KERNEL);
+        if (!ua)
+                goto out;
+        ua->proc = proc;
+        ua->user_lksb = params->lksb;
+        ua->castaddr = params->castaddr;
+        ua->castparam = params->castparam;
+        ua->bastaddr = params->bastaddr;
+        ua->bastparam = params->bastparam;
+        if (!(params->flags & DLM_LKF_CONVERT)) {
+                error = dlm_user_request(ls, ua,
+                                         params->mode, params->flags,
+                                         params->name, params->namelen,
+                                         params->parent);
+                /* on success the write() returns the new lock id */
+                if (!error)
+                        error = ua->lksb.sb_lkid;
+        } else {
+                error = dlm_user_convert(ls, ua,
+                                         params->mode, params->flags,
+                                         params->lkid, params->lvb);
+        }
+ out:
+        dlm_put_lockspace(ls);
+        return error;
+}
+/* Handle a DLM_USER_UNLOCK write: cancel the lock when DLM_LKF_CANCEL is
+   set, otherwise unlock it.  Returns the result of dlm_user_cancel() or
+   dlm_user_unlock(), -ENOENT if the lockspace is gone, or -ENOMEM. */
+static int device_user_unlock(struct dlm_user_proc *proc,
+                              struct dlm_lock_params *params)
+{
+        struct dlm_user_args *ua;
+        struct dlm_ls *ls;
+        int error;
+        ls = dlm_find_lockspace_local(proc->lockspace);
+        if (!ls)
+                return -ENOENT;
+        ua = kzalloc(sizeof(*ua), GFP_KERNEL);
+        if (!ua) {
+                error = -ENOMEM;
+                goto out;
+        }
+        ua->proc = proc;
+        ua->user_lksb = params->lksb;
+        ua->castaddr = params->castaddr;
+        ua->castparam = params->castparam;
+        if (!(params->flags & DLM_LKF_CANCEL))
+                error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+                                        params->lvb);
+        else
+                error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+ out:
+        dlm_put_lockspace(ls);
+        return error;
+}
+/* Handle DLM_USER_CREATE_LOCKSPACE: create the lockspace and register a
+   misc device named "<name_prefix>_<lockspace name>" for it.  Requires
+   CAP_SYS_ADMIN.  Returns the minor number of the new device, or a
+   negative errno. */
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+        dlm_lockspace_t *lockspace;
+        struct dlm_ls *ls;
+        int error, len;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        error = dlm_new_lockspace(params->name, strlen(params->name),
+                                  &lockspace, 0, DLM_USER_LVB_LEN);
+        if (error)
+                return error;
+        ls = dlm_find_lockspace_local(lockspace);
+        if (!ls)
+                return -ENOENT;
+        /* prefix + '_' + name + terminating NUL */
+        len = strlen(params->name) + strlen(name_prefix) + 2;
+        ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+        if (!ls->ls_device.name) {
+                error = -ENOMEM;
+                goto fail;
+        }
+        snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
+                 params->name);
+        ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+        ls->ls_device.fops = &device_fops;
+        error = misc_register(&ls->ls_device);
+        if (error) {
+                kfree(ls->ls_device.name);
+                goto fail;
+        }
+        /* the dynamically assigned minor is the return value */
+        error = ls->ls_device.minor;
+        dlm_put_lockspace(ls);
+        return error;
+ fail:
+        /* drop our reference, then tear down the lockspace just created */
+        dlm_put_lockspace(ls);
+        dlm_release_lockspace(lockspace, 0);
+        return error;
+}
+/* Handle DLM_USER_REMOVE_LOCKSPACE: deregister the misc device of the
+   lockspace identified by params->minor and release the lockspace.
+   Requires CAP_SYS_ADMIN.  Returns 0 or a negative errno. */
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{
+        dlm_lockspace_t *lockspace;
+        struct dlm_ls *ls;
+        int error, force;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        ls = dlm_find_lockspace_device(params->minor);
+        if (!ls)
+                return -ENOENT;
+        error = misc_deregister(&ls->ls_device);
+        if (error) {
+                dlm_put_lockspace(ls);
+                return error;
+        }
+        kfree(ls->ls_device.name);
+        force = (params->flags & DLM_USER_LSFLG_FORCEFREE) ? 2 : 0;
+        lockspace = ls->ls_local_handle;
+        /* dlm_release_lockspace waits for references to go to zero,
+           so all processes will need to close their device for the ls
+           before the release will proceed */
+        dlm_put_lockspace(ls);
+        return dlm_release_lockspace(lockspace, force);
+}
+/* Check the user's version matches ours: the major number must be equal
+   and the user's minor must not be newer than the kernel's.  Returns 0 if
+   compatible, otherwise logs the mismatch and returns -EINVAL. */
+static int check_version(struct dlm_write_request *req)
+{
+        if (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+            req->version[1] <= DLM_DEVICE_VERSION_MINOR)
+                return 0;
+        printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+               "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+               current->comm,
+               current->pid,
+               req->version[0],
+               req->version[1],
+               req->version[2],
+               DLM_DEVICE_VERSION_MAJOR,
+               DLM_DEVICE_VERSION_MINOR,
+               DLM_DEVICE_VERSION_PATCH);
+        return -EINVAL;
+}
+/*
+ * device_write
+ *
+ *   device_user_lock
+ *     dlm_user_request -> request_lock
+ *     dlm_user_convert -> convert_lock
+ *
+ *   device_user_unlock
+ *     dlm_user_unlock -> unlock_lock
+ *     dlm_user_cancel -> cancel_lock
+ *
+ *   device_create_lockspace
+ *     dlm_new_lockspace
+ *
+ *   device_remove_lockspace
+ *     dlm_release_lockspace
+ */
+/* a write to a lockspace device is a lock or unlock request, a write
+   to the control device is to create/remove a lockspace.
+   The request is copied in, version checked, converted from the 32-bit
+   layout if the caller says it is not 64-bit, and dispatched on cmd with
+   all signals blocked.  Returns the handler's result (lock id or minor
+   number for the create-type commands) or a negative errno: -EINVAL for
+   a short/malformed request or a command sent to the wrong device,
+   -EFAULT, -ENOMEM, or -EBADE on version mismatch. */
+static ssize_t device_write(struct file *file, const char __user *buf,
+                            size_t count, loff_t *ppos)
+{
+        /* NULL when the write is on the control device */
+        struct dlm_user_proc *proc = file->private_data;
+        struct dlm_write_request *kbuf;
+        sigset_t tmpsig, allsigs;
+        int error;
+#ifdef CONFIG_COMPAT
+        if (count < sizeof(struct dlm_write_request32))
+#else
+        if (count < sizeof(struct dlm_write_request))
+#endif
+                return -EINVAL;
+        /* one extra zeroed byte guarantees the trailing name is NUL
+           terminated for the strlen()/strcpy() users further down */
+        kbuf = kzalloc(count + 1, GFP_KERNEL);
+        if (!kbuf)
+                return -ENOMEM;
+        if (copy_from_user(kbuf, buf, count)) {
+                error = -EFAULT;
+                goto out_free;
+        }
+        if (check_version(kbuf)) {
+                error = -EBADE;
+                goto out_free;
+        }
+#ifdef CONFIG_COMPAT
+        if (!kbuf->is64bit) {
+                struct dlm_write_request32 *k32buf;
+                k32buf = (struct dlm_write_request32 *)kbuf;
+                /* compat_input() copies i.lock.namelen name bytes for every
+                   non-lockspace command; refuse a length that exceeds what
+                   the caller supplied past the fixed part, which would
+                   overrun both the source and the destination buffer */
+                if (k32buf->cmd != DLM_USER_CREATE_LOCKSPACE &&
+                    k32buf->cmd != DLM_USER_REMOVE_LOCKSPACE &&
+                    k32buf->i.lock.namelen >
+                    count - sizeof(struct dlm_write_request32)) {
+                        error = -EINVAL;
+                        goto out_free;
+                }
+                kbuf = kzalloc(count + 1 + (sizeof(struct dlm_write_request) -
+                               sizeof(struct dlm_write_request32)), GFP_KERNEL);
+                if (!kbuf) {
+                        /* kbuf no longer points at the original buffer */
+                        kfree(k32buf);
+                        return -ENOMEM;
+                }
+                if (proc)
+                        set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+                compat_input(kbuf, k32buf);
+                kfree(k32buf);
+        } else if (count < sizeof(struct dlm_write_request)) {
+                /* claims the native layout but is too short to hold it */
+                error = -EINVAL;
+                goto out_free;
+        }
+#endif
+        /* do we really need this? can a write happen after a close? */
+        if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+            proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
+                error = -EINVAL;
+                goto out_free;
+        }
+        sigfillset(&allsigs);
+        sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+        error = -EINVAL;
+        switch (kbuf->cmd)
+        {
+        case DLM_USER_LOCK:
+                if (!proc) {
+                        log_print("no locking on control device");
+                        goto out_sig;
+                }
+                error = device_user_lock(proc, &kbuf->i.lock);
+                break;
+        case DLM_USER_UNLOCK:
+                if (!proc) {
+                        log_print("no locking on control device");
+                        goto out_sig;
+                }
+                error = device_user_unlock(proc, &kbuf->i.lock);
+                break;
+        case DLM_USER_CREATE_LOCKSPACE:
+                if (proc) {
+                        log_print("create/remove only on control device");
+                        goto out_sig;
+                }
+                error = device_create_lockspace(&kbuf->i.lspace);
+                break;
+        case DLM_USER_REMOVE_LOCKSPACE:
+                if (proc) {
+                        log_print("create/remove only on control device");
+                        goto out_sig;
+                }
+                error = device_remove_lockspace(&kbuf->i.lspace);
+                break;
+        default:
+                log_print("Unknown command passed to DLM device : %d\n",
+                          kbuf->cmd);
+        }
+ out_sig:
+        sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+        recalc_sigpending();
+ out_free:
+        kfree(kbuf);
+        return error;
+}
+/* Each open of a lockspace device gets its own "proc" structure, stored
+   in file->private_data.  It tracks the locks owned through this open and
+   the asts waiting to be read.  The lockspace reference taken here is
+   kept on success and dropped in device_close(). */
+static int device_open(struct inode *inode, struct file *file)
+{
+        struct dlm_ls *ls = dlm_find_lockspace_device(iminor(inode));
+        struct dlm_user_proc *proc;
+        if (!ls)
+                return -ENOENT;
+        proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+        if (proc == NULL) {
+                dlm_put_lockspace(ls);
+                return -ENOMEM;
+        }
+        spin_lock_init(&proc->asts_spin);
+        spin_lock_init(&proc->locks_spin);
+        INIT_LIST_HEAD(&proc->asts);
+        INIT_LIST_HEAD(&proc->locks);
+        init_waitqueue_head(&proc->wait);
+        proc->lockspace = ls->ls_local_handle;
+        file->private_data = proc;
+        return 0;
+}
+/* Release handler for a lockspace device: marks the proc as closing,
+   clears the locks it owns, frees it, and drops both the lockspace
+   reference taken here and the one held since device_open().  All signals
+   are blocked for the duration of the teardown.
+   NOTE(review): the early -ENOENT return leaves proc allocated and
+   file->private_data set. */
+static int device_close(struct inode *inode, struct file *file)
+{
+        struct dlm_user_proc *proc = file->private_data;
+        struct dlm_ls *ls;
+        sigset_t tmpsig, allsigs;
+        ls = dlm_find_lockspace_local(proc->lockspace);
+        if (!ls)
+                return -ENOENT;
+        sigfillset(&allsigs);
+        sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+        /* device_read()/device_write() test this flag and bail out */
+        set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+        dlm_clear_proc_locks(ls, proc);
+        /* at this point no more lkb's should exist for this lockspace,
+           so there's no chance of dlm_user_add_ast() being called and
+           looking for lkb->ua->proc */
+        kfree(proc);
+        file->private_data = NULL;
+        dlm_put_lockspace(ls);
+        dlm_put_lockspace(ls);  /* for the find in device_open() */
+        /* FIXME: AUTOFREE: if this ls is no longer used do
+           device_remove_lockspace() */
+        sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+        recalc_sigpending();
+        return 0;
+}
+/* Copy one ast result for ua into the user buffer: a dlm_lock_result
+   (or its 32-bit form when compat is set), followed by the lvb when it
+   was updated and the buffer has room for it.  type selects whether the
+   blocking or the completion ast address/param is reported; bmode is only
+   used for blocking asts.  Returns the number of bytes produced, or
+   -EFAULT. */
+static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
+                               int bmode, char __user *buf, size_t count)
+{
+#ifdef CONFIG_COMPAT
+        struct dlm_lock_result32 result32;
+#endif
+        struct dlm_lock_result result;
+        void *resultptr = &result;
+        int struct_len = sizeof(struct dlm_lock_result);
+        int len;
+        memset(&result, 0, sizeof(struct dlm_lock_result));
+        memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+        result.user_lksb = ua->user_lksb;
+        /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+           in a conversion unless the conversion is successful.  See code
+           in dlm_user_convert() for updating ua from ua_tmp.  OpenVMS, though,
+           notes that a new blocking AST address and parameter are set even if
+           the conversion fails, so maybe we should just do that. */
+        if (type != AST_BAST) {
+                result.user_astaddr = ua->castaddr;
+                result.user_astparam = ua->castparam;
+        } else {
+                result.user_astaddr = ua->bastaddr;
+                result.user_astparam = ua->bastparam;
+                result.bast_mode = bmode;
+        }
+#ifdef CONFIG_COMPAT
+        if (compat)
+                struct_len = sizeof(struct dlm_lock_result32);
+#endif
+        len = struct_len;
+        /* the lvb goes directly after the struct, but only if there is
+           one, it has been updated, and the user buffer can hold it */
+        if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
+            count >= len + DLM_USER_LVB_LEN) {
+                if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+                                 DLM_USER_LVB_LEN))
+                        return -EFAULT;
+                result.lvb_offset = len;
+                len += DLM_USER_LVB_LEN;
+        }
+        result.length = len;
+#ifdef CONFIG_COMPAT
+        if (compat) {
+                compat_output(&result, &result32);
+                resultptr = &result32;
+        }
+#endif
+        if (copy_to_user(buf, resultptr, struct_len))
+                return -EFAULT;
+        return len;
+}
+/* a read returns a single ast described in a struct dlm_lock_result.
+   Blocks (interruptibly) until an ast is queued unless the file is
+   O_NONBLOCK.  Returns the byte count from copy_result_to_user(), or
+   -EINVAL (buffer too small / proc closing), -EAGAIN (nothing queued),
+   -ERESTARTSYS (signal while waiting), -EFAULT. */
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+                           loff_t *ppos)
+{
+        struct dlm_user_proc *proc = file->private_data;
+        struct dlm_lkb *lkb;
+        struct dlm_user_args *ua;
+        DECLARE_WAITQUEUE(wait, current);
+        int error, type=0, bmode=0, removed = 0;
+#ifdef CONFIG_COMPAT
+        if (count < sizeof(struct dlm_lock_result32))
+#else
+        if (count < sizeof(struct dlm_lock_result))
+#endif
+                return -EINVAL;
+        /* do we really need this? can a read happen after a close? */
+        if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+                return -EINVAL;
+        spin_lock(&proc->asts_spin);
+        if (list_empty(&proc->asts)) {
+                if (file->f_flags & O_NONBLOCK) {
+                        spin_unlock(&proc->asts_spin);
+                        return -EAGAIN;
+                }
+                add_wait_queue(&proc->wait, &wait);
+        repeat:
+                /* the task state is set before the list is re-checked, and
+                   asts_spin is dropped only around schedule() */
+                set_current_state(TASK_INTERRUPTIBLE);
+                if (list_empty(&proc->asts) && !signal_pending(current)) {
+                        spin_unlock(&proc->asts_spin);
+                        schedule();
+                        spin_lock(&proc->asts_spin);
+                        goto repeat;
+                }
+                set_current_state(TASK_RUNNING);
+                remove_wait_queue(&proc->wait, &wait);
+                if (signal_pending(current)) {
+                        spin_unlock(&proc->asts_spin);
+                        return -ERESTARTSYS;
+                }
+        }
+        if (list_empty(&proc->asts)) {
+                spin_unlock(&proc->asts_spin);
+                return -EAGAIN;
+        }
+        /* there may be both completion and blocking asts to return for
+           the lkb, don't remove lkb from asts list unless no asts remain */
+        lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
+        /* a pending completion ast is reported before a blocking ast */
+        if (lkb->lkb_ast_type & AST_COMP) {
+                lkb->lkb_ast_type &= ~AST_COMP;
+                type = AST_COMP;
+        } else if (lkb->lkb_ast_type & AST_BAST) {
+                lkb->lkb_ast_type &= ~AST_BAST;
+                type = AST_BAST;
+                bmode = lkb->lkb_bastmode;
+        }
+        if (!lkb->lkb_ast_type) {
+                list_del(&lkb->lkb_astqueue);
+                removed = 1;
+        }
+        spin_unlock(&proc->asts_spin);
+        /* NOTE(review): lkb and ua are used below without asts_spin held;
+           confirm what keeps them alive when the lkb stays queued */
+        ua = (struct dlm_user_args *)lkb->lkb_astparam;
+        error = copy_result_to_user(ua,
+                                test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+                                type, bmode, buf, count);
+        /* removes reference for the proc->asts lists added by
+           dlm_user_add_ast() and may result in the lkb being freed */
+        if (removed)
+                dlm_put_lkb(lkb);
+        return error;
+}
+/* poll handler for a lockspace device: readable whenever at least one
+   ast is queued for this open file */
+static unsigned int device_poll(struct file *file, poll_table *wait)
+{
+        struct dlm_user_proc *proc = file->private_data;
+        unsigned int mask = 0;
+        poll_wait(file, &proc->wait, wait);
+        spin_lock(&proc->asts_spin);
+        if (!list_empty(&proc->asts))
+                mask = POLLIN | POLLRDNORM;
+        spin_unlock(&proc->asts_spin);
+        return mask;
+}
+/* Open handler for the control device.  private_data is left NULL, which
+   is how device_write() tells control-device writes apart from writes to
+   a lockspace device. */
+static int ctl_device_open(struct inode *inode, struct file *file)
+{
+        file->private_data = NULL;
+        return 0;
+}
+/* Release handler for the control device; there is no per-open state to
+   tear down. */
+static int ctl_device_close(struct inode *inode, struct file *file)
+{
+        return 0;
+}
+/* file operations for the per-lockspace devices registered by
+   device_create_lockspace() */
+static struct file_operations device_fops = {
+        .open    = device_open,
+        .release = device_close,
+        .read    = device_read,
+        .write   = device_write,
+        .poll    = device_poll,
+        .owner   = THIS_MODULE,
+};
+/* file operations for the control device: write-only (no read/poll), and
+   it shares device_write() with the lockspace devices */
+static struct file_operations ctl_device_fops = {
+        .open    = ctl_device_open,
+        .release = ctl_device_close,
+        .write   = device_write,
+        .owner   = THIS_MODULE,
+};
+/* Register the "dlm-control" misc device with a dynamically assigned
+   minor.  Returns 0 or the error from misc_register(). */
+int dlm_user_init(void)
+{
+        int rv;
+        ctl_device.minor = MISC_DYNAMIC_MINOR;
+        ctl_device.name = "dlm-control";
+        ctl_device.fops = &ctl_device_fops;
+        rv = misc_register(&ctl_device);
+        if (!rv)
+                return 0;
+        log_print("misc_register failed for control device");
+        return rv;
+}
+/* Undo dlm_user_init(): deregister the control device.  The return value
+   of misc_deregister() is not checked. */
+void dlm_user_exit(void)
+{
+        misc_deregister(&ctl_device);
+}
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+/* Interface to the userspace lock device code in user.c. */
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+/* queue an ast of the given type for the user process that owns lkb */
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+/* register / deregister the "dlm-control" misc device */
+int dlm_user_init(void);
+void dlm_user_exit(void);
+#endif /* __USER_DOT_H__ */
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+/* Convert the listed dlm_header fields from cpu to little-endian byte
+   order, in place. */
+static void header_out(struct dlm_header *hd)
+{
+        hd->h_version           = cpu_to_le32(hd->h_version);
+        hd->h_lockspace         = cpu_to_le32(hd->h_lockspace);
+        hd->h_nodeid            = cpu_to_le32(hd->h_nodeid);
+        hd->h_length            = cpu_to_le16(hd->h_length);
+}
+/* Inverse of header_out(): convert the listed dlm_header fields from
+   little-endian to cpu byte order, in place. */
+static void header_in(struct dlm_header *hd)
+{
+        hd->h_version           = le32_to_cpu(hd->h_version);
+        hd->h_lockspace         = le32_to_cpu(hd->h_lockspace);
+        hd->h_nodeid            = le32_to_cpu(hd->h_nodeid);
+        hd->h_length            = le16_to_cpu(hd->h_length);
+}
+/* Convert a dlm_message from cpu to little-endian byte order, in place:
+   first the dlm_header at the start of the message, then each listed
+   32-bit message field. */
+void dlm_message_out(struct dlm_message *ms)
+{
+        /* the message is viewed as a dlm_header for the header conversion */
+        struct dlm_header *hd = (struct dlm_header *) ms;
+        header_out(hd);
+        ms->m_type              = cpu_to_le32(ms->m_type);
+        ms->m_nodeid            = cpu_to_le32(ms->m_nodeid);
+        ms->m_pid               = cpu_to_le32(ms->m_pid);
+        ms->m_lkid              = cpu_to_le32(ms->m_lkid);
+        ms->m_remid             = cpu_to_le32(ms->m_remid);
+        ms->m_parent_lkid       = cpu_to_le32(ms->m_parent_lkid);
+        ms->m_parent_remid      = cpu_to_le32(ms->m_parent_remid);
+        ms->m_exflags           = cpu_to_le32(ms->m_exflags);
+        ms->m_sbflags           = cpu_to_le32(ms->m_sbflags);
+        ms->m_flags             = cpu_to_le32(ms->m_flags);
+        ms->m_lvbseq            = cpu_to_le32(ms->m_lvbseq);
+        ms->m_hash              = cpu_to_le32(ms->m_hash);
+        ms->m_status            = cpu_to_le32(ms->m_status);
+        ms->m_grmode            = cpu_to_le32(ms->m_grmode);
+        ms->m_rqmode            = cpu_to_le32(ms->m_rqmode);
+        ms->m_bastmode          = cpu_to_le32(ms->m_bastmode);
+        ms->m_asts              = cpu_to_le32(ms->m_asts);
+        ms->m_result            = cpu_to_le32(ms->m_result);
+}
+/* Inverse of dlm_message_out(): convert a dlm_message from little-endian
+   to cpu byte order, in place (header first, then each listed field). */
+void dlm_message_in(struct dlm_message *ms)
+{
+        /* the message is viewed as a dlm_header for the header conversion */
+        struct dlm_header *hd = (struct dlm_header *) ms;
+        header_in(hd);
+        ms->m_type              = le32_to_cpu(ms->m_type);
+        ms->m_nodeid            = le32_to_cpu(ms->m_nodeid);
+        ms->m_pid               = le32_to_cpu(ms->m_pid);
+        ms->m_lkid              = le32_to_cpu(ms->m_lkid);
+        ms->m_remid             = le32_to_cpu(ms->m_remid);
+        ms->m_parent_lkid       = le32_to_cpu(ms->m_parent_lkid);
+        ms->m_parent_remid      = le32_to_cpu(ms->m_parent_remid);
+        ms->m_exflags           = le32_to_cpu(ms->m_exflags);
+        ms->m_sbflags           = le32_to_cpu(ms->m_sbflags);
+        ms->m_flags             = le32_to_cpu(ms->m_flags);
+        ms->m_lvbseq            = le32_to_cpu(ms->m_lvbseq);
+        ms->m_hash              = le32_to_cpu(ms->m_hash);
+        ms->m_status            = le32_to_cpu(ms->m_status);
+        ms->m_grmode            = le32_to_cpu(ms->m_grmode);
+        ms->m_rqmode            = le32_to_cpu(ms->m_rqmode);
+        ms->m_bastmode          = le32_to_cpu(ms->m_bastmode);
+        ms->m_asts              = le32_to_cpu(ms->m_asts);
+        ms->m_result            = le32_to_cpu(ms->m_result);
+}
+/* Convert the listed rcom_lock fields from cpu to little-endian byte
+   order, in place.  rl_wait_type and rl_namelen are converted as 16-bit
+   values, the rest as 32-bit. */
+static void rcom_lock_out(struct rcom_lock *rl)
+{
+        rl->rl_ownpid           = cpu_to_le32(rl->rl_ownpid);
+        rl->rl_lkid             = cpu_to_le32(rl->rl_lkid);
+        rl->rl_remid            = cpu_to_le32(rl->rl_remid);
+        rl->rl_parent_lkid      = cpu_to_le32(rl->rl_parent_lkid);
+        rl->rl_parent_remid     = cpu_to_le32(rl->rl_parent_remid);
+        rl->rl_exflags          = cpu_to_le32(rl->rl_exflags);
+        rl->rl_flags            = cpu_to_le32(rl->rl_flags);
+        rl->rl_lvbseq           = cpu_to_le32(rl->rl_lvbseq);
+        rl->rl_result           = cpu_to_le32(rl->rl_result);
+        rl->rl_wait_type        = cpu_to_le16(rl->rl_wait_type);
+        rl->rl_namelen          = cpu_to_le16(rl->rl_namelen);
+}
+/* Inverse of rcom_lock_out(): convert the listed rcom_lock fields from
+   little-endian to cpu byte order, in place. */
+static void rcom_lock_in(struct rcom_lock *rl)
+{
+        rl->rl_ownpid           = le32_to_cpu(rl->rl_ownpid);
+        rl->rl_lkid             = le32_to_cpu(rl->rl_lkid);
+        rl->rl_remid            = le32_to_cpu(rl->rl_remid);
+        rl->rl_parent_lkid      = le32_to_cpu(rl->rl_parent_lkid);
+        rl->rl_parent_remid     = le32_to_cpu(rl->rl_parent_remid);
+        rl->rl_exflags          = le32_to_cpu(rl->rl_exflags);
+        rl->rl_flags            = le32_to_cpu(rl->rl_flags);
+        rl->rl_lvbseq           = le32_to_cpu(rl->rl_lvbseq);
+        rl->rl_result           = le32_to_cpu(rl->rl_result);
+        rl->rl_wait_type        = le16_to_cpu(rl->rl_wait_type);
+        rl->rl_namelen          = le16_to_cpu(rl->rl_namelen);
+}
+/* Convert the two rcom_config fields from cpu to little-endian byte
+   order, in place. */
+static void rcom_config_out(struct rcom_config *rf)
+{
+        rf->rf_lvblen           = cpu_to_le32(rf->rf_lvblen);
+        rf->rf_lsflags          = cpu_to_le32(rf->rf_lsflags);
+}
+/* Inverse of rcom_config_out(): convert the two rcom_config fields from
+   little-endian to cpu byte order, in place. */
+static void rcom_config_in(struct rcom_config *rf)
+{
+        rf->rf_lvblen           = le32_to_cpu(rf->rf_lvblen);
+        rf->rf_lsflags          = le32_to_cpu(rf->rf_lsflags);
+}
+/* Convert a dlm_rcom from cpu to little-endian byte order, in place:
+   header, the fixed rcom fields, then the payload in rc_buf when the type
+   is one that carries a structured payload. */
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+        /* keep the type in cpu order; rc->rc_type is swapped below but the
+           payload dispatch still needs the native value */
+        int cpu_type = rc->rc_type;
+        header_out((struct dlm_header *) rc);
+        rc->rc_type             = cpu_to_le32(rc->rc_type);
+        rc->rc_result           = cpu_to_le32(rc->rc_result);
+        rc->rc_id               = cpu_to_le64(rc->rc_id);
+        switch (cpu_type) {
+        case DLM_RCOM_LOCK:
+                rcom_lock_out((struct rcom_lock *) rc->rc_buf);
+                break;
+        case DLM_RCOM_STATUS_REPLY:
+                rcom_config_out((struct rcom_config *) rc->rc_buf);
+                break;
+        }
+}
+/* Inverse of dlm_rcom_out(): convert a dlm_rcom from little-endian to cpu
+   byte order, in place.  The payload dispatch uses rc_type after it has
+   been converted to cpu order. */
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+        header_in((struct dlm_header *) rc);
+        rc->rc_type             = le32_to_cpu(rc->rc_type);
+        rc->rc_result           = le32_to_cpu(rc->rc_result);
+        rc->rc_id               = le64_to_cpu(rc->rc_id);
+        switch (rc->rc_type) {
+        case DLM_RCOM_LOCK:
+                rcom_lock_in((struct rcom_lock *) rc->rc_buf);
+                break;
+        case DLM_RCOM_STATUS_REPLY:
+                rcom_config_in((struct rcom_config *) rc->rc_buf);
+                break;
+        }
+}
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+/*
+ * In-place byte-order conversion helpers.  For dlm_rcom, the _out variant
+ * converts from CPU byte order to little-endian and the _in variant
+ * converts back.
+ * NOTE(review): the dlm_message pair is presumably analogous -- confirm
+ * against its definition.
+ */
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+#endif
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
+config GFS2_FS
+        tristate "GFS2 file system support"
+        depends on EXPERIMENTAL
+        select FS_POSIX_ACL
+        help
+        A cluster filesystem.
+        Allows a cluster of computers to simultaneously use a block device
+        that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
+        and writes to the block device like a local filesystem, but also uses
+        a lock module to allow the computers to coordinate their I/O so
+        filesystem consistency is maintained.  One of the nifty features of
+        GFS is perfect consistency -- changes made to the filesystem on one
+        machine show up immediately on all other machines in the cluster.
+        To use the GFS2 filesystem, you will need to enable one or more of
+        the below locking modules. Documentation and utilities for GFS2 can
+        be found here: http://sources.redhat.com/cluster
+config GFS2_FS_LOCKING_NOLOCK
+        tristate "GFS2 \"nolock\" locking module"
+        depends on GFS2_FS
+        help
+        Single node locking module for GFS2.
+        Use this module if you want to use GFS2 on a single node without
+        its clustering features. You can still take advantage of the
+        large file support, and upgrade to running a full cluster later on
+        if required.
+        If you will only be using GFS2 in cluster mode, you do not need this
+        module.
+config GFS2_FS_LOCKING_DLM
+        tristate "GFS2 DLM locking module"
+        depends on GFS2_FS
+        select DLM
+        help
+        Multiple node locking module for GFS2.
+        Most users of GFS2 will require this module. It provides the locking
+        interface between GFS2 and the DLM, which is required to use GFS2
+        in a cluster environment.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
+# The gfs2 filesystem object and the objects it is linked from.
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+        glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+        mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+        ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+        recovery.o rgrp.o super.o sys.o trans.o util.o
+# The lock modules are built from their own subdirectories, each under its
+# own config symbol.
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+/*
+ * Values for the "access" argument of acl_get(): non-zero selects the
+ * access ACL, zero selects the default ACL.
+ */
+#define ACL_ACCESS 1
+#define ACL_DEFAULT 0
+/**
+ * gfs2_acl_validate_set - Check a request to set a POSIX ACL xattr
+ * @ip: the inode the ACL would be set on
+ * @access: non-zero for the access ACL, zero for the default ACL
+ * @er: the request carrying the raw xattr value
+ * @remove: set to 1 when the xattr need not be stored (see below)
+ * @mode: passed to posix_acl_equiv_mode() for an access ACL
+ *
+ * The same permission checks as gfs2_acl_validate_remove() are applied
+ * first.  *@remove is set when the value parses to a NULL ACL, or when
+ * posix_acl_equiv_mode() returns zero for an access ACL.  *@remove is
+ * never cleared here.
+ *
+ * Returns: 0 on success, -EINVAL when @er carries no data, otherwise the
+ *          errno from the failing check
+ */
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+                      struct gfs2_ea_request *er,
+                      int *remove, mode_t *mode)
+{
+        struct posix_acl *parsed;
+        int rc = gfs2_acl_validate_remove(ip, access);
+        if (rc)
+                return rc;
+        if (!er->er_data)
+                return -EINVAL;
+        parsed = posix_acl_from_xattr(er->er_data, er->er_data_len);
+        if (IS_ERR(parsed))
+                return PTR_ERR(parsed);
+        if (!parsed) {
+                *remove = 1;
+                return 0;
+        }
+        rc = posix_acl_valid(parsed);
+        if (!rc && access) {
+                rc = posix_acl_equiv_mode(parsed, mode);
+                if (rc == 0)
+                        *remove = 1;
+                else if (rc > 0)
+                        rc = 0; /* a positive result is not an error */
+        }
+        posix_acl_release(parsed);
+        return rc;
+}
+/**
+ * gfs2_acl_validate_remove - Check whether an ACL may be changed or removed
+ * @ip: the inode
+ * @access: non-zero for the access ACL, zero for the default ACL
+ *
+ * The checks run in order and the first failure decides the result.
+ *
+ * Returns: 0 if allowed; -EOPNOTSUPP if ACLs are not enabled in the mount
+ *          arguments or the inode is a symlink; -EPERM if the caller is
+ *          neither the owner nor CAP_FOWNER capable; -EACCES for a default
+ *          ACL on a non-directory
+ */
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        int error = 0;
+        if (!sdp->sd_args.ar_posix_acl)
+                error = -EOPNOTSUPP;
+        else if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
+                error = -EPERM;
+        else if (S_ISLNK(ip->i_di.di_mode))
+                error = -EOPNOTSUPP;
+        else if (!access && !S_ISDIR(ip->i_di.di_mode))
+                error = -EACCES;
+        return error;
+}
+/**
+ * acl_get - Read one of an inode's POSIX ACL extended attributes
+ * @ip: the inode to read from
+ * @access: non-zero for the access ACL, zero for the default ACL
+ * @acl: if non-NULL, receives the result of posix_acl_from_xattr()
+ * @el: if non-NULL, receives the location of the attribute; if NULL a
+ *      local location is used and its buffer is released before returning
+ * @data: if non-NULL, receives a kmalloc()ed copy of the raw attribute
+ *        value on success; the caller must kfree() it
+ * @len: receives the length of *@data; must be non-NULL when @data is
+ *
+ * When the inode has no extended attributes, or the attribute is not
+ * found, 0 is returned and none of the output arguments are written.
+ * When a caller-supplied @el is used and no error occurs, el->el_bh is
+ * not released here once the attribute has been found.
+ *
+ * Returns: errno
+ */
+static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
+                   struct gfs2_ea_location *el, char **data, unsigned int *len)
+{
+        struct gfs2_ea_request er;
+        struct gfs2_ea_location el_this;
+        int error;
+        /* No extended attribute block at all: nothing to read. */
+        if (!ip->i_di.di_eattr)
+                return 0;
+        memset(&er, 0, sizeof(struct gfs2_ea_request));
+        if (access) {
+                er.er_name = GFS2_POSIX_ACL_ACCESS;
+                er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+        } else {
+                er.er_name = GFS2_POSIX_ACL_DEFAULT;
+                er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+        }
+        er.er_type = GFS2_EATYPE_SYS;
+        if (!el)
+                el = &el_this;
+        error = gfs2_ea_find(ip, &er, el);
+        if (error)
+                return error;
+        if (!el->el_ea)
+                return 0;
+        /* Attribute exists but is empty: error is still 0 here. */
+        if (!GFS2_EA_DATA_LEN(el->el_ea))
+                goto out;
+        er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
+        er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+        error = -ENOMEM;
+        if (!er.er_data)
+                goto out;
+        error = gfs2_ea_get_copy(ip, el, er.er_data);
+        if (error)
+                goto out_kfree;
+        if (acl) {
+                *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+                if (IS_ERR(*acl))
+                        error = PTR_ERR(*acl);
+        }
+out_kfree:
+        /* Hand the raw copy to the caller only on success and on request. */
+        if (error || !data)
+                kfree(er.er_data);
+        else {
+                *data = er.er_data;
+                *len = er.er_data_len;
+        }
+out:
+        /* Keep the buffer held only for a caller-supplied el on success. */
+        if (error || el == &el_this)
+                brelse(el->el_bh);
+        return error;
+}
+/**
+ * gfs2_check_acl_locked - Check the access ACL for a requested operation
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Returns: the result of posix_acl_permission() if the inode has an access
+ *          ACL, -EAGAIN if it has none, or the errno from reading the ACL
+ */
+int gfs2_check_acl_locked(struct inode *inode, int mask)
+{
+        struct posix_acl *acl = NULL;
+        int error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+        if (error)
+                return error;
+        if (!acl)
+                return -EAGAIN;
+        error = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return error;
+}
+/**
+ * gfs2_check_acl - Take the inode glock and check the access ACL
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Acquires the inode's glock in the shared state around
+ * gfs2_check_acl_locked().
+ *
+ * Returns: errno from taking the glock, otherwise the result of
+ *          gfs2_check_acl_locked()
+ */
+int gfs2_check_acl(struct inode *inode, int mask)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder i_gh;
+        int error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+        if (error)
+                return error;
+        error = gfs2_check_acl_locked(inode, mask);
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+/**
+ * munge_mode - Change an inode's mode within its own transaction
+ * @ip: the inode
+ * @mode: the new mode; its file type bits must match the current ones
+ *
+ * Updates the in-core dinode and writes it into the dinode buffer under a
+ * RES_DINODE transaction.
+ *
+ * Returns: errno (a failure to read the dinode buffer is reported rather
+ *          than being silently dropped)
+ */
+static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *dibh;
+        int error;
+        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                gfs2_assert_withdraw(sdp,
+                                (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
+                ip->i_di.di_mode = mode;
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        /* The transaction is ended whether or not the buffer was read. */
+        gfs2_trans_end(sdp);
+        return error;
+}
+/**
+ * gfs2_acl_create - Set up the ACLs and mode of a newly created inode
+ * @dip: the parent directory, whose default ACL is consulted
+ * @ip: the new inode
+ *
+ * Does nothing when ACLs are disabled in the mount arguments or @ip is a
+ * symlink.  Without a default ACL on @dip the process umask is applied to
+ * the mode.  Otherwise a new directory receives the parent's default ACL
+ * as its own, and posix_acl_create_masq() decides whether an access ACL
+ * is stored (positive result) or only the mode is updated.
+ *
+ * Returns: errno
+ */
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct posix_acl *acl = NULL, *clone;
+        struct gfs2_ea_request er;
+        mode_t mode = ip->i_di.di_mode;
+        int error;
+        if (!sdp->sd_args.ar_posix_acl)
+                return 0;
+        if (S_ISLNK(ip->i_di.di_mode))
+                return 0;
+        memset(&er, 0, sizeof(struct gfs2_ea_request));
+        er.er_type = GFS2_EATYPE_SYS;
+        error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
+                        &er.er_data, &er.er_data_len);
+        if (error)
+                return error;
+        if (!acl) {
+                /*
+                 * acl_get() may have handed back a raw copy even though it
+                 * parsed to no ACL; er.er_data is NULL otherwise.
+                 */
+                kfree(er.er_data);
+                mode &= ~current->fs->umask;
+                if (mode != ip->i_di.di_mode)
+                        error = munge_mode(ip, mode);
+                return error;
+        }
+        /* Work on a private copy; the masq call below modifies the ACL. */
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        error = -ENOMEM;
+        if (!clone)
+                goto out;
+        posix_acl_release(acl);
+        acl = clone;
+        if (S_ISDIR(ip->i_di.di_mode)) {
+                /* er.er_data still holds the parent's raw default ACL. */
+                er.er_name = GFS2_POSIX_ACL_DEFAULT;
+                er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+                error = gfs2_system_eaops.eo_set(ip, &er);
+                if (error)
+                        goto out;
+        }
+        error = posix_acl_create_masq(acl, &mode);
+        if (error < 0)
+                goto out;
+        if (error > 0) {
+                er.er_name = GFS2_POSIX_ACL_ACCESS;
+                er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+                posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
+                er.er_mode = mode;
+                er.er_flags = GFS2_ERF_MODE;
+                error = gfs2_system_eaops.eo_set(ip, &er);
+                if (error)
+                        goto out;
+        } else
+                error = munge_mode(ip, mode);
+out:
+        posix_acl_release(acl);
+        kfree(er.er_data);
+        return error;
+}
+/**
+ * gfs2_acl_chmod - Change an inode's mode, keeping its access ACL in step
+ * @ip: the inode
+ * @attr: the attributes being set; ia_mode carries the new mode
+ *
+ * Without an access ACL this is a plain gfs2_setattr_simple().  Otherwise
+ * the ACL is adjusted with posix_acl_chmod_masq() and written back through
+ * gfs2_ea_acl_chmod().
+ *
+ * Returns: errno
+ */
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
+{
+        struct posix_acl *acl = NULL, *clone;
+        struct gfs2_ea_location el;
+        char *data = NULL;
+        unsigned int len;
+        int error;
+        /* acl_get() leaves el untouched on some of its early returns. */
+        memset(&el, 0, sizeof(struct gfs2_ea_location));
+        error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+        if (error)
+                return error;
+        if (!acl) {
+                /*
+                 * The attribute may exist without yielding an ACL; in that
+                 * case acl_get() left its buffer held and may have handed
+                 * back a raw copy.  Drop both before the simple path.
+                 */
+                if (el.el_ea)
+                        brelse(el.el_bh);
+                kfree(data);
+                return gfs2_setattr_simple(ip, attr);
+        }
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        error = -ENOMEM;
+        if (!clone)
+                goto out;
+        posix_acl_release(acl);
+        acl = clone;
+        error = posix_acl_chmod_masq(acl, attr->ia_mode);
+        if (!error) {
+                /* Re-encode into the buffer acl_get() allocated. */
+                posix_acl_to_xattr(acl, data, len);
+                error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+        }
+out:
+        posix_acl_release(acl);
+        brelse(el.el_bh);
+        kfree(data);
+        return error;
+}
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+#include "incore.h"
+/*
+ * Extended attribute names used for the two POSIX ACLs, with their
+ * lengths (the terminating NUL is not counted).
+ */
+#define GFS2_POSIX_ACL_ACCESS           "posix_acl_access"
+#define GFS2_POSIX_ACL_ACCESS_LEN       16
+#define GFS2_POSIX_ACL_DEFAULT          "posix_acl_default"
+#define GFS2_POSIX_ACL_DEFAULT_LEN      17
+/* True when (name, len) is exactly the access ACL attribute name. */
+#define GFS2_ACL_IS_ACCESS(name, len) \
+         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+/* True when (name, len) is exactly the default ACL attribute name. */
+#define GFS2_ACL_IS_DEFAULT(name, len) \
+         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
+         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
+struct gfs2_ea_request;
+/* Validation of set/remove requests on the ACL attributes. */
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+                          struct gfs2_ea_request *er,
+                          int *remove, mode_t *mode);
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
+/* Permission checks; the _locked variant does not take the inode glock. */
+int gfs2_check_acl_locked(struct inode *inode, int mask);
+int gfs2_check_acl(struct inode *inode, int mask);
+/* ACL handling at inode creation and on chmod. */
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..cc57f2ecd219
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1221 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "dir.h"
+#include "util.h"
+#include "ops_address.h"
+/*
+ * Path through an inode's metadata tree: mp_list[h] is the index of the
+ * pointer to follow in the block at height h (0 = dinode).
+ *
+ * This doesn't need to be that large, as the maximum number of 64-bit
+ * pointers in a 4k block is 512, so __u16 is fine for that. It saves
+ * stack space to keep it small.
+ */
+struct metapath {
+        __u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+/*
+ * Callback type handed to recursive_scan() as its @bc argument ("the call
+ * to make for each piece of metadata").
+ * NOTE(review): @top and @bottom presumably delimit the range of block
+ * pointers inside @bh to act on -- confirm against the callers.
+ */
+typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
+                             struct buffer_head *bh, u64 *top,
+                             u64 *bottom, unsigned int height,
+                             void *data);
+/*
+ * NOTE(review): the users of this structure are outside this chunk;
+ * presumably it is the private data passed to a block_call_t callback
+ * when stripping metadata at a given height -- confirm.
+ */
+struct strip_mine {
+        int sm_first;
+        unsigned int sm_height;
+};
+/**
+ * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @block: the block number that was allocated
+ * @page: any locked page held by the caller process; may be NULL
+ *
+ * Copies the stuffed data that follows the dinode header in @dibh into
+ * page 0 of the inode's mapping and maps that page's first buffer to
+ * @block.  If @page is NULL or is not page 0, page 0 is grabbed here and
+ * released again before returning.
+ *
+ * Returns: errno
+ */
+static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
+                               u64 block, struct page *page)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct inode *inode = &ip->i_inode;
+        struct buffer_head *bh;
+        int release = 0;
+        /* Only page 0 can hold the stuffed data; get it if we weren't given it. */
+        if (!page || page->index) {
+                page = grab_cache_page(inode->i_mapping, 0);
+                if (!page)
+                        return -ENOMEM;
+                release = 1;
+        }
+        if (!PageUptodate(page)) {
+                /* Fill the page from the dinode and zero the remainder. */
+                void *kaddr = kmap(page);
+                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+                       ip->i_di.di_size);
+                memset(kaddr + ip->i_di.di_size, 0,
+                       PAGE_CACHE_SIZE - ip->i_di.di_size);
+                kunmap(page);
+                SetPageUptodate(page);
+        }
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, 1 << inode->i_blkbits,
+                                     (1 << BH_Uptodate));
+        bh = page_buffers(page);
+        if (!buffer_mapped(bh))
+                map_bh(bh, inode->i_sb, block);
+        set_buffer_uptodate(bh);
+        /* Ordered-data mounts and jdata inodes add the buffer to the transaction. */
+        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+                gfs2_trans_add_bh(ip->i_gl, bh, 0);
+        mark_buffer_dirty(bh);
+        /* Drop the page only if it was grabbed above. */
+        if (release) {
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        return 0;
+}
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @page: passed through to gfs2_unstuffer_page() for non-directory inodes
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.  The inode's
+ * i_rw_mutex is held for writing for the duration.
+ *
+ * Returns: errno
+ */
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+{
+        struct buffer_head *bh, *dibh;
+        struct gfs2_dinode *di;
+        u64 block = 0;
+        int isdir = gfs2_is_dir(ip);
+        int error;
+        down_write(&ip->i_rw_mutex);
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out;
+        if (ip->i_di.di_size) {
+                /* Get a free block, fill it with the stuffed data,
+                   and write it out to disk */
+                if (isdir) {
+                        /* Directory data goes into a new metadata block. */
+                        block = gfs2_alloc_meta(ip);
+                        error = gfs2_dir_get_new_buffer(ip, block, &bh);
+                        if (error)
+                                goto out_brelse;
+                        gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
+                                              dibh, sizeof(struct gfs2_dinode));
+                        brelse(bh);
+                } else {
+                        /* File data goes into a data block via the page cache. */
+                        block = gfs2_alloc_data(ip);
+                        error = gfs2_unstuffer_page(ip, dibh, block, page);
+                        if (error)
+                                goto out_brelse;
+                }
+        }
+        /*  Set up the pointer to the new block  */
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        di = (struct gfs2_dinode *)dibh->b_data;
+        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+        /* An empty inode gets no block; only its height changes. */
+        if (ip->i_di.di_size) {
+                *(__be64 *)(di + 1) = cpu_to_be64(block);
+                ip->i_di.di_blocks++;
+                di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+        }
+        ip->i_di.di_height = 1;
+        di->di_height = cpu_to_be16(1);
+out_brelse:
+        brelse(dibh);
+out:
+        up_write(&ip->i_rw_mutex);
+        return error;
+}
+/**
+ * calc_tree_height - Calculate the height of a metadata tree
+ * @ip: The GFS2 inode
+ * @size: The proposed size of the file
+ *
+ * Work out how tall a metadata tree needs to be in order to accommodate a
+ * file of a particular size. The larger of @size and the inode's current
+ * size is used. Directories use the journaled height table, everything
+ * else the ordinary one.
+ *
+ * Returns: the height the tree should be
+ */
+static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        u64 *limits = sdp->sd_heightsize;
+        unsigned int top = sdp->sd_max_height;
+        unsigned int height = 0;
+        if (size < ip->i_di.di_size)
+                size = ip->i_di.di_size;
+        if (gfs2_is_dir(ip)) {
+                limits = sdp->sd_jheightsize;
+                top = sdp->sd_max_jheight;
+        }
+        /* First height whose capacity covers the size, capped at the maximum. */
+        while (height < top && limits[height] < size)
+                height++;
+        return height;
+}
+/**
+ * build_height - Build a metadata tree of the requested height
+ * @inode: The inode
+ * @height: The height to build to
+ *
+ * Inserts (@height - current height) new indirect blocks between the
+ * dinode and its existing pointers: the new blocks are chained one below
+ * the other, the dinode's old pointers move into the lowest new block, and
+ * the dinode is left with a single pointer to the topmost new block.
+ * Nothing is done when the tree is already at least @height tall.
+ *
+ * Returns: errno
+ */
+static int build_height(struct inode *inode, unsigned height)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        /* Only meaningful once the height check below has passed. */
+        unsigned new_height = height - ip->i_di.di_height;
+        struct buffer_head *dibh;
+        struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
+        struct gfs2_dinode *di;
+        int error;
+        u64 *bp;
+        u64 bn;
+        unsigned n;
+        if (height <= ip->i_di.di_height)
+                return 0;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        /* Allocate every new block up front and add each to the transaction. */
+        /* NOTE(review): the result of gfs2_meta_new() is used unchecked. */
+        for(n = 0; n < new_height; n++) {
+                bn = gfs2_alloc_meta(ip);
+                blocks[n] = gfs2_meta_new(ip->i_gl, bn);
+                gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
+        }
+        n = 0;
+        /* The dinode will point at the first new block. */
+        bn = blocks[0]->b_blocknr;
+        if (new_height > 1) {
+                /* Each block except the last holds one pointer, to the next block. */
+                for(; n < new_height-1; n++) {
+                        gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
+                                          GFS2_FORMAT_IN);
+                        gfs2_buffer_clear_tail(blocks[n],
+                                               sizeof(struct gfs2_meta_header));
+                        bp = (u64 *)(blocks[n]->b_data +
+                                     sizeof(struct gfs2_meta_header));
+                        *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
+                        brelse(blocks[n]);
+                        blocks[n] = NULL;
+                }
+        }
+        /* The last new block inherits the dinode's current pointers. */
+        gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+        gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
+                              dibh, sizeof(struct gfs2_dinode));
+        brelse(blocks[n]);
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        di = (struct gfs2_dinode *)dibh->b_data;
+        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+        *(__be64 *)(di + 1) = cpu_to_be64(bn);
+        /* Account for the new levels in both the in-core and on-disk dinode. */
+        ip->i_di.di_height += new_height;
+        ip->i_di.di_blocks += new_height;
+        di->di_height = cpu_to_be16(ip->i_di.di_height);
+        di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+        brelse(dibh);
+        return error;
+}
+/**
+ * find_metapath - Find path through the metadata tree
+ * @ip: The inode pointer
+ * @block: The logical block to look up
+ * @mp: The metapath to return the result in
+ *
+ *   This routine returns a struct metapath structure that defines a path
+ *   through the metadata of inode "ip" to get to block "block".
+ *
+ *   Example:
+ *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
+ *   filesystem with a blocksize of 4096.
+ *
+ *   find_metapath() would fill in the struct metapath so that, for byte
+ *   offset 101342453 in a tree of height 3, mp_list[0] = 0, mp_list[1] = 48,
+ *   and mp_list[2] = 165.
+ *
+ *   That means that in order to get to the block containing the byte at
+ *   offset 101342453, we would load the indirect block pointed to by pointer
+ *   0 in the dinode.  We would then load the indirect block pointed to by
+ *   pointer 48 in that indirect block.  We would then load the data block
+ *   pointed to by pointer 165 in that indirect block.
+ *
+ *             ----------------------------------------
+ *             | Dinode |                             |
+ *             |        |                            4|
+ *             |        |0 1 2 3 4 5                 9|
+ *             |        |                            6|
+ *             ----------------------------------------
+ *                       |
+ *                       |
+ *                       V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                                     5|
+ *             |            4 4 4 4 4 5 5            1|
+ *             |0           5 6 7 8 9 0 1            2|
+ *             ----------------------------------------
+ *                                |
+ *                                |
+ *                                V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                         1 1 1 1 1   5|
+ *             |                         6 6 6 6 6   1|
+ *             |0                        3 4 5 6 7   2|
+ *             ----------------------------------------
+ *                                           |
+ *                                           |
+ *                                           V
+ *             ----------------------------------------
+ *             | Data block containing offset         |
+ *             |            101342453                 |
+ *             |                                      |
+ *             |                                      |
+ *             ----------------------------------------
+ *
+ */
+static void find_metapath(struct gfs2_inode *ip, u64 block,
+                          struct metapath *mp)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned int level = ip->i_di.di_height;
+        u64 rest = block;
+        /* Peel off one pointer index per level, lowest level first. */
+        while (level--)
+                mp->mp_list[level] = do_div(rest, sdp->sd_inptrs);
+}
+/**
+ * metapointer - Return pointer to a block pointer slot in a buffer
+ * @bh: The buffer
+ * @boundary: Set to 1 if the slot is the last one in the buffer, else 0
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the slot holding the block number of the next height
+ * of the metadata tree, given the buffer for the current height. The slots
+ * start after the dinode header at height 0 and after the metadata header
+ * at any other height.
+ */
+static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
+                               unsigned int height, const struct metapath *mp)
+{
+        unsigned int head_size;
+        u64 *slot;
+        if (height > 0)
+                head_size = sizeof(struct gfs2_meta_header);
+        else
+                head_size = sizeof(struct gfs2_dinode);
+        slot = (u64 *)(bh->b_data + head_size) + mp->mp_list[height];
+        *boundary = (slot + 1 == (u64 *)(bh->b_data + bh->b_size)) ? 1 : 0;
+        return slot;
+}
+/**
+ * lookup_block - Get the next metadata block in metadata tree
+ * @ip: The GFS2 inode
+ * @bh: Buffer containing the pointers to metadata blocks
+ * @height: The height of the tree (0 = dinode)
+ * @mp: The metapath
+ * @create: Non-zero if we may create a new metadata block
+ * @new: Set to 1 if a block was allocated here; otherwise left untouched
+ * @block: the returned disk block number (0 if none and not created)
+ *
+ * Given a metatree, complete to a particular height, checks to see if the
+ * next height of the tree exists. If not, and @create is set, the next
+ * height of the tree is created. The block number of the next height of
+ * the metadata tree is returned in @block.
+ *
+ * Returns: the boundary flag from metapointer() when the block already
+ *          existed, otherwise 0
+ */
+static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+                        unsigned int height, struct metapath *mp, int create,
+                        int *new, u64 *block)
+{
+        int boundary;
+        u64 *slot = metapointer(bh, &boundary, height, mp);
+        if (*slot) {
+                /* Already present: just report it. */
+                *block = be64_to_cpu(*slot);
+                return boundary;
+        }
+        if (!create) {
+                *block = 0;
+                return 0;
+        }
+        /* Only the lowest level of a non-directory points at data blocks. */
+        if (height != ip->i_di.di_height - 1 || gfs2_is_dir(ip))
+                *block = gfs2_alloc_meta(ip);
+        else
+                *block = gfs2_alloc_data(ip);
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        *slot = cpu_to_be64(*block);
+        ip->i_di.di_blocks++;
+        *new = 1;
+        return 0;
+}
+/**
+ * gfs2_block_pointers - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @create: Non-zero if missing tree levels and blocks may be allocated
+ * @bh_map: The bh to be mapped
+ * @mp: metapath to use
+ * @maxlen: Maximum number of blocks to map; must be non-zero
+ *
+ * Find the block number on the current device which corresponds to an
+ * inode's block. If the block had to be created, the buffer-new flag is
+ * set on @bh_map. For a block that already existed, following blocks that
+ * are contiguous on disk are merged into the mapping by growing
+ * @bh_map->b_size, until @maxlen blocks or a boundary is reached.
+ *
+ * Returns: errno
+ */
+static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
+                               struct buffer_head *bh_map, struct metapath *mp,
+                               unsigned int maxlen)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct buffer_head *bh;
+        unsigned int bsize;
+        unsigned int height;
+        unsigned int end_of_metadata;
+        unsigned int x;
+        int error = 0;
+        int new = 0;
+        u64 dblock = 0;
+        int boundary;
+        BUG_ON(maxlen == 0);
+        if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+                return 0;
+        bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+        height = calc_tree_height(ip, (lblock + 1) * bsize);
+        if (ip->i_di.di_height < height) {
+                /* The block lies beyond the current tree. */
+                if (!create)
+                        return 0;
+                error = build_height(inode, height);
+                if (error)
+                        return error;
+        }
+        find_metapath(ip, lblock, mp);
+        end_of_metadata = ip->i_di.di_height - 1;
+        error = gfs2_meta_inode_buffer(ip, &bh);
+        if (error)
+                return error;
+        /* Walk down the tree, holding one buffer at a time. */
+        for (x = 0; x < end_of_metadata; x++) {
+                lookup_block(ip, bh, x, mp, create, &new, &dblock);
+                brelse(bh);
+                if (!dblock)
+                        return 0;
+                error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
+                if (error)
+                        return error;
+        }
+        boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
+        clear_buffer_mapped(bh_map);
+        clear_buffer_new(bh_map);
+        clear_buffer_boundary(bh_map);
+        if (dblock) {
+                map_bh(bh_map, inode->i_sb, dblock);
+                /*
+                 * The boundary flag belongs on the buffer being mapped, not
+                 * on the metadata buffer: the extent loop below tests
+                 * buffer_boundary(bh_map).
+                 */
+                if (boundary)
+                        set_buffer_boundary(bh_map);
+                if (new) {
+                        struct buffer_head *dibh;
+                        /*
+                         * di_blocks changed; write the dinode back.
+                         * NOTE(review): a failure here is not propagated,
+                         * the function still returns 0.
+                         */
+                        error = gfs2_meta_inode_buffer(ip, &dibh);
+                        if (!error) {
+                                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                                brelse(dibh);
+                        }
+                        set_buffer_new(bh_map);
+                        goto out_brelse;
+                }
+                /* Extend the mapping over physically contiguous blocks. */
+                while(--maxlen && !buffer_boundary(bh_map)) {
+                        u64 eblock;
+                        mp->mp_list[end_of_metadata]++;
+                        boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
+                        if (eblock != ++dblock)
+                                break;
+                        bh_map->b_size += (1 << inode->i_blkbits);
+                        if (boundary)
+                                set_buffer_boundary(bh_map);
+                }
+        }
+out_brelse:
+        brelse(bh);
+        return 0;
+}
+/*
+ * Take the inode's i_rw_mutex: for reading on a pure lookup, for writing
+ * when the caller may allocate (@create non-zero).
+ */
+static inline void bmap_lock(struct inode *inode, int create)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        if (!create) {
+                down_read(&ip->i_rw_mutex);
+                return;
+        }
+        down_write(&ip->i_rw_mutex);
+}
+/*
+ * Release the inode's i_rw_mutex taken by bmap_lock(); @create must match
+ * the value passed when locking.
+ */
+static inline void bmap_unlock(struct inode *inode, int create)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        if (!create) {
+                up_read(&ip->i_rw_mutex);
+                return;
+        }
+        up_write(&ip->i_rw_mutex);
+}
+/**
+ * gfs2_block_map - run gfs2_block_pointers() under the inode's i_rw_mutex
+ * @inode: the inode
+ * @lblock: the logical block to look up
+ * @create: non-zero takes i_rw_mutex for writing and is passed on as the
+ *          create flag; zero takes it for reading
+ * @bh: the buffer head handed to gfs2_block_pointers()
+ * @maxlen: passed through to gfs2_block_pointers()
+ *
+ * Returns: whatever gfs2_block_pointers() returns
+ */
+int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+                   struct buffer_head *bh, unsigned int maxlen)
+{
+        struct metapath path;
+        int error;
+        bmap_lock(inode, create);
+        error = gfs2_block_pointers(inode, lblock, create, bh, &path, maxlen);
+        bmap_unlock(inode, create);
+        return error;
+}
+/**
+ * gfs2_extent_map - look up an extent starting at a logical block
+ * @inode: the inode
+ * @lblock: the logical block to start from
+ * @new: on entry, non-zero is passed to gfs2_block_pointers() as the create
+ *       flag; on return, 1 if the on-stack buffer head was flagged new, else 0
+ * @dblock: on return, the b_blocknr of the on-stack buffer head
+ * @extlen: on return, the b_size of the on-stack buffer head in units of
+ *          filesystem blocks
+ *
+ * gfs2_block_pointers() is called with a maxlen of 32 under i_rw_mutex
+ * (write-locked when *@new is non-zero, read-locked otherwise).
+ *
+ * Returns: whatever gfs2_block_pointers() returns
+ */
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+        struct metapath mp;
+        struct buffer_head bh = { .b_state = 0, .b_blocknr = 0, .b_size = 0 };
+        int ret;
+        int create;
+        BUG_ON(!extlen);
+        BUG_ON(!dblock);
+        BUG_ON(!new);
+        /* Read *new only after the pointer has been checked above */
+        create = *new;
+        bmap_lock(inode, create);
+        ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp, 32);
+        bmap_unlock(inode, create);
+        *extlen = bh.b_size >> inode->i_blkbits;
+        *dblock = bh.b_blocknr;
+        if (buffer_new(&bh))
+                *new = 1;
+        else
+                *new = 0;
+        return ret;
+}
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer (replaced by the buffer read here when @height is 0)
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @bc: the call to make for each piece of metadata
+ * @data: data opaque to this function to pass to @bc
+ *
+ * The initial call passes zero for @height and @block and 1 for @first.
+ * @bc is invoked on the pointer range of the current buffer before any of
+ * the blocks it points to are descended into.
+ *
+ * Returns: errno
+ */
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+                          struct metapath *mp, unsigned int height,
+                          u64 block, int first, block_call_t bc,
+                          void *data)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *bh = NULL;
+        u64 *top, *bottom;
+        int error;
+        if (height == 0) {
+                /* Top of the tree: the pointers live in the dinode block */
+                error = gfs2_meta_inode_buffer(ip, &bh);
+                if (error)
+                        return error;
+                dibh = bh;
+                top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode));
+                bottom = top + sdp->sd_diptrs;
+                top += mp->mp_list[0];
+        } else {
+                error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+                if (error)
+                        return error;
+                top = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+                bottom = top + sdp->sd_inptrs;
+                /* Only the first block at each height starts part way in */
+                if (first)
+                        top += mp->mp_list[height];
+        }
+        error = bc(ip, dibh, bh, top, bottom, height, data);
+        if (error || height >= ip->i_di.di_height - 1)
+                goto out;
+        while (top < bottom) {
+                if (*top) {
+                        error = recursive_scan(ip, dibh, mp, height + 1,
+                                               be64_to_cpu(*top), first,
+                                               bc, data);
+                        if (error)
+                                break;
+                }
+                top++;
+                first = 0;
+        }
+out:
+        brelse(bh);
+        return error;
+}
+/**
+ * do_strip - Look for a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @data: a pointer to a struct strip_mine
+ *
+ * Does nothing unless @height equals sm_height in the strip_mine. Otherwise
+ * the non-zero pointers in [@top, @bottom) are walked twice: the first pass
+ * adds the start of every contiguous run of blocks to a resource group list
+ * so the resource groups can be locked and the transaction sized; the second
+ * pass frees each run, zeroes the pointer and decrements di_blocks. sm_first
+ * is cleared if *@top is zero; while it is still set, the first pointer is
+ * skipped rather than freed and the flag is then cleared.
+ *
+ * Returns: errno
+ */
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+                    struct buffer_head *bh, u64 *top, u64 *bottom,
+                    unsigned int height, void *data)
+{
+        struct strip_mine *sm = data;
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrp_list rlist;
+        u64 bn, bstart;
+        u32 blen;
+        u64 *p;
+        unsigned int rg_blocks = 0;
+        int metadata;
+        unsigned int revokes = 0;
+        int x;
+        int error;
+        if (!*top)
+                sm->sm_first = 0;
+        if (height != sm->sm_height) /* only strip the requested layer */
+                return 0;
+        if (sm->sm_first) {
+                top++;
+                sm->sm_first = 0;
+        }
+        metadata = (height != ip->i_di.di_height - 1); /* bottom layer is freed as data */
+        if (metadata)
+                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+        error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+        if (error)
+                return error;
+        memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+        bstart = 0;
+        blen = 0;
+        for (p = top; p < bottom; p++) { /* pass 1: collect the rgrps involved */
+                if (!*p)
+                        continue;
+                bn = be64_to_cpu(*p);
+                if (bstart + blen == bn)
+                        blen++;
+                else {
+                        if (bstart)
+                                gfs2_rlist_add(sdp, &rlist, bstart);
+                        bstart = bn;
+                        blen = 1;
+                }
+        }
+        if (bstart)
+                gfs2_rlist_add(sdp, &rlist, bstart);
+        else
+                goto out; /* Nothing to do */
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        for (x = 0; x < rlist.rl_rgrps; x++) {
+                struct gfs2_rgrpd *rgd;
+                rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+                rg_blocks += rgd->rd_ri.ri_length; /* sizes the transaction below */
+        }
+        error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+        if (error)
+                goto out_rlist;
+        error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+                                 RES_INDIRECT + RES_STATFS + RES_QUOTA,
+                                 revokes);
+        if (error)
+                goto out_rg_gunlock;
+        down_write(&ip->i_rw_mutex);
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        bstart = 0;
+        blen = 0;
+        for (p = top; p < bottom; p++) { /* pass 2: free the blocks run by run */
+                if (!*p)
+                        continue;
+                bn = be64_to_cpu(*p);
+                if (bstart + blen == bn)
+                        blen++;
+                else {
+                        if (bstart) {
+                                if (metadata)
+                                        gfs2_free_meta(ip, bstart, blen);
+                                else
+                                        gfs2_free_data(ip, bstart, blen);
+                        }
+                        bstart = bn;
+                        blen = 1;
+                }
+                *p = 0;
+                if (!ip->i_di.di_blocks)
+                        gfs2_consist_inode(ip);
+                ip->i_di.di_blocks--;
+        }
+        if (bstart) { /* free the final run */
+                if (metadata)
+                        gfs2_free_meta(ip, bstart, blen);
+                else
+                        gfs2_free_data(ip, bstart, blen);
+        }
+        ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        up_write(&ip->i_rw_mutex);
+        gfs2_trans_end(sdp);
+out_rg_gunlock:
+        gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+        gfs2_rlist_free(&rlist);
+out:
+        gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+        return error;
+}
+/**
+ * do_grow - Make a file look bigger than it is
+ * @ip: the inode
+ * @size: the size to set the file to
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Takes the quota lock, checks quota, reserves sd_max_height + RES_DATA
+ * blocks and opens a transaction. When @size no longer fits in the dinode
+ * block, a stuffed inode is unstuffed and the metadata tree is built up to
+ * the height calc_tree_height() gives for @size. Finally di_size and the
+ * timestamps are updated and the dinode is written out to its buffer. The
+ * labels at the end undo the setup steps in reverse order.
+ *
+ * Returns: errno
+ */
+static int do_grow(struct gfs2_inode *ip, u64 size)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al;
+        struct buffer_head *dibh;
+        unsigned int h;
+        int error;
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out;
+        error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+        if (error)
+                goto out_gunlock_q;
+        al->al_requested = sdp->sd_max_height + RES_DATA;
+        error = gfs2_inplace_reserve(ip);
+        if (error)
+                goto out_gunlock_q;
+        error = gfs2_trans_begin(sdp,
+                        sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
+                        RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+        if (error)
+                goto out_ipres;
+        if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { /* too big to stay stuffed */
+                if (gfs2_is_stuffed(ip)) {
+                        error = gfs2_unstuff_dinode(ip, NULL);
+                        if (error)
+                                goto out_end_trans;
+                }
+                h = calc_tree_height(ip, size);
+                if (ip->i_di.di_height < h) {
+                        down_write(&ip->i_rw_mutex);
+                        error = build_height(&ip->i_inode, h);
+                        up_write(&ip->i_rw_mutex);
+                        if (error)
+                                goto out_end_trans;
+                }
+        }
+        ip->i_di.di_size = size;
+        ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out_end_trans;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_ipres:
+        gfs2_inplace_release(ip);
+out_gunlock_q:
+        gfs2_quota_unlock(ip);
+out:
+        gfs2_alloc_put(ip);
+        return error;
+}
+/**
+ * gfs2_block_truncate_page - Deal with zeroing out data for truncate
+ * @mapping: the address space of the inode being truncated
+ *
+ * Zeroes the part of the block containing inode->i_size that lies between
+ * i_size and the end of that block, reading the block in first if it is
+ * not already up to date. Nothing is done when the block is a hole.
+ *
+ * This is partly borrowed from ext3.
+ *
+ * Returns: 0 on success (also when the page cannot be grabbed or the block
+ *          is unmapped), -EIO if the block could not be read
+ */
+static int gfs2_block_truncate_page(struct address_space *mapping)
+{
+        struct inode *inode = mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        loff_t from = inode->i_size;
+        unsigned long index = from >> PAGE_CACHE_SHIFT;
+        unsigned offset = from & (PAGE_CACHE_SIZE-1);
+        unsigned blocksize, length, pos;
+        /*
+         * The block number is derived from an unsigned long page index and
+         * handed to a get_block routine; a 32-bit unsigned would silently
+         * truncate it for very large files.
+         */
+        sector_t iblock;
+        struct buffer_head *bh;
+        struct page *page;
+        void *kaddr;
+        int err;
+        page = grab_cache_page(mapping, index);
+        if (!page)
+                return 0;
+        blocksize = inode->i_sb->s_blocksize;
+        length = blocksize - (offset & (blocksize - 1));
+        iblock = (sector_t)index <<
+                 (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, blocksize, 0);
+        /* Find the buffer that contains "offset" */
+        bh = page_buffers(page);
+        pos = blocksize;
+        while (offset >= pos) {
+                bh = bh->b_this_page;
+                iblock++;
+                pos += blocksize;
+        }
+        err = 0;
+        if (!buffer_mapped(bh)) {
+                gfs2_get_block(inode, iblock, bh, 0);
+                /* unmapped? It's a hole - nothing to do */
+                if (!buffer_mapped(bh))
+                        goto unlock;
+        }
+        /* Ok, it's mapped. Make sure it's up-to-date */
+        if (PageUptodate(page))
+                set_buffer_uptodate(bh);
+        if (!buffer_uptodate(bh)) {
+                err = -EIO;
+                ll_rw_block(READ, 1, &bh);
+                wait_on_buffer(bh);
+                /* Uhhuh. Read error. Complain and punt. */
+                if (!buffer_uptodate(bh))
+                        goto unlock;
+        }
+        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+                gfs2_trans_add_bh(ip->i_gl, bh, 0);
+        kaddr = kmap_atomic(page, KM_USER0);
+        memset(kaddr + offset, 0, length);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr, KM_USER0);
+unlock:
+        unlock_page(page);
+        page_cache_release(page);
+        return err;
+}
+/**
+ * trunc_start - first phase of shrinking a file
+ * @ip: the inode
+ * @size: the new size
+ *
+ * For a stuffed inode the whole truncate is done here: the size is set and
+ * the tail of the dinode buffer is cleared. Otherwise the partial last
+ * block is zeroed if @size is not block aligned, the new size is recorded
+ * and GFS2_DIF_TRUNC_IN_PROG is set in the dinode flags.
+ *
+ * Returns: 1 if the inode was stuffed, 0 if the non-stuffed update was
+ *          made, otherwise the error from the step that failed
+ */
+static int trunc_start(struct gfs2_inode *ip, u64 size)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *dibh;
+        unsigned int blocks = RES_DINODE;
+        int error;
+        if (gfs2_is_jdata(ip))
+                blocks += RES_JDATA;
+        error = gfs2_trans_begin(sdp, blocks, 0);
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out_end_trans;
+        if (gfs2_is_stuffed(ip)) {
+                ip->i_di.di_size = size;
+                ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+                error = 1;
+                goto out_brelse;
+        }
+        /* Not block aligned: zero the remainder of the last block */
+        if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+                error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
+                if (error)
+                        goto out_brelse;
+        }
+        ip->i_di.di_size = size;
+        ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+        ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+out_brelse:
+        brelse(dibh);
+out_end_trans:
+        gfs2_trans_end(sdp);
+        return error;
+}
+/**
+ * trunc_dealloc - strip the metadata tree of an inode beyond a given size
+ * @ip: the inode
+ * @size: the size to keep; zero strips everything
+ *
+ * Runs recursive_scan() with do_strip once per tree height, starting with
+ * the largest height and working down to zero.
+ *
+ * Returns: errno
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+{
+        unsigned int remaining = ip->i_di.di_height;
+        struct metapath mp;
+        u64 lblock = 0;
+        int error;
+        if (size)
+                lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+        find_metapath(ip, lblock, &mp);
+        gfs2_alloc_get(ip);
+        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (!error) {
+                while (remaining > 0) {
+                        struct strip_mine sm;
+                        remaining--;
+                        sm.sm_first = (size != 0);
+                        sm.sm_height = remaining;
+                        error = recursive_scan(ip, NULL, &mp, 0, 0, 1,
+                                               do_strip, &sm);
+                        if (error)
+                                break;
+                }
+                gfs2_quota_unhold(ip);
+        }
+        gfs2_alloc_put(ip);
+        return error;
+}
+/**
+ * trunc_end - final phase of shrinking a file
+ * @ip: the inode
+ *
+ * Clears GFS2_DIF_TRUNC_IN_PROG and updates the timestamps. If the file is
+ * now empty the tree height is reset to zero, both allocation goals are
+ * pointed at the inode's own address and the dinode tail is cleared.
+ *
+ * Returns: errno
+ */
+static int trunc_end(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *dibh;
+        int error;
+        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (error)
+                return error;
+        down_write(&ip->i_rw_mutex);
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                if (ip->i_di.di_size == 0) {
+                        ip->i_di.di_height = 0;
+                        ip->i_di.di_goal_data = ip->i_num.no_addr;
+                        ip->i_di.di_goal_meta = ip->i_di.di_goal_data;
+                        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+                }
+                ip->i_di.di_ctime = get_seconds();
+                ip->i_di.di_mtime = ip->i_di.di_ctime;
+                ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        up_write(&ip->i_rw_mutex);
+        gfs2_trans_end(sdp);
+        return error;
+}
+/**
+ * do_shrink - make a file smaller
+ * @ip: the inode
+ * @size: the size to make the file
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * A positive return from trunc_start() means it already finished the job,
+ * so the deallocation and end phases are skipped.
+ *
+ * Returns: errno
+ */
+static int do_shrink(struct gfs2_inode *ip, u64 size)
+{
+        int ret = trunc_start(ip, size);
+        if (ret != 0)
+                return (ret < 0) ? ret : 0;
+        ret = trunc_dealloc(ip, size);
+        if (ret)
+                return ret;
+        return trunc_end(ip);
+}
+/**
+ * gfs2_truncatei - make a file a given size
+ * @ip: the inode
+ * @size: the size to make the file
+ *
+ * The file size can grow, shrink, or stay the same size. Only regular
+ * files are accepted.
+ *
+ * Returns: errno (-EINVAL if @ip is not a regular file)
+ */
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+{
+        if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
+                return -EINVAL;
+        /* An unchanged size goes down the shrink path */
+        return (size > ip->i_di.di_size) ? do_grow(ip, size) :
+                                           do_shrink(ip, size);
+}
+/**
+ * gfs2_truncatei_resume - redo the deallocation and end phases of a truncate
+ * @ip: the inode
+ *
+ * Uses the size already recorded in the dinode as the truncation point.
+ *
+ * Returns: errno
+ */
+int gfs2_truncatei_resume(struct gfs2_inode *ip)
+{
+        int error = trunc_dealloc(ip, ip->i_di.di_size);
+        return error ? error : trunc_end(ip);
+}
+/**
+ * gfs2_file_dealloc - deallocate everything below an inode's dinode
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+int gfs2_file_dealloc(struct gfs2_inode *ip)
+{
+        const u64 new_size = 0;
+        return trunc_dealloc(ip, new_size);
+}
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ * Directories are sized in units of sd_jbsize and use sd_max_jheight; other
+ * inodes use the filesystem block size and sd_max_height.
+ */
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+                            unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned int max_height;
+        unsigned int level;
+        if (gfs2_is_dir(ip)) {
+                *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
+                max_height = sdp->sd_max_jheight;
+        } else {
+                *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+                max_height = sdp->sd_max_height;
+        }
+        *ind_blocks = 3 * (max_height - 1);
+        /* Add one layer of indirect blocks at a time until it fits the dinode */
+        level = *data_blocks;
+        while (level > sdp->sd_diptrs) {
+                level = DIV_ROUND_UP(level, sdp->sd_inptrs);
+                *ind_blocks += level;
+        }
+}
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * A stuffed inode needs an allocation only when the write ends beyond the
+ * space left in the dinode block. Otherwise the range is walked extent by
+ * extent and the first unmapped block found means an allocation is needed.
+ *
+ * Returns: errno
+ */
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+                              unsigned int len, int *alloc_required)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        u64 lblock, lblock_stop, dblock;
+        u32 extlen;
+        int new = 0;
+        int error = 0;
+        *alloc_required = 0;
+        if (len == 0)
+                return 0;
+        if (gfs2_is_stuffed(ip)) {
+                if (offset + len >
+                    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+                        *alloc_required = 1;
+                return 0;
+        }
+        /* Work out the half-open range of logical blocks being written */
+        if (gfs2_is_dir(ip)) {
+                unsigned int bsize = sdp->sd_jbsize;
+                lblock = offset;
+                do_div(lblock, bsize);
+                lblock_stop = offset + len + bsize - 1;
+                do_div(lblock_stop, bsize);
+        } else {
+                unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+                lblock = offset >> shift;
+                lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+        }
+        while (lblock < lblock_stop) {
+                error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+                if (error)
+                        return error;
+                if (!dblock) {
+                        *alloc_required = 1;
+                        return 0;
+                }
+                lblock += extlen;
+        }
+        return 0;
+}
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..0fd379b4cd9e
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __BMAP_DOT_H__
+#define __BMAP_DOT_H__
+/*
+ * Forward declarations for every struct named in the prototypes below, so
+ * that none of them is first declared inside a parameter list.
+ */
+struct inode;
+struct gfs2_inode;
+struct page;
+struct buffer_head;
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+/* Logical block to disk block mapping */
+int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh, unsigned int maxlen);
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
+/* Changing the size of a file and deallocating its blocks */
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+/* Sizing a write before it is made */
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+                            unsigned int *data_blocks,
+                            unsigned int *ind_blocks);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+                              unsigned int len, int *alloc_required);
+#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "log.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+/* This uses schedule_timeout() instead of msleep() because it's good for
+   the daemons to wake up more often than the timeout when unmounting so
+   the user's unmount doesn't sit there forever.
+   The kthread functions used to start these daemons block and flush signals. */
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @data: Pointer to GFS2 superblock (a struct gfs2_sbd)
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ *
+ * Returns: 0 once the kthread has been asked to stop
+ */
+int gfs2_scand(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        unsigned long timeout;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                gfs2_scand_internal(sdp);
+                timeout = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
+                schedule_timeout_interruptible(timeout);
+        }
+        return 0;
+}
+/**
+ * gfs2_glockd - Reclaim unused glock structures
+ * @data: Pointer to GFS2 superblock (a struct gfs2_sbd)
+ *
+ * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
+ * Number of daemons can be set by user, with num_glockd mount option.
+ *
+ * Returns: 0 once the kthread has been asked to stop
+ */
+int gfs2_glockd(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                /* Drain everything currently queued for reclaim */
+                while (atomic_read(&sdp->sd_reclaim_count) != 0)
+                        gfs2_reclaim_glock(sdp);
+                /* Sleep until there is more work or we are told to stop */
+                wait_event_interruptible(sdp->sd_reclaim_wq,
+                                         (atomic_read(&sdp->sd_reclaim_count) ||
+                                         kthread_should_stop()));
+        }
+        return 0;
+}
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @data: Pointer to GFS2 superblock (a struct gfs2_sbd)
+ *
+ * Returns: 0 once the kthread has been asked to stop
+ */
+int gfs2_recoverd(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        unsigned long timeout;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                gfs2_check_journals(sdp);
+                timeout = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
+                schedule_timeout_interruptible(timeout);
+        }
+        return 0;
+}
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @data: Pointer to GFS2 superblock (a struct gfs2_sbd)
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ *
+ * Returns: 0 once the kthread has been asked to stop
+ */
+int gfs2_logd(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        struct gfs2_holder ji_gh;
+        unsigned long when;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                /* Advance the log tail */
+                when = sdp->sd_log_flush_time +
+                       gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+                gfs2_ail1_empty(sdp, DIO_ALL);
+                if (time_after_eq(jiffies, when)) {
+                        gfs2_log_flush(sdp, NULL);
+                        sdp->sd_log_flush_time = jiffies;
+                }
+                /* Check for latest journal index */
+                when = sdp->sd_jindex_refresh_time +
+                       gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+                if (time_after_eq(jiffies, when)) {
+                        /* Taking and dropping the hold is all that is needed */
+                        if (gfs2_jindex_hold(sdp, &ji_gh) == 0)
+                                gfs2_glock_dq_uninit(&ji_gh);
+                        sdp->sd_jindex_refresh_time = jiffies;
+                }
+                when = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+                schedule_timeout_interruptible(when);
+        }
+        return 0;
+}
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @data: Pointer to GFS2 superblock (a struct gfs2_sbd)
+ *
+ * Each pass syncs the statfs file and the quota file when their respective
+ * intervals have elapsed, then runs gfs2_quota_scan(). Sync errors other
+ * than -EROFS are logged unless SDF_SHUTDOWN is set.
+ *
+ * Returns: 0 once the kthread has been asked to stop
+ */
+int gfs2_quotad(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        unsigned long due;
+        int error;
+        for (;;) {
+                if (kthread_should_stop())
+                        break;
+                /* Update the master statfs file */
+                due = sdp->sd_statfs_sync_time +
+                      gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+                if (time_after_eq(jiffies, due)) {
+                        error = gfs2_statfs_sync(sdp);
+                        if (error && error != -EROFS) {
+                                if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                                        fs_err(sdp, "quotad: (1) error=%d\n", error);
+                        }
+                        sdp->sd_statfs_sync_time = jiffies;
+                }
+                /* Update quota file */
+                due = sdp->sd_quota_sync_time +
+                      gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
+                if (time_after_eq(jiffies, due)) {
+                        error = gfs2_quota_sync(sdp);
+                        if (error && error != -EROFS) {
+                                if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                                        fs_err(sdp, "quotad: (2) error=%d\n", error);
+                        }
+                        sdp->sd_quota_sync_time = jiffies;
+                }
+                gfs2_quota_scan(sdp);
+                due = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
+                schedule_timeout_interruptible(due);
+        }
+        return 0;
+}
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __DAEMON_DOT_H__
+#define __DAEMON_DOT_H__
+int gfs2_scand(void *data);     /* looks for cached glocks and inodes to toss */
+int gfs2_glockd(void *data);    /* reclaims unused glock structures */
+int gfs2_recoverd(void *data);  /* recovers dead machines' journals */
+int gfs2_logd(void *data);      /* advances the log tail, refreshes the jindex */
+int gfs2_quotad(void *data);    /* syncs statfs and quota changes */
+#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..459498cac93b
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1961 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+/*
+ * Implements Extendible Hashing as described in:
+ *   "Extendible Hashing" by Fagin, et al in
+ *     __ACM Trans. on Database Systems__, Sept 1979.
+ *
+ *
+ * Here's the layout of dirents which is essentially the same as that of ext2
+ * within a single block. The field de_name_len is the number of bytes
+ * actually required for the name (no null terminator). The field de_rec_len
+ * is the number of bytes allocated to the dirent. The offset of the next
+ * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+ * deleted, the preceding dirent inherits its allocated space, ie
+ * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+ * by adding de_rec_len to the current dirent, this essentially causes the
+ * deleted dirent to get jumped over when iterating through all the dirents.
+ *
+ * When deleting the first dirent in a block, there is no previous dirent so
+ * the field de_ino is set to zero to designate it as deleted. When allocating
+ * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
+ * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
+ * dirent is allocated. Otherwise it must go through all the 'used' dirents
+ * searching for one in which the amount of total space minus the amount of
+ * used space will provide enough space for the new dirent.
+ *
+ * There are two types of blocks in which dirents reside. In a stuffed dinode,
+ * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+ * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+ * beginning of the leaf block. The dirents reside in leaves when
+ *
+ * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ *
+ * Otherwise, the dirents are "linear", within a single stuffed dinode block.
+ *
+ * When the dirents are in leaves, the actual contents of the directory file are
+ * used as an array of 64-bit block pointers pointing to the leaf blocks. The
+ * dirents are NOT in the directory file itself. There can be more than one
+ * block pointer in the array that points to the same leaf. In fact, when a
+ * directory is first converted from linear to exhash, all of the pointers
+ * point to the same leaf.
+ *
+ * When a leaf is completely full, the size of the hash table can be
+ * doubled unless it is already at the maximum size which is hard coded into
+ * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
+ * but never before the maximum hash table size has been reached.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/vmalloc.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "bmap.h"
+#include "util.h"
+/* Values returned by dirent_first() to say what kind of block was found */
+#define IS_LEAF     1 /* Hashed (leaf) directory */
+#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
+/* A directory offset is the hash shifted right by one bit ... */
+#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
+/* ... and shifting an offset left by one bit (truncated to 32 bits) gives the hash back */
+#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+/* Callback taking a directory, a hash table index and length, a leaf block number and opaque data */
+typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
+                            u64 leaf_no, void *data);
+/*
+ * Per-dirent callback used by gfs2_dirent_scan().  Returning 0 continues
+ * the scan; 1 selects the current dirent; 2 selects the previous dirent
+ * (or the current one if there is no previous); a negative value is
+ * passed back to the caller as an error.
+ */
+typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
+                            const struct qstr *name, void *opaque);
+/*
+ * Get a fresh metadata buffer for @block of directory @ip, add it to the
+ * current transaction, stamp it as a journaled-data (JD) block and zero
+ * everything after the metadata header.  The buffer is handed back
+ * through @bhp.  Always returns 0.
+ */
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+                            struct buffer_head **bhp)
+{
+        struct buffer_head *fresh = gfs2_meta_new(ip->i_gl, block);
+        gfs2_trans_add_bh(ip->i_gl, fresh, 1);
+        gfs2_metatype_set(fresh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
+        gfs2_buffer_clear_tail(fresh, sizeof(struct gfs2_meta_header));
+        *bhp = fresh;
+        return 0;
+}
+/*
+ * Read the existing metadata block @block of directory @ip and make sure
+ * it is a journaled-data (JD) block.  On success the buffer is returned
+ * through @bhp; on a type mismatch the buffer is released and -EIO is
+ * returned.  A read failure is returned unchanged.
+ */
+static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
+                                        struct buffer_head **bhp)
+{
+        struct buffer_head *found;
+        int rc = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &found);
+        if (rc)
+                return rc;
+        if (!gfs2_metatype_check(GFS2_SB(&ip->i_inode), found, GFS2_METATYPE_JD)) {
+                *bhp = found;
+                return 0;
+        }
+        brelse(found);
+        return -EIO;
+}
+/*
+ * Copy @size bytes from @buf into the dinode block of a stuffed
+ * directory, @offset bytes past the on-disk dinode structure.  Grows
+ * di_size if the write extends past it, refreshes mtime/ctime and writes
+ * the in-core dinode back into the block.
+ *
+ * Returns: @size on success, or the error from reading the dinode block
+ */
+static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
+                                  unsigned int offset, unsigned int size)
+{
+        struct buffer_head *dibh;
+        unsigned int end = offset + size;
+        int rc = gfs2_meta_inode_buffer(ip, &dibh);
+        if (rc)
+                return rc;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
+        if (ip->i_di.di_size < end)
+                ip->i_di.di_size = end;
+        ip->i_di.di_ctime = get_seconds();
+        ip->i_di.di_mtime = ip->i_di.di_ctime;
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        return size;
+}
+/**
+ * gfs2_dir_write_data - Write directory information to the inode
+ * @ip: The GFS2 inode
+ * @buf: The buffer containing information to be written
+ * @offset: The file offset to start writing at
+ * @size: The amount of data to write
+ *
+ * A write to a stuffed inode that still fits in the dinode block is
+ * handed to gfs2_dir_write_stuffed().  Otherwise the inode is unstuffed
+ * if necessary and the data is copied block by block, each piece landing
+ * after that block's metadata header.  If some data was copied before a
+ * failure, the dinode is still updated and the partial count returned.
+ *
+ * Returns: The number of bytes correctly written or error code
+ */
+static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
+                               u64 offset, unsigned int size)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *dibh;
+        u64 lblock, dblock;
+        u32 extlen = 0;
+        unsigned int o;
+        int copied = 0;
+        int error = 0;
+        /* Result of the most recent extent mapping.  One mapping covers
+           extlen blocks, so this must keep its value across iterations
+           instead of being an uninitialised per-iteration variable.  */
+        int new = 0;
+        if (!size)
+                return 0;
+        if (gfs2_is_stuffed(ip) &&
+            offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+                return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
+                                              size);
+        if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+                return -EINVAL;
+        if (gfs2_is_stuffed(ip)) {
+                error = gfs2_unstuff_dinode(ip, NULL);
+                if (error)
+                        return error;
+        }
+        /* Split the offset into a logical block and an offset within it */
+        lblock = offset;
+        o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+        while (copied < size) {
+                unsigned int amount;
+                struct buffer_head *bh;
+                amount = size - copied;
+                if (amount > sdp->sd_sb.sb_bsize - o)
+                        amount = sdp->sd_sb.sb_bsize - o;
+                if (!extlen) {
+                        new = 1;
+                        error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+                                                &dblock, &extlen);
+                        if (error)
+                                goto fail;
+                        error = -EIO;
+                        if (gfs2_assert_withdraw(sdp, dblock))
+                                goto fail;
+                }
+                /* A whole-block overwrite or a newly mapped block does not
+                   need the old contents read in first.  */
+                if (amount == sdp->sd_jbsize || new)
+                        error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
+                else
+                        error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
+                if (error)
+                        goto fail;
+                gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                memcpy(bh->b_data + o, buf, amount);
+                brelse(bh);
+                buf += amount;
+                copied += amount;
+                lblock++;
+                dblock++;
+                extlen--;
+                o = sizeof(struct gfs2_meta_header);
+        }
+out:
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        if (ip->i_di.di_size < offset + copied)
+                ip->i_di.di_size = offset + copied;
+        ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        return copied;
+fail:
+        if (copied)
+                goto out;
+        return error;
+}
+/*
+ * Copy @size bytes out of the dinode block of a stuffed directory,
+ * starting @offset bytes past the on-disk dinode structure.
+ *
+ * Returns: @size on success, or the error from reading the dinode block
+ */
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
+                                 u64 offset, unsigned int size)
+{
+        struct buffer_head *dibh;
+        int rc = gfs2_meta_inode_buffer(ip, &dibh);
+        if (rc)
+                return rc;
+        memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode) + offset, size);
+        brelse(dibh);
+        return size;
+}
+/**
+ * gfs2_dir_read_data - Read data from a directory inode
+ * @ip: The GFS2 Inode
+ * @buf: The buffer to place result into
+ * @offset: File offset to begin reading from
+ * @size: Amount of data to transfer
+ * @ra: If zero, the extent passed to gfs2_meta_ra() is limited to a
+ *      single block; otherwise the whole mapped extent is passed
+ *
+ * The request is clipped to di_size.  Stuffed directories are served by
+ * gfs2_dir_read_stuffed(); otherwise the data is copied block by block
+ * from after each block's metadata header.
+ *
+ * Returns: The amount of data actually copied or the error
+ */
+static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
+                              unsigned int size, unsigned ra)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        u64 lblock, dblock;
+        u32 extlen = 0;
+        unsigned int o;
+        int copied = 0;
+        int error = 0;
+        if (offset >= ip->i_di.di_size)
+                return 0;
+        if (offset + size > ip->i_di.di_size)
+                size = ip->i_di.di_size - offset;
+        if (!size)
+                return 0;
+        if (gfs2_is_stuffed(ip))
+                return gfs2_dir_read_stuffed(ip, buf, offset, size);
+        if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+                return -EINVAL;
+        /* Split the offset into a logical block and an offset within it */
+        lblock = offset;
+        o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+        while (copied < size) {
+                unsigned int amount;
+                /* Must start out NULL on every pass: when we are still
+                   inside a previously mapped extent nothing below assigns
+                   it before the !bh test.  */
+                struct buffer_head *bh = NULL;
+                int new;
+                amount = size - copied;
+                if (amount > sdp->sd_sb.sb_bsize - o)
+                        amount = sdp->sd_sb.sb_bsize - o;
+                if (!extlen) {
+                        new = 0;
+                        error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+                                                &dblock, &extlen);
+                        if (error || !dblock)
+                                goto fail;
+                        BUG_ON(extlen < 1);
+                        if (!ra)
+                                extlen = 1;
+                        bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+                }
+                if (!bh) {
+                        error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+                        if (error)
+                                goto fail;
+                }
+                error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
+                if (error) {
+                        brelse(bh);
+                        goto fail;
+                }
+                dblock++;
+                extlen--;
+                memcpy(buf, bh->b_data + o, amount);
+                brelse(bh);
+                buf += amount;
+                copied += amount;
+                lblock++;
+                o = sizeof(struct gfs2_meta_header);
+        }
+        return copied;
+fail:
+        /* Report a partial read in preference to the error */
+        return (copied) ? copied : error;
+}
+/*
+ * Compare one dirent against @name.  Returns @ret when the dirent is in
+ * use (non-zero inode address) and its hash, name length and name bytes
+ * all match; returns 0 otherwise.
+ */
+static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
+                                     const struct qstr *name, int ret)
+{
+        if (dent->de_inum.no_addr == 0)
+                return 0;
+        if (be32_to_cpu(dent->de_hash) != name->hash)
+                return 0;
+        if (be16_to_cpu(dent->de_name_len) != name->len)
+                return 0;
+        /* The name is stored immediately after the dirent structure */
+        if (memcmp(dent+1, name->name, name->len) != 0)
+                return 0;
+        return ret;
+}
+/*
+ * Scan callback: returns 1 when @dent matches @name, so that
+ * gfs2_dirent_scan() hands back the matching dirent itself.
+ * @opaque is unused.
+ */
+static int gfs2_dirent_find(const struct gfs2_dirent *dent,
+                            const struct qstr *name,
+                            void *opaque)
+{
+        const int want_this_dirent = 1;
+        return __gfs2_dirent_find(dent, name, want_this_dirent);
+}
+/*
+ * Scan callback: returns 2 when @dent matches @name, so that
+ * gfs2_dirent_scan() hands back the dirent preceding the match (or the
+ * match itself when it is the first in the block).  @opaque is unused.
+ */
+static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
+                            const struct qstr *name,
+                            void *opaque)
+{
+        const int want_prev_dirent = 2;
+        return __gfs2_dirent_find(dent, name, want_prev_dirent);
+}
+/*
+ * Scan callback that matches the dirent whose record ends exactly at
+ * the end of the block.  The qstr is used as a carrier here:
+ * name->name holds ptr to start of block.
+ * name->len holds size of block.
+ * @opaque is unused.
+ */
+static int gfs2_dirent_last(const struct gfs2_dirent *dent,
+                            const struct qstr *name,
+                            void *opaque)
+{
+        const char *block_start = name->name;
+        const char *dent_end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
+        return (name->len == (dent_end - block_start)) ? 1 : 0;
+}
+/*
+ * Scan callback: returns 1 when the slack in @dent (its record length
+ * minus the space its own entry occupies) can hold a new dirent for
+ * @name, and 0 otherwise.  An unused dirent (zero inode address) is
+ * counted as occupying only GFS2_DIRENT_SIZE(0).  @opaque is unused.
+ */
+static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
+                                  const struct qstr *name,
+                                  void *opaque)
+{
+        unsigned needed = GFS2_DIRENT_SIZE(name->len);
+        unsigned rec_len = be16_to_cpu(dent->de_rec_len);
+        unsigned in_use;
+        if (dent->de_inum.no_addr)
+                in_use = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+        else
+                in_use = GFS2_DIRENT_SIZE(0);
+        return (rec_len - in_use >= needed) ? 1 : 0;
+}
+/* State passed as the opaque argument to gfs2_dirent_gather() */
+struct dirent_gather {
+        const struct gfs2_dirent **pdent; /* array that collected dirent pointers are stored into */
+        unsigned offset;                  /* index of the next slot to fill in pdent */
+};
+/*
+ * Scan callback: append every in-use dirent (non-zero inode address) to
+ * the array in the struct dirent_gather passed as @opaque.  Always
+ * returns 0 so the scan visits the whole block.  @name is unused.
+ */
+static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
+                              const struct qstr *name,
+                              void *opaque)
+{
+        struct dirent_gather *gather = opaque;
+        if (!dent->de_inum.no_addr)
+                return 0;
+        gather->pdent[gather->offset] = dent;
+        gather->offset++;
+        return 0;
+}
+/*
+ * Sanity check one dirent.  @offset is its position in the block, @size
+ * its record length, @len the block length and @first is non-zero for
+ * the first dirent of the block (the only one allowed to have a zero
+ * inode address).  Logs a warning and returns -EIO on the first failed
+ * check, 0 if all pass.
+ *
+ * Other possible things to check:
+ * - Inode located within filesystem size (and on valid block)
+ * - Valid directory entry type
+ * Not sure how heavy-weight we want to make this... could also check
+ * hash is correct for example, but that would take a lot of extra time.
+ * For now the most important thing is to check that the various sizes
+ * are correct.
+ */
+static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+                             unsigned int size, unsigned int len, int first)
+{
+        const char *msg;
+        if (unlikely(size < sizeof(struct gfs2_dirent)))
+                msg = "gfs2_dirent too small";
+        else if (unlikely(offset & 0x7))
+                msg = "gfs2_dirent misaligned";
+        else if (unlikely(offset + size > len))
+                msg = "gfs2_dirent points beyond end of block";
+        else if (unlikely(!first && !dent->de_inum.no_addr))
+                msg = "zero inode number";
+        else if (dent->de_inum.no_addr &&
+                 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
+                          size))
+                msg = "name length is greater than space in dirent";
+        else
+                return 0;
+        printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
+               first ? "first in block" : "not first in block");
+        return -EIO;
+}
+/*
+ * Return the offset of the first dirent in the metadata block at @buf:
+ * the size of the leaf header for a leaf block, or the size of the
+ * dinode for a dinode block.  Any other block type is logged and -1 is
+ * returned.  @buf must not be NULL.
+ */
+static int gfs2_dirent_offset(const void *buf)
+{
+        const struct gfs2_meta_header *hdr = buf;
+        BUG_ON(buf == NULL);
+        if (be32_to_cpu(hdr->mh_type) == GFS2_METATYPE_LF)
+                return sizeof(struct gfs2_leaf);
+        if (be32_to_cpu(hdr->mh_type) == GFS2_METATYPE_DI)
+                return sizeof(struct gfs2_dinode);
+        printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
+               be32_to_cpu(hdr->mh_type));
+        return -1;
+}
+/*
+ * Walk the dirents in the block at @buf (@len bytes), calling @scan on
+ * each one until it returns non-zero or the end of the block is reached.
+ * The starting offset comes from gfs2_dirent_offset() and every dirent
+ * is validated with gfs2_check_dirent() before @scan sees it.
+ *
+ * Returns: NULL if @scan never returned non-zero; the current dirent if
+ * @scan returned 1; the preceding dirent (or the current one when there
+ * is no preceding dirent) if @scan returned 2; ERR_PTR(ret) if @scan
+ * returned a negative value; ERR_PTR(-EIO), after flagging the inode as
+ * inconsistent, if the block type or a dirent fails validation.
+ */
+static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
+                                            unsigned int len, gfs2_dscan_t scan,
+                                            const struct qstr *name,
+                                            void *opaque)
+{
+        struct gfs2_dirent *dent, *prev;
+        unsigned offset;
+        unsigned size;
+        int ret = 0;
+        ret = gfs2_dirent_offset(buf);
+        if (ret < 0)
+                goto consist_inode;
+        offset = ret;
+        prev = NULL;
+        dent = buf + offset;
+        size = be16_to_cpu(dent->de_rec_len);
+        /* Only the first dirent may legitimately have a zero inode address */
+        if (gfs2_check_dirent(dent, offset, size, len, 1))
+                goto consist_inode;
+        do {
+                ret = scan(dent, name, opaque);
+                if (ret)
+                        break;
+                offset += size;
+                /* The last record ends exactly at the end of the block */
+                if (offset == len)
+                        break;
+                prev = dent;
+                dent = buf + offset;
+                size = be16_to_cpu(dent->de_rec_len);
+                if (gfs2_check_dirent(dent, offset, size, len, 0))
+                        goto consist_inode;
+        } while(1);
+        switch(ret) {
+        case 0:
+                return NULL;
+        case 1:
+                return dent;
+        case 2:
+                return prev ? prev : dent;
+        default:
+                /* Anything other than 0, 1 or 2 must be a negative errno */
+                BUG_ON(ret > 0);
+                return ERR_PTR(ret);
+        }
+consist_inode:
+        gfs2_consist_inode(GFS2_I(inode));
+        return ERR_PTR(-EIO);
+}
+/**
+ * dirent_first - Return the first dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Set to the first dirent in the buffer
+ *
+ * Locates the first dirent whether bh holds a leaf or a stuffed dinode:
+ * it sits right after the leaf header or right after the dinode.
+ *
+ * Returns: IS_LEAF, IS_DINODE, or -errno
+ */
+static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
+                        struct gfs2_dirent **dent)
+{
+        struct gfs2_meta_header *hdr = (struct gfs2_meta_header *)bh->b_data;
+        if (be32_to_cpu(hdr->mh_type) != GFS2_METATYPE_LF) {
+                /* Anything that is not a leaf has to be a dinode */
+                if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
+                        return -EIO;
+                *dent = (struct gfs2_dirent *)(bh->b_data +
+                                               sizeof(struct gfs2_dinode));
+                return IS_DINODE;
+        }
+        if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
+                return -EIO;
+        *dent = (struct gfs2_dirent *)(bh->b_data +
+                                       sizeof(struct gfs2_leaf));
+        return IS_LEAF;
+}
+/*
+ * Validate the record length of dirent @d against the end of its block,
+ * @end_p.
+ *
+ * Returns: the record length if another dirent follows, -ENOENT if the
+ * record ends exactly at @end_p, or -EIO (after flagging the inode as
+ * inconsistent) if the record is shorter than a dirent structure or
+ * runs past @end_p.
+ */
+static int dirent_check_reclen(struct gfs2_inode *dip,
+                               const struct gfs2_dirent *d, const void *end_p)
+{
+        u16 rec_len = be16_to_cpu(d->de_rec_len);
+        if (!unlikely(rec_len < sizeof(struct gfs2_dirent))) {
+                const void *rec_end = (const void *)d + rec_len;
+                if (rec_end < end_p)
+                        return rec_len;
+                if (rec_end == end_p)
+                        return -ENOENT;
+        }
+        gfs2_consist_inode(dip);
+        return -EIO;
+}
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: On entry the current dirent; on success advanced to the next one
+ *
+ * Returns: 0 on success, error code otherwise (-ENOENT when the current
+ * dirent is the last one in the buffer)
+ */
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+                       struct gfs2_dirent **dent)
+{
+        struct gfs2_dirent *here = *dent;
+        struct gfs2_dirent *following;
+        char *limit = bh->b_data + bh->b_size;
+        int rc = dirent_check_reclen(dip, here, limit);
+        if (rc < 0)
+                return rc;
+        following = (void *)here + rc;
+        /* The next dirent may itself be the last (-ENOENT); only a
+           broken record length is an error here.  */
+        rc = dirent_check_reclen(dip, following, limit);
+        if (rc == -EIO)
+                return rc;
+        /* Only the first dent could ever have de_inum.no_addr == 0 */
+        if (!following->de_inum.no_addr) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        *dent = following;
+        return 0;
+}
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent, or NULL if @cur is the first in the block
+ * @cur: The current dirent
+ *
+ * The first dirent in a block is deleted by zeroing its inode address;
+ * any other dirent is deleted by adding its record length to @prev.
+ */
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+                       struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+        u16 cur_rec_len, prev_rec_len;
+        /* Deleting an already-unused dirent means the directory is damaged */
+        if (!cur->de_inum.no_addr) {
+                gfs2_consist_inode(dip);
+                return;
+        }
+        gfs2_trans_add_bh(dip->i_gl, bh, 1);
+        /* If there is no prev entry, this is the first entry in the block.
+           The de_rec_len is already as big as it needs to be.  Just zero
+           out the inode number and return.  */
+        if (!prev) {
+                cur->de_inum.no_addr = 0;       /* No endianness worries */
+                return;
+        }
+        /*  Combine this dentry with the previous one.  */
+        prev_rec_len = be16_to_cpu(prev->de_rec_len);
+        cur_rec_len = be16_to_cpu(cur->de_rec_len);
+        /* Both checks below only flag the inode; the merge still happens */
+        if ((char *)prev + prev_rec_len != (char *)cur)
+                gfs2_consist_inode(dip);
+        if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
+                gfs2_consist_inode(dip);
+        prev_rec_len += cur_rec_len;
+        prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+/*
+ * Takes a dent from which to grab space as an argument. Returns the
+ * newly created dent.
+ *
+ * If @dent is in use, the new dirent starts right after the space its
+ * own entry needs; if @dent is unused, the new dirent is placed on top
+ * of it.  @dent's record length is cut back to that starting offset and
+ * the new dirent takes over the remainder.
+ */
+static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
+                                            struct gfs2_dirent *dent,
+                                            const struct qstr *name,
+                                            struct buffer_head *bh)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_dirent *fresh;
+        unsigned rec_len;
+        unsigned keep = dent->de_inum.no_addr ?
+                GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)) : 0;
+        rec_len = be16_to_cpu(dent->de_rec_len);
+        BUG_ON(keep + name->len > rec_len);
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        fresh = (struct gfs2_dirent *)((char *)dent + keep);
+        /* Shrink the donor first: when keep is 0, fresh and dent are the
+           same dirent and the call below writes the final contents.  */
+        dent->de_rec_len = cpu_to_be16(keep);
+        gfs2_qstr2dirent(name, rec_len - keep, fresh);
+        return fresh;
+}
+/*
+ * Find a dirent in @bh with enough slack for @name and carve a new
+ * dirent out of it.
+ *
+ * Returns: the new dirent, NULL if no dirent in the block has room, or
+ * an ERR_PTR from the scan
+ */
+static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
+                                             struct buffer_head *bh,
+                                             const struct qstr *name)
+{
+        struct gfs2_dirent *slot = gfs2_dirent_scan(inode, bh->b_data,
+                                                    bh->b_size,
+                                                    gfs2_dirent_find_space,
+                                                    name, NULL);
+        if (slot && !IS_ERR(slot))
+                slot = gfs2_init_dirent(inode, slot, name, bh);
+        return slot;
+}
+/*
+ * Read directory leaf block @leaf_no and verify that it really is a
+ * leaf.  On success the buffer is returned through @bhp.
+ *
+ * On a type mismatch the buffer is released here and *@bhp is cleared,
+ * matching gfs2_dir_get_existing_buffer(); the callers do not release
+ * anything when this function fails, so keeping the reference would
+ * leak the buffer.
+ *
+ * Returns: 0 on success, -EIO on a type mismatch, or the read error
+ */
+static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
+                    struct buffer_head **bhp)
+{
+        int error;
+        error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+        if (error)
+                return error;
+        if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
+                /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+                brelse(*bhp);
+                *bhp = NULL;
+                return -EIO;
+        }
+        return 0;
+}
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index: Index into the array of 64-bit leaf pointers held in the
+ *         directory's data
+ * @leaf_out: Set to the leaf block number, converted to CPU byte order
+ *
+ * Returns: 0 on success, error code otherwise (-EIO on a short read)
+ */
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
+                       u64 *leaf_out)
+{
+        u64 raw_ptr;
+        int nread = gfs2_dir_read_data(dip, (char *)&raw_ptr,
+                                       index * sizeof(u64),
+                                       sizeof(u64), 0);
+        if (nread < 0)
+                return nread;
+        if (nread != sizeof(u64))
+                return -EIO;
+        *leaf_out = be64_to_cpu(raw_ptr);
+        return 0;
+}
+/*
+ * Look up the leaf pointer stored at hash table slot @index and read
+ * that leaf block into @bh_out.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+static int get_first_leaf(struct gfs2_inode *dip, u32 index,
+                          struct buffer_head **bh_out)
+{
+        u64 leaf_no;
+        int rc = get_leaf_nr(dip, index, &leaf_no);
+        if (rc)
+                return rc;
+        return get_leaf(dip, leaf_no, bh_out);
+}
+/*
+ * Search directory @inode with the scan callback @scan.
+ *
+ * For an exhash directory the hash table slot is taken from the top
+ * di_depth bits of name->hash, and the leaf found there plus every leaf
+ * chained to it through lf_next is scanned in turn.  Otherwise the
+ * dinode block itself is scanned.
+ *
+ * Returns: the dirent selected by @scan with the buffer containing it
+ * stored in *@pbh (not released here); NULL or an ERR_PTR otherwise.
+ * Note that *@pbh is set to NULL only on the paths that go through
+ * got_dent; the early error returns and the "chain exhausted" return
+ * leave *@pbh untouched.
+ */
+static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
+                                              const struct qstr *name,
+                                              gfs2_dscan_t scan,
+                                              struct buffer_head **pbh)
+{
+        struct buffer_head *bh;
+        struct gfs2_dirent *dent;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        int error;
+        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+                struct gfs2_leaf *leaf;
+                unsigned hsize = 1 << ip->i_di.di_depth;
+                unsigned index;
+                u64 ln;
+                /* The hash table must be exactly 2^di_depth 64-bit pointers */
+                if (hsize * sizeof(u64) != ip->i_di.di_size) {
+                        gfs2_consist_inode(ip);
+                        return ERR_PTR(-EIO);
+                }
+                index = name->hash >> (32 - ip->i_di.di_depth);
+                error = get_first_leaf(ip, index, &bh);
+                if (error)
+                        return ERR_PTR(error);
+                do {
+                        dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+                                                scan, name, NULL);
+                        /* Non-NULL covers both a hit and an ERR_PTR */
+                        if (dent)
+                                goto got_dent;
+                        leaf = (struct gfs2_leaf *)bh->b_data;
+                        ln = be64_to_cpu(leaf->lf_next);
+                        brelse(bh);
+                        if (!ln)
+                                break;
+                        error = get_leaf(ip, ln, &bh);
+                } while(!error);
+                return error ? ERR_PTR(error) : NULL;
+        }
+        error = gfs2_meta_inode_buffer(ip, &bh);
+        if (error)
+                return ERR_PTR(error);
+        dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
+got_dent:
+        /* Keep the buffer only when a real dirent is being returned */
+        if (unlikely(dent == NULL || IS_ERR(dent))) {
+                brelse(bh);
+                bh = NULL;
+        }
+        *pbh = bh;
+        return dent;
+}
+/*
+ * Allocate a metadata block for directory @inode and initialise it as
+ * an empty leaf of the given @depth: leaf header filled in, and a single
+ * unnamed dirent spanning all of the space after the header.
+ *
+ * Returns: the leaf, with its buffer stored in *@pbh, or NULL if no
+ * buffer could be obtained for the new block
+ */
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        u64 blkno = gfs2_alloc_meta(ip);
+        struct buffer_head *leaf_bh = gfs2_meta_new(ip->i_gl, blkno);
+        struct qstr empty = { .name = "", .len = 0, .hash = 0 };
+        struct gfs2_leaf *leaf;
+        if (!leaf_bh)
+                return NULL;
+        gfs2_trans_add_bh(ip->i_gl, leaf_bh, 1);
+        gfs2_metatype_set(leaf_bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+        leaf = (struct gfs2_leaf *)leaf_bh->b_data;
+        leaf->lf_depth = cpu_to_be16(depth);
+        leaf->lf_entries = 0;
+        /* NOTE(review): confirm the on-disk width of lf_dirent_format; if
+           it is a 32-bit field this conversion should be cpu_to_be32.  */
+        leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
+        leaf->lf_next = 0;
+        memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+        /* One empty dirent covers everything after the leaf header */
+        gfs2_qstr2dirent(&empty, leaf_bh->b_size - sizeof(struct gfs2_leaf),
+                         (struct gfs2_dirent *)(leaf + 1));
+        *pbh = leaf_bh;
+        return leaf;
+}
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @inode: The directory inode
+ *
+ * The dirents are copied from the dinode block into a newly allocated
+ * leaf, the last dirent's record length is adjusted for the different
+ * header size, and the dinode block is then refilled with hash table
+ * pointers that all point at that one leaf.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+static int dir_make_exhash(struct inode *inode)
+{
+        struct gfs2_inode *dip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_dirent *dent;
+        struct qstr args;
+        struct buffer_head *bh, *dibh;
+        struct gfs2_leaf *leaf;
+        int y;
+        u32 x;
+        u64 *lp, bn;
+        int error;
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (error)
+                return error;
+        /*  Turn over a new leaf  */
+        leaf = new_leaf(inode, &bh, 0);
+        if (!leaf) {
+                /* Don't leak the dinode buffer obtained above */
+                brelse(dibh);
+                return -ENOSPC;
+        }
+        bn = bh->b_blocknr;
+        gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+        leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+        /*  Copy dirents  */
+        gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+                             sizeof(struct gfs2_dinode));
+        /*  Find last entry  */
+        /* gfs2_dirent_last() takes the block start in args.name and, in
+           args.len, the offset at which the last copied record ends.  */
+        args.len = bh->b_size - sizeof(struct gfs2_dinode) +
+                   sizeof(struct gfs2_leaf);
+        args.name = bh->b_data;
+        dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
+                                gfs2_dirent_last, &args, NULL);
+        if (!dent) {
+                brelse(bh);
+                brelse(dibh);
+                return -EIO;
+        }
+        if (IS_ERR(dent)) {
+                brelse(bh);
+                brelse(dibh);
+                return PTR_ERR(dent);
+        }
+        /*  Adjust the last dirent's record length
+           (Remember that dent still points to the last entry.)  */
+        dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
+                sizeof(struct gfs2_dinode) -
+                sizeof(struct gfs2_leaf));
+        brelse(bh);
+        /*  We're done with the new leaf block, now setup the new
+            hash table.  */
+        gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+        lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+        for (x = sdp->sd_hash_ptrs; x--; lp++)
+                *lp = cpu_to_be64(bn);
+        dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+        dip->i_di.di_blocks++;
+        dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+        dip->i_di.di_payload_format = 0;
+        /* di_depth = floor(log2(sd_hash_ptrs)) */
+        for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+        dip->i_di.di_depth = y;
+        gfs2_dinode_out(&dip->i_di, dibh->b_data);
+        brelse(dibh);
+        return 0;
+}
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @inode: The directory inode
+ * @name: The name whose hash selects the leaf to split
+ *
+ * The leaf is found through the hash table slot that @name->hash maps to
+ * at the directory's current depth.  A new leaf one level deeper than the
+ * old one is allocated, the first half of the hash table pointers that
+ * referred to the old leaf is redirected to it, and every in-use entry
+ * whose hash lies below the divider is moved across.
+ *
+ * Returns: 0 on success, 1 if the leaf is already as deep as the directory
+ *          and so cannot be split, or a negative error code on failure
+ */
+static int dir_split_leaf(struct inode *inode, const struct qstr *name)
+{
+        struct gfs2_inode *dip = GFS2_I(inode);
+        struct buffer_head *nbh, *obh, *dibh;
+        struct gfs2_leaf *nleaf, *oleaf;
+        struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
+        u32 start, len, half_len, divider;
+        u64 bn, *lp, leaf_no;
+        u32 index;
+        int x;
+        int error;
+        int move_error = 0; /* first failure seen while moving entries */
+        index = name->hash >> (32 - dip->i_di.di_depth);
+        error = get_leaf_nr(dip, index, &leaf_no);
+        if (error)
+                return error;
+        /*  Get the old leaf block  */
+        error = get_leaf(dip, leaf_no, &obh);
+        if (error)
+                return error;
+        oleaf = (struct gfs2_leaf *)obh->b_data;
+        if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+                brelse(obh);
+                return 1; /* can't split */
+        }
+        gfs2_trans_add_bh(dip->i_gl, obh, 1);
+        nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+        if (!nleaf) {
+                brelse(obh);
+                return -ENOSPC;
+        }
+        bn = nbh->b_blocknr;
+        /*  Compute the start and len of leaf pointers in the hash table.  */
+        len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+        half_len = len >> 1;
+        if (!half_len) {
+                printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+                gfs2_consist_inode(dip);
+                error = -EIO;
+                goto fail_brelse;
+        }
+        start = (index & ~(len - 1));
+        /* Change the pointers.
+           Don't bother distinguishing stuffed from non-stuffed.
+           This code is complicated enough already. */
+        lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
+        /*  Point the first half of the old leaf's slots at the new leaf  */
+        for (x = 0; x < half_len; x++)
+                lp[x] = cpu_to_be64(bn);
+        error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
+                                    half_len * sizeof(u64));
+        if (error != half_len * sizeof(u64)) {
+                /* A short write is reported as an I/O error */
+                if (error >= 0)
+                        error = -EIO;
+                goto fail_lpfree;
+        }
+        kfree(lp);
+        /*  Compute the divider: hashes below it now belong to the new leaf  */
+        divider = (start + half_len) << (32 - dip->i_di.di_depth);
+        /*  Copy the entries  */
+        dirent_first(dip, obh, &dent);
+        do {
+                /* Find the successor first, since dent may be deleted below */
+                next = dent;
+                if (dirent_next(dip, obh, &next))
+                        next = NULL;
+                if (dent->de_inum.no_addr &&
+                    be32_to_cpu(dent->de_hash) < divider) {
+                        struct qstr str;
+                        str.name = (char*)(dent+1);
+                        str.len = be16_to_cpu(dent->de_name_len);
+                        str.hash = be32_to_cpu(dent->de_hash);
+                        new = gfs2_dirent_alloc(inode, nbh, &str);
+                        if (IS_ERR(new)) {
+                                /* Remember the failure; the status of the
+                                   dinode update below must not hide it. */
+                                move_error = PTR_ERR(new);
+                                break;
+                        }
+                        new->de_inum = dent->de_inum; /* No endian worries */
+                        new->de_type = dent->de_type; /* No endian worries */
+                        nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+                        dirent_del(dip, obh, prev, dent);
+                        if (!oleaf->lf_entries)
+                                gfs2_consist_inode(dip);
+                        oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+                        if (!prev)
+                                prev = dent;
+                } else {
+                        prev = dent;
+                }
+                dent = next;
+        } while (dent);
+        oleaf->lf_depth = nleaf->lf_depth;
+        /* Account for the newly allocated leaf block in the dinode */
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+                dip->i_di.di_blocks++;
+                gfs2_dinode_out(&dip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        brelse(obh);
+        brelse(nbh);
+        /* A dinode buffer failure takes precedence; otherwise report any
+           failure to move an entry instead of claiming success. */
+        return error ? error : move_error;
+fail_lpfree:
+        kfree(lp);
+fail_brelse:
+        brelse(obh);
+        brelse(nbh);
+        return error;
+}
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * Every pointer in the hash table is duplicated into two adjacent slots
+ * and di_depth is then incremented.  The table is processed one
+ * sd_hash_bsize chunk at a time, last chunk first; each expanded chunk is
+ * written back at offset block * sd_sb.sb_bsize.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct buffer_head *dibh;
+        u32 hsize;
+        u64 *buf;
+        u64 *from, *to;
+        u64 block;
+        int x;
+        int error = 0;
+        hsize = 1 << dip->i_di.di_depth;
+        /* The table must hold exactly 2^di_depth pointers */
+        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        /*  Allocate both the "from" and "to" buffers in one big chunk  */
+        /* "from" is the first sd_hash_bsize bytes, "to" is what follows.
+           NOTE(review): assumes sd_sb.sb_bsize == 2 * sd_hash_bsize so that
+           the expanded chunk fits in the remainder - confirm. */
+        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+        /* Walk the chunks from the last one down to chunk 0.
+           NOTE(review): presumably so that an expanded chunk is never
+           written over a chunk that has not been read yet - confirm. */
+        for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+                error = gfs2_dir_read_data(dip, (char *)buf,
+                                            block * sdp->sd_hash_bsize,
+                                            sdp->sd_hash_bsize, 1);
+                if (error != sdp->sd_hash_bsize) {
+                        /* A short read is reported as an I/O error */
+                        if (error >= 0)
+                                error = -EIO;
+                        goto fail;
+                }
+                from = buf;
+                to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
+                /* Each source pointer becomes two consecutive pointers */
+                for (x = sdp->sd_hash_ptrs; x--; from++) {
+                        *to++ = *from;  /*  No endianess worries  */
+                        *to++ = *from;
+                }
+                error = gfs2_dir_write_data(dip,
+                                             (char *)buf + sdp->sd_hash_bsize,
+                                             block * sdp->sd_sb.sb_bsize,
+                                             sdp->sd_sb.sb_bsize);
+                if (error != sdp->sd_sb.sb_bsize) {
+                        /* A short write is reported as an I/O error */
+                        if (error >= 0)
+                                error = -EIO;
+                        goto fail;
+                }
+        }
+        kfree(buf);
+        /* The table is rewritten; record the new depth in the dinode */
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (!gfs2_assert_withdraw(sdp, !error)) {
+                dip->i_di.di_depth++;
+                gfs2_dinode_out(&dip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        return error;
+fail:
+        kfree(buf);
+        return error;
+}
+/**
+ * compare_dents - order two directory entries for sort()
+ * @a: pointer to the first struct gfs2_dirent pointer
+ * @b: pointer to the second struct gfs2_dirent pointer
+ *
+ * Entries are ordered by hash value first.  Entries with equal hashes are
+ * ordered by name length, and entries that also have equal name lengths
+ * are ordered by memcmp() of the name bytes that follow each dirent.
+ *
+ * Returns: 1 or -1 when the hashes or name lengths differ, otherwise the
+ *          result of memcmp() on the two names
+ */
+static int compare_dents(const void *a, const void *b)
+{
+        const struct gfs2_dirent *lhs = *(const struct gfs2_dirent **)a;
+        const struct gfs2_dirent *rhs = *(const struct gfs2_dirent **)b;
+        u32 lhs_hash = be32_to_cpu(lhs->de_hash);
+        u32 rhs_hash = be32_to_cpu(rhs->de_hash);
+        unsigned int lhs_len, rhs_len;
+        /* Primary key: the hash */
+        if (lhs_hash != rhs_hash)
+                return (lhs_hash > rhs_hash) ? 1 : -1;
+        /* Secondary key: the name length */
+        lhs_len = be16_to_cpu(lhs->de_name_len);
+        rhs_len = be16_to_cpu(rhs->de_name_len);
+        if (lhs_len != rhs_len)
+                return (lhs_len > rhs_len) ? 1 : -1;
+        /* Last resort: the name bytes stored right after the dirent */
+        return memcmp(lhs + 1, rhs + 1, lhs_len);
+}
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr (must be at least 1, since
+ *           darr[0] is read unconditionally)
+ * @copied: pointer to int that's non-zero if an entry has been copied out
+ *
+ * Jump through some hoops to make sure that if there are hash collisions,
+ * they are read out at the beginning of a buffer.  We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * Returns: 0 once every entry has been handled, 1 when stopping early
+ *          (filldir returned non-zero, or a run of colliding entries
+ *          would otherwise begin after something was already copied)
+ */
+static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
+                           void *opaque, gfs2_filldir_t filldir,
+                           const struct gfs2_dirent **darr, u32 entries,
+                           int *copied)
+{
+        const struct gfs2_dirent *dent, *dent_next;
+        struct gfs2_inum inum;
+        u64 off, off_next;
+        unsigned int x, y;
+        int run = 0;
+        int error = 0;
+        /* Order the entries by hash (see compare_dents) */
+        sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+        /* Prime the one-entry lookahead with the first entry */
+        dent_next = darr[0];
+        off_next = be32_to_cpu(dent_next->de_hash);
+        off_next = gfs2_disk_hash2offset(off_next);
+        /* x indexes the current entry, y the entry after it */
+        for (x = 0, y = 1; x < entries; x++, y++) {
+                dent = dent_next;
+                off = off_next;
+                if (y < entries) {
+                        dent_next = darr[y];
+                        off_next = be32_to_cpu(dent_next->de_hash);
+                        off_next = gfs2_disk_hash2offset(off_next);
+                        /* Skip entries that lie before the resume position */
+                        if (off < *offset)
+                                continue;
+                        *offset = off;
+                        if (off_next == off) {
+                                /* The next entry shares this offset.  If
+                                   something was already copied and we are
+                                   not yet inside a run, stop here. */
+                                if (*copied && !run)
+                                        return 1;
+                                run = 1;
+                        } else
+                                run = 0;
+                } else {
+                        /* Last entry: there is nothing to look ahead to */
+                        if (off < *offset)
+                                continue;
+                        *offset = off;
+                }
+                gfs2_inum_in(&inum, (char *)&dent->de_inum);
+                /* The name is stored directly after the dirent header */
+                error = filldir(opaque, (const char *)(dent + 1),
+                                be16_to_cpu(dent->de_name_len),
+                                off, &inum,
+                                be16_to_cpu(dent->de_type));
+                if (error)
+                        return 1;
+                *copied = 1;
+        }
+        /* Increment the *offset by one, so the next time we come into the
+           do_filldir fxn, we get the next entry instead of the last one in the
+           current leaf */
+        (*offset)++;
+        return 0;
+}
+/**
+ * gfs2_dir_read_leaf - read out the entries of one chain of leaf blocks
+ * @inode: the directory inode
+ * @offset: the read position, passed through to do_filldir_main()
+ * @opaque: opaque data to pass to filldir
+ * @filldir: the function to pass entries to
+ * @copied: pointer to int that's non-zero if an entry has been copied out
+ * @depth: set to the lf_depth of the first leaf in the chain
+ * @leaf_no: block number of the first leaf in the chain
+ *
+ * The lf_next chain is walked twice: once to count the leaves and their
+ * entries, and once more to gather a pointer to every dirent.  The buffers
+ * of leaves that hold entries stay referenced until the entries have been
+ * handed to do_filldir_main(), because the gathered pointers point into
+ * them.
+ *
+ * Returns: 0 on success (including an empty chain), >0 if
+ *          do_filldir_main() stopped early, or a negative error code
+ */
+static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
+                              gfs2_filldir_t filldir, int *copied,
+                              unsigned *depth, u64 leaf_no)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct buffer_head *bh;
+        struct gfs2_leaf *lf;
+        unsigned entries = 0;
+        unsigned leaves = 0;
+        const struct gfs2_dirent **darr, *dent;
+        struct dirent_gather g;
+        struct buffer_head **larr;
+        int leaf = 0;
+        int error, i;
+        u64 lfn = leaf_no;
+        /* Pass 1: count the leaves in the chain and the entries they hold */
+        do {
+                error = get_leaf(ip, lfn, &bh);
+                if (error)
+                        goto out;
+                lf = (struct gfs2_leaf *)bh->b_data;
+                if (leaves == 0)
+                        *depth = be16_to_cpu(lf->lf_depth);
+                entries += be16_to_cpu(lf->lf_entries);
+                leaves++;
+                lfn = be64_to_cpu(lf->lf_next);
+                brelse(bh);
+        } while(lfn);
+        if (!entries)
+                return 0;
+        error = -ENOMEM;
+        /* One allocation: leaf buffer pointers first, dirent pointers after */
+        larr = vmalloc((leaves + entries) * sizeof(void *));
+        if (!larr)
+                goto out;
+        darr = (const struct gfs2_dirent **)(larr + leaves);
+        g.pdent = darr;
+        g.offset = 0;
+        lfn = leaf_no;
+        /* Pass 2: gather the dirent pointers, keeping non-empty leaves held */
+        do {
+                error = get_leaf(ip, lfn, &bh);
+                if (error)
+                        goto out_kfree;
+                lf = (struct gfs2_leaf *)bh->b_data;
+                lfn = be64_to_cpu(lf->lf_next);
+                if (lf->lf_entries) {
+                        dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+                                                gfs2_dirent_gather, NULL, &g);
+                        if (IS_ERR(dent)) {
+                                error = PTR_ERR(dent);
+                                /* This buffer is not in larr[] yet, so the
+                                   cleanup loop would miss it; drop it now. */
+                                brelse(bh);
+                                goto out_kfree;
+                        }
+                        error = 0;
+                        larr[leaf++] = bh;
+                } else {
+                        brelse(bh);
+                }
+        } while(lfn);
+        error = do_filldir_main(ip, offset, opaque, filldir, darr,
+                                entries, copied);
+out_kfree:
+        /* Release every leaf buffer that was kept for the gathered dirents */
+        for(i = 0; i < leaf; i++)
+                brelse(larr[i]);
+        vfree(larr);
+out:
+        return error;
+}
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @inode: the directory inode
+ * @offset: the hash of the last entry read shifted to the right once
+ * @opaque: buffer for the filldir function to fill
+ * @filldir: points to the filldir function to use
+ *
+ * Starting at the hash table slot that @offset maps to, each distinct leaf
+ * chain is read out with gfs2_dir_read_leaf() until the end of the table
+ * is reached or a leaf read returns non-zero.
+ *
+ * Returns: errno (a positive result from the leaf read is turned into 0)
+ */
+static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
+                      gfs2_filldir_t filldir)
+{
+        struct gfs2_inode *dip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        u32 hsize, len = 0;
+        /* ht_offset_cur starts at -1 so that the first pass loads a chunk */
+        u32 ht_offset, lp_offset, ht_offset_cur = -1;
+        u32 hash, index;
+        u64 *lp;
+        int copied = 0;
+        int error = 0;
+        unsigned depth = 0;
+        hsize = 1 << dip->i_di.di_depth;
+        /* The table must hold exactly 2^di_depth pointers */
+        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        hash = gfs2_dir_offset2hash(*offset);
+        index = hash >> (32 - dip->i_di.di_depth);
+        /* lp holds one sd_hash_bsize chunk of the hash table at a time */
+        lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+        if (!lp)
+                return -ENOMEM;
+        while (index < hsize) {
+                /* Split index into chunk start and position in the chunk */
+                lp_offset = index & (sdp->sd_hash_ptrs - 1);
+                ht_offset = index - lp_offset;
+                /* Only re-read the table when moving to a different chunk */
+                if (ht_offset_cur != ht_offset) {
+                        error = gfs2_dir_read_data(dip, (char *)lp,
+                                                ht_offset * sizeof(u64),
+                                                sdp->sd_hash_bsize, 1);
+                        if (error != sdp->sd_hash_bsize) {
+                                /* A short read is reported as an I/O error */
+                                if (error >= 0)
+                                        error = -EIO;
+                                goto out;
+                        }
+                        ht_offset_cur = ht_offset;
+                }
+                error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+                                           &copied, &depth,
+                                           be64_to_cpu(lp[lp_offset]));
+                if (error)
+                        break;
+                /* Step over all the slots that point at the leaf just read */
+                len = 1 << (dip->i_di.di_depth - depth);
+                index = (index & ~(len - 1)) + len;
+        }
+out:
+        kfree(lp);
+        /* A positive value means "stopped early", which is not a failure */
+        if (error > 0)
+                error = 0;
+        return error;
+}
+/**
+ * gfs2_dir_read - pass the entries of a directory to a filldir function
+ * @inode: the directory inode
+ * @offset: the read position, updated as entries are handed out
+ * @opaque: opaque data to pass to filldir
+ * @filldir: the function to pass entries to
+ *
+ * An empty directory yields nothing.  A directory flagged GFS2_DIF_EXHASH
+ * is handled by dir_e_read(); any other directory must be stuffed, and its
+ * entries are gathered straight from the dinode buffer.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+                  gfs2_filldir_t filldir)
+{
+        struct gfs2_inode *dip = GFS2_I(inode);
+        const struct gfs2_dirent **dents, *scanned;
+        struct dirent_gather gather;
+        struct buffer_head *dibh;
+        int copied = 0;
+        int error;
+        if (!dip->i_di.di_entries)
+                return 0;
+        if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+                return dir_e_read(inode, offset, opaque, filldir);
+        /* Without a hash table the entries have to live in the dinode */
+        if (!gfs2_is_stuffed(dip)) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (error)
+                return error;
+        dents = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
+                        GFP_KERNEL);
+        if (!dents) {
+                error = -ENOMEM;
+                goto out_brelse;
+        }
+        /* Collect a pointer to every entry, then hand them out in order */
+        gather.pdent = dents;
+        gather.offset = 0;
+        scanned = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
+                                   gfs2_dirent_gather, NULL, &gather);
+        if (IS_ERR(scanned))
+                error = PTR_ERR(scanned);
+        else
+                error = do_filldir_main(dip, offset, opaque, filldir, dents,
+                                        dip->i_di.di_entries, &copied);
+        kfree(dents);
+        /* A positive value means "stopped early", which is not a failure */
+        if (error > 0)
+                error = 0;
+out_brelse:
+        brelse(dibh);
+        return error;
+}
+/**
+ * gfs2_dir_search - Search a directory
+ * @dir: The directory inode to search
+ * @name: The name to look for
+ * @inum: If non-NULL, filled in with the inode number of the entry found
+ * @type: If non-NULL, filled in with the type of the entry found
+ *
+ * This routine searches a directory for a file or another directory.
+ * Assumes a glock is held on the directory.
+ *
+ * Returns: 0 if found, -ENOENT if there is no such entry, or the error
+ *          reported by the search
+ */
+int gfs2_dir_search(struct inode *dir, const struct qstr *name,
+                    struct gfs2_inum *inum, unsigned int *type)
+{
+        struct buffer_head *bh;
+        struct gfs2_dirent *found;
+        found = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+        if (!found)
+                return -ENOENT;
+        if (IS_ERR(found))
+                return PTR_ERR(found);
+        /* Both outputs are optional */
+        if (inum)
+                gfs2_inum_in(inum, (char *)&found->de_inum);
+        if (type)
+                *type = be16_to_cpu(found->de_type);
+        brelse(bh);
+        return 0;
+}
+/**
+ * dir_new_leaf - Append a new leaf block to the end of a leaf chain
+ * @inode: The directory inode
+ * @name: The name whose hash selects the chain to extend
+ *
+ * Follows lf_next from the first leaf for @name's hash table slot to the
+ * last leaf of the chain, links a freshly allocated leaf of the same depth
+ * behind it and accounts for the extra block in the dinode.
+ *
+ * Returns: 0 on success, -ENOSPC if no leaf could be allocated, or another
+ *          error code on failure
+ */
+static int dir_new_leaf(struct inode *inode, const struct qstr *name)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct buffer_head *tail_bh, *new_bh, *dibh;
+        struct gfs2_leaf *tail;
+        u32 index = name->hash >> (32 - ip->i_di.di_depth);
+        u64 next;
+        int error;
+        error = get_first_leaf(ip, index, &tail_bh);
+        if (error)
+                return error;
+        /* Walk to the last leaf in the chain (the one with no lf_next) */
+        tail = (struct gfs2_leaf *)tail_bh->b_data;
+        while ((next = be64_to_cpu(tail->lf_next)) != 0) {
+                brelse(tail_bh);
+                error = get_leaf(ip, next, &tail_bh);
+                if (error)
+                        return error;
+                tail = (struct gfs2_leaf *)tail_bh->b_data;
+        }
+        gfs2_trans_add_bh(ip->i_gl, tail_bh, 1);
+        if (!new_leaf(inode, &new_bh, be16_to_cpu(tail->lf_depth))) {
+                brelse(tail_bh);
+                return -ENOSPC;
+        }
+        /* Hook the new leaf onto the end of the chain */
+        tail->lf_next = cpu_to_be64(new_bh->b_blocknr);
+        brelse(new_bh);
+        brelse(tail_bh);
+        /* The directory now owns one more block */
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        ip->i_di.di_blocks++;
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        return 0;
+}
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @inode: The directory inode
+ * @name: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Looks for free space for the entry and, while none is found, makes room
+ * in increasingly drastic ways before retrying: converting the directory
+ * to the hashed format, splitting the target leaf, doubling the hash table
+ * and splitting again, and finally chaining on a new leaf block.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int gfs2_dir_add(struct inode *inode, const struct qstr *name,
+                 const struct gfs2_inum *inum, unsigned type)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct buffer_head *bh;
+        struct gfs2_dirent *dent;
+        struct gfs2_leaf *leaf;
+        int error;
+        while(1) {
+                dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
+                                          &bh);
+                if (dent) {
+                        if (IS_ERR(dent))
+                                return PTR_ERR(dent);
+                        /* Space was found: fill in the new entry */
+                        dent = gfs2_init_dirent(inode, dent, name, bh);
+                        gfs2_inum_out(inum, (char *)&dent->de_inum);
+                        dent->de_type = cpu_to_be16(type);
+                        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+                                /* Hashed directories also count entries
+                                   per leaf */
+                                leaf = (struct gfs2_leaf *)bh->b_data;
+                                leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+                        }
+                        brelse(bh);
+                        /* Update the entry count and times in the dinode */
+                        error = gfs2_meta_inode_buffer(ip, &bh);
+                        if (error)
+                                break;
+                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                        ip->i_di.di_entries++;
+                        ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+                        gfs2_dinode_out(&ip->i_di, bh->b_data);
+                        brelse(bh);
+                        error = 0;
+                        break;
+                }
+                /* No space.  A non-hashed directory is converted first. */
+                if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+                        error = dir_make_exhash(inode);
+                        if (error)
+                                break;
+                        continue;
+                }
+                /* 0: split done, retry.  <0: failure.  >0: cannot split. */
+                error = dir_split_leaf(inode, name);
+                if (error == 0)
+                        continue;
+                if (error < 0)
+                        break;
+                /* The leaf could not be split; grow the hash table if the
+                   depth limit allows it and try the split once more. */
+                if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+                        error = dir_double_exhash(ip);
+                        if (error)
+                                break;
+                        error = dir_split_leaf(inode, name);
+                        if (error < 0)
+                                break;
+                        if (error == 0)
+                                continue;
+                }
+                /* Last resort: chain another leaf block onto this one.
+                   Any failure here is reported as -ENOSPC. */
+                error = dir_new_leaf(inode, name);
+                if (!error)
+                        continue;
+                error = -ENOSPC;
+                break;
+        }
+        return error;
+}
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @name: The filename
+ *
+ * Removes the entry for @name from its block, decrements the leaf's entry
+ * count when the directory is hashed, and updates the entry count and the
+ * modification/change times in the dinode.  The entry is expected to
+ * exist; not finding it is treated as an inconsistency.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+{
+        struct gfs2_dirent *dent, *prev = NULL;
+        struct buffer_head *bh;
+        int error;
+        /* Returns _either_ the entry (if it's first in block) or the
+           previous entry otherwise */
+        dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+        if (!dent) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        if (IS_ERR(dent)) {
+                gfs2_consist_inode(dip);
+                return PTR_ERR(dent);
+        }
+        /* If not first in block, adjust pointers accordingly */
+        /* A zero result is taken to mean that dent is the predecessor; the
+           entry itself then starts de_rec_len bytes further on. */
+        if (gfs2_dirent_find(dent, name, NULL) == 0) {
+                prev = dent;
+                dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
+        }
+        dirent_del(dip, bh, prev, dent);
+        if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+                struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+                u16 entries = be16_to_cpu(leaf->lf_entries);
+                /* A leaf we just deleted from cannot have had zero entries */
+                if (!entries)
+                        gfs2_consist_inode(dip);
+                leaf->lf_entries = cpu_to_be16(--entries);
+        }
+        brelse(bh);
+        /* Update the entry count and times in the dinode */
+        error = gfs2_meta_inode_buffer(dip, &bh);
+        if (error)
+                return error;
+        if (!dip->i_di.di_entries)
+                gfs2_consist_inode(dip);
+        gfs2_trans_add_bh(dip->i_gl, bh, 1);
+        dip->i_di.di_entries--;
+        dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+        gfs2_dinode_out(&dip->i_di, bh->b_data);
+        brelse(bh);
+        mark_inode_dirty(&dip->i_inode);
+        return error;
+}
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode of the directory
+ * @filename: The name of the entry to change
+ * @inum: The inode number to store in the entry
+ * @new_type: The type to store in the entry
+ *
+ * This routine changes the inode number of a directory entry.  It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on the directory.
+ *
+ * Returns: errno
+ */
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+                   struct gfs2_inum *inum, unsigned int new_type)
+{
+        struct buffer_head *bh;
+        struct gfs2_dirent *entry;
+        entry = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+        if (!entry) {
+                /* The entry is expected to exist */
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        if (IS_ERR(entry))
+                return PTR_ERR(entry);
+        /* Add the entry's buffer to the transaction, then rewrite the entry */
+        gfs2_trans_add_bh(dip->i_gl, bh, 1);
+        gfs2_inum_out(inum, (char *)&entry->de_inum);
+        entry->de_type = cpu_to_be16(new_type);
+        if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+                int error;
+                /* The entry was in a leaf block; swap that buffer for the
+                   dinode buffer before writing the dinode out below. */
+                brelse(bh);
+                error = gfs2_meta_inode_buffer(dip, &bh);
+                if (error)
+                        return error;
+                gfs2_trans_add_bh(dip->i_gl, bh, 1);
+        }
+        /* Stamp both times with the same value */
+        dip->i_di.di_ctime = get_seconds();
+        dip->i_di.di_mtime = dip->i_di.di_ctime;
+        gfs2_dinode_out(&dip->i_di, bh->b_data);
+        brelse(bh);
+        return 0;
+}
+/**
+ * foreach_leaf - call a function for each leaf in a directory
+ * @dip: the directory
+ * @lc: the function to call for each leaf
+ * @data: private data to pass to it
+ *
+ * Walks the hash table from slot 0.  For every non-zero leaf pointer, @lc
+ * is called once with the slot index, the number of slots that refer to
+ * that leaf and the leaf's block number; the walk then skips past those
+ * slots.  Zero pointers are stepped over one slot at a time.
+ *
+ * Returns: errno
+ */
+static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct buffer_head *bh;
+        struct gfs2_leaf *leaf;
+        u32 hsize, len;
+        /* ht_offset_cur starts at -1 so that the first pass loads a chunk */
+        u32 ht_offset, lp_offset, ht_offset_cur = -1;
+        u32 index = 0;
+        u64 *lp;
+        u64 leaf_no;
+        int error = 0;
+        hsize = 1 << dip->i_di.di_depth;
+        /* The table must hold exactly 2^di_depth pointers */
+        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+                gfs2_consist_inode(dip);
+                return -EIO;
+        }
+        /* lp holds one sd_hash_bsize chunk of the hash table at a time */
+        lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+        if (!lp)
+                return -ENOMEM;
+        while (index < hsize) {
+                /* Split index into chunk start and position in the chunk */
+                lp_offset = index & (sdp->sd_hash_ptrs - 1);
+                ht_offset = index - lp_offset;
+                /* Only re-read the table when moving to a different chunk */
+                if (ht_offset_cur != ht_offset) {
+                        error = gfs2_dir_read_data(dip, (char *)lp,
+                                                ht_offset * sizeof(u64),
+                                                sdp->sd_hash_bsize, 1);
+                        if (error != sdp->sd_hash_bsize) {
+                                /* A short read is reported as an I/O error */
+                                if (error >= 0)
+                                        error = -EIO;
+                                goto out;
+                        }
+                        ht_offset_cur = ht_offset;
+                }
+                leaf_no = be64_to_cpu(lp[lp_offset]);
+                if (leaf_no) {
+                        /* Read the leaf only to learn its depth, which
+                           gives the number of slots pointing at it */
+                        error = get_leaf(dip, leaf_no, &bh);
+                        if (error)
+                                goto out;
+                        leaf = (struct gfs2_leaf *)bh->b_data;
+                        len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+                        brelse(bh);
+                        error = lc(dip, index, len, leaf_no, data);
+                        if (error)
+                                goto out;
+                        /* Step over all the slots that share this leaf */
+                        index = (index & ~(len - 1)) + len;
+                } else
+                        index++;
+        }
+        /* Overshooting the end means the leaf depths were inconsistent */
+        if (index != hsize) {
+                gfs2_consist_inode(dip);
+                error = -EIO;
+        }
+out:
+        kfree(lp);
+        return error;
+}
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @data: not used
+ *
+ * Frees every block in the lf_next chain that starts at @leaf_no, zeroes
+ * the @len hash table pointers starting at @index and writes the updated
+ * block count back to the dinode, all within one transaction.
+ *
+ * Returns: errno
+ */
+static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
+                        u64 leaf_no, void *data)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_leaf *tmp_leaf;
+        struct gfs2_rgrp_list rlist;
+        struct buffer_head *bh, *dibh;
+        u64 blk, nblk;
+        unsigned int rg_blocks = 0, l_blocks = 0;
+        char *ht;
+        unsigned int x, size = len * sizeof(u64);
+        int error;
+        memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+        /* Zero-filled replacement for the hash table slots being cleared */
+        ht = kzalloc(size, GFP_KERNEL);
+        if (!ht)
+                return -ENOMEM;
+        gfs2_alloc_get(dip);
+        error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out;
+        error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+        if (error)
+                goto out_qs;
+        /*  Count the number of leaves  */
+        /* Each block is also added to the resource group list */
+        for (blk = leaf_no; blk; blk = nblk) {
+                error = get_leaf(dip, blk, &bh);
+                if (error)
+                        goto out_rlist;
+                tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+                nblk = be64_to_cpu(tmp_leaf->lf_next);
+                brelse(bh);
+                gfs2_rlist_add(sdp, &rlist, blk);
+                l_blocks++;
+        }
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        /* Sum ri_length over the resource groups involved; the total is
+           part of the transaction reservation below */
+        for (x = 0; x < rlist.rl_rgrps; x++) {
+                struct gfs2_rgrpd *rgd;
+                rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+                rg_blocks += rgd->rd_ri.ri_length;
+        }
+        error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+        if (error)
+                goto out_rlist;
+        error = gfs2_trans_begin(sdp,
+                        rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
+                        RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+        if (error)
+                goto out_rg_gunlock;
+        /* Second walk of the chain: free each leaf block */
+        for (blk = leaf_no; blk; blk = nblk) {
+                error = get_leaf(dip, blk, &bh);
+                if (error)
+                        goto out_end_trans;
+                tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+                /* Fetch the successor before this block is freed */
+                nblk = be64_to_cpu(tmp_leaf->lf_next);
+                brelse(bh);
+                gfs2_free_meta(dip, blk, 1);
+                if (!dip->i_di.di_blocks)
+                        gfs2_consist_inode(dip);
+                dip->i_di.di_blocks--;
+        }
+        /* Clear the hash table slots that referred to this leaf */
+        error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
+        if (error != size) {
+                /* A short write is reported as an I/O error */
+                if (error >= 0)
+                        error = -EIO;
+                goto out_end_trans;
+        }
+        /* Write the reduced block count back to the dinode */
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (error)
+                goto out_end_trans;
+        gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+        gfs2_dinode_out(&dip->i_di, dibh->b_data);
+        brelse(dibh);
+        /* The labels below undo the setup steps in reverse order */
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_rg_gunlock:
+        gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+        gfs2_rlist_free(&rlist);
+        gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+out_qs:
+        gfs2_quota_unhold(dip);
+out:
+        gfs2_alloc_put(dip);
+        kfree(ht);
+        return error;
+}
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Runs leaf_dealloc() over every leaf of the directory, then rewrites the
+ * on-disk di_mode to S_IFREG in a transaction of its own.
+ *
+ * Returns: errno
+ */
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_dinode *raw;
+        struct buffer_head *dibh;
+        int error;
+        /* Dealloc on-disk leaves to FREEMETA state */
+        error = foreach_leaf(dip, leaf_dealloc, NULL);
+        if (error)
+                return error;
+        /* Make this a regular file in case we crash.
+           (We don't want to free these blocks a second time.)  */
+        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(dip, &dibh);
+        if (error)
+                goto out_end_trans;
+        gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+        /* Only the mode field of the on-disk dinode is touched */
+        raw = (struct gfs2_dinode *)dibh->b_data;
+        raw->di_mode = cpu_to_be32(S_IFREG);
+        brelse(dibh);
+out_end_trans:
+        gfs2_trans_end(sdp);
+        return error;
+}
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @inode: the directory the entry would be added to
+ * @name: the filename that's going to be added
+ *
+ * Searches for existing free space that could hold an entry for @name.
+ *
+ * Returns: 1 if alloc required, 0 if not, -ve on error
+ */
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+{
+        struct buffer_head *bh;
+        struct gfs2_dirent *slot;
+        slot = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+        if (IS_ERR(slot))
+                return PTR_ERR(slot);
+        if (slot) {
+                /* There is room already; only the buffer needs releasing */
+                brelse(bh);
+                return 0;
+        }
+        /* Nothing suitable was found, so space has to be allocated */
+        return 1;
+}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+#include <linux/dcache.h>
+struct inode;
+struct gfs2_inode;
+struct gfs2_inum;
+/**
+ * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+typedef int (*gfs2_filldir_t) (void *opaque,
+                              const char *name, unsigned int length,
+                              u64 offset,
+                              struct gfs2_inum *inum, unsigned int type);
+/* Directory entry lookup, insertion, removal, iteration and re-pointing */
+int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
+                    struct gfs2_inum *inum, unsigned int *type);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+                 const struct gfs2_inum *inum, unsigned int type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
+                  gfs2_filldir_t filldir);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+                   struct gfs2_inum *new_inum, unsigned int new_type);
+/* Frees all leaf blocks of @dip and marks its on-disk dinode S_IFREG */
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+/* Returns 1 if adding @filename needs an allocation, 0 if not, -ve on error */
+int gfs2_diradd_alloc_required(struct inode *dir,
+                               const struct qstr *filename);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+                            struct buffer_head **bhp);
+/* Name hash: crc32_le() seeded with all ones, with the result XORed
+   against all ones again. */
+static inline u32 gfs2_disk_hash(const char *data, int len)
+{
+        const u32 all_ones = (u32)~0;
+        u32 crc = crc32_le(all_ones, data, len);
+        return crc ^ all_ones;
+}
+/* Point @name at the NUL-terminated string @fname (the bytes are not
+   copied) and fill in its length and gfs2_disk_hash() value. */
+static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
+{
+        unsigned int fname_len = strlen(fname);
+        name->name = fname;
+        name->len = fname_len;
+        name->hash = gfs2_disk_hash(fname, fname_len);
+}
+/* N.B. This probably ought to take inum & type as args as well */
+/* Initialise @dent for @name with record length @reclen: inode number, type
+   and padding are zeroed, and the name bytes are copied in immediately
+   after the fixed-size dirent header. */
+static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
+{
+        char *name_area = (char *)(dent + 1);
+        memset(dent->__pad, 0, sizeof(dent->__pad));
+        dent->de_type = cpu_to_be16(0);
+        dent->de_name_len = cpu_to_be16(name->len);
+        dent->de_rec_len = cpu_to_be16(reclen);
+        dent->de_hash = cpu_to_be32(name->hash);
+        dent->de_inum.no_formal_ino = cpu_to_be64(0);
+        dent->de_inum.no_addr = cpu_to_be64(0);
+        memcpy(name_area, name->name, name->len);
+}
+#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "util.h"
+/**
+ * gfs2_ea_name2type - get the type of the ea, and strip the type prefix
+ * @name: ea name, expected to start with "system.", "user." or "security."
+ * @truncated_name: if non-NULL, receives a pointer to the part of @name
+ *                  after the prefix, or NULL when no prefix matched
+ *
+ * Returns: GFS2_EATYPE_XXX (GFS2_EATYPE_UNUSED when no prefix matched)
+ */
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
+{
+        static const char sys_prefix[] = "system.";
+        static const char usr_prefix[] = "user.";
+        static const char sec_prefix[] = "security.";
+        const char *rest = NULL;
+        unsigned int type = GFS2_EATYPE_UNUSED;
+        /* sizeof(x) - 1 is the prefix length without the trailing NUL */
+        if (!strncmp(name, sys_prefix, sizeof(sys_prefix) - 1)) {
+                type = GFS2_EATYPE_SYS;
+                rest = name + sizeof(sys_prefix) - 1;
+        } else if (!strncmp(name, usr_prefix, sizeof(usr_prefix) - 1)) {
+                type = GFS2_EATYPE_USR;
+                rest = name + sizeof(usr_prefix) - 1;
+        } else if (!strncmp(name, sec_prefix, sizeof(sec_prefix) - 1)) {
+                type = GFS2_EATYPE_SECURITY;
+                rest = name + sizeof(sec_prefix) - 1;
+        }
+        if (truncated_name)
+                *truncated_name = rest;
+        return type;
+}
+/* Read a "user." attribute; the caller needs MAY_READ on the inode. */
+static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int error;
+        error = permission(&ip->i_inode, MAY_READ, NULL);
+        if (!error)
+                error = gfs2_ea_get_i(ip, er);
+        return error;
+}
+/* Set a "user." attribute.  Only regular files and directories without the
+   sticky bit qualify (-EPERM otherwise), and MAY_WRITE is required. */
+static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct inode *inode = &ip->i_inode;
+        int error;
+        if (!S_ISREG(inode->i_mode) &&
+            (!S_ISDIR(inode->i_mode) || (inode->i_mode & S_ISVTX)))
+                return -EPERM;
+        error = permission(inode, MAY_WRITE, NULL);
+        if (error)
+                return error;
+        return gfs2_ea_set_i(ip, er);
+}
+/* Remove a "user." attribute.  Only regular files and directories without
+   the sticky bit qualify (-EPERM otherwise), and MAY_WRITE is required. */
+static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct inode *inode = &ip->i_inode;
+        int error;
+        if (!S_ISREG(inode->i_mode) &&
+            (!S_ISDIR(inode->i_mode) || (inode->i_mode & S_ISVTX)))
+                return -EPERM;
+        error = permission(inode, MAY_WRITE, NULL);
+        if (error)
+                return error;
+        return gfs2_ea_remove_i(ip, er);
+}
+/* Read a "system." attribute.  The two POSIX ACL names are readable only
+   when the posix_acl mount argument is set (-EOPNOTSUPP otherwise); any
+   other name needs CAP_SYS_ADMIN (-EPERM otherwise). */
+static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int is_acl = GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
+                     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len);
+        if (!is_acl) {
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+        } else if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0) {
+                return -EOPNOTSUPP;
+        }
+        return gfs2_ea_get_i(ip, er);
+}
+/*
+ * system_eo_set - set a "system." attribute.
+ *
+ * Only the two POSIX ACL names are accepted; any other name is refused
+ * with -EPERM.  The "remove" flag is filled in by gfs2_acl_validate_set().
+ * NOTE(review): presumably it is set when the ACL carries nothing beyond
+ * what the mode bits already express - confirm in acl.c.
+ *
+ * Returns: 0 on success, -errno on error
+ */
+static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int remove = 0;
+        int error;
+        if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+                /* Seed the request's mode from the inode unless the caller
+                   already supplied one. */
+                if (!(er->er_flags & GFS2_ERF_MODE)) {
+                        er->er_mode = ip->i_di.di_mode;
+                        er->er_flags |= GFS2_ERF_MODE;
+                }
+                error = gfs2_acl_validate_set(ip, 1, er,
+                                              &remove, &er->er_mode);
+                if (error)
+                        return error;
+                error = gfs2_ea_set_i(ip, er);
+                if (error)
+                        return error;
+                /* The EA is written first and removed again when "remove"
+                   is set; the removal's return value is not checked.
+                   NOTE(review): confirm that ignoring it is intentional. */
+                if (remove)
+                        gfs2_ea_remove_i(ip, er);
+                return 0;
+        } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+                /* Default ACL: no mode is passed to the validator */
+                error = gfs2_acl_validate_set(ip, 0, er,
+                                              &remove, NULL);
+                if (error)
+                        return error;
+                if (!remove)
+                        error = gfs2_ea_set_i(ip, er);
+                else {
+                        /* Removing an EA that is not there counts as success */
+                        error = gfs2_ea_remove_i(ip, er);
+                        if (error == -ENODATA)
+                                error = 0;
+                }
+                return error;
+        }
+        return -EPERM;
+}
+/* Remove a "system." attribute.  Only the two POSIX ACL names may be
+   removed (-EPERM otherwise), and only once gfs2_acl_validate_remove()
+   agrees. */
+static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int access;
+        int error;
+        if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len))
+                access = 1;
+        else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))
+                access = 0;
+        else
+                return -EPERM;
+        error = gfs2_acl_validate_remove(ip, access);
+        if (error)
+                return error;
+        return gfs2_ea_remove_i(ip, er);
+}
+/* Read a "security." attribute; the caller needs MAY_READ on the inode. */
+static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int error;
+        error = permission(&ip->i_inode, MAY_READ, NULL);
+        if (!error)
+                error = gfs2_ea_get_i(ip, er);
+        return error;
+}
+/* Set a "security." attribute; the caller needs MAY_WRITE on the inode. */
+static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int error;
+        error = permission(&ip->i_inode, MAY_WRITE, NULL);
+        if (!error)
+                error = gfs2_ea_set_i(ip, er);
+        return error;
+}
+/* Remove a "security." attribute; the caller needs MAY_WRITE on the inode. */
+static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        int error;
+        error = permission(&ip->i_inode, MAY_WRITE, NULL);
+        if (!error)
+                error = gfs2_ea_remove_i(ip, er);
+        return error;
+}
+/* Handlers for the "user." namespace */
+static struct gfs2_eattr_operations gfs2_user_eaops = {
+        .eo_get = user_eo_get,
+        .eo_set = user_eo_set,
+        .eo_remove = user_eo_remove,
+        .eo_name = "user",
+};
+/* Handlers for the "system." namespace; not static, declared in eaops.h */
+struct gfs2_eattr_operations gfs2_system_eaops = {
+        .eo_get = system_eo_get,
+        .eo_set = system_eo_set,
+        .eo_remove = system_eo_remove,
+        .eo_name = "system",
+};
+/* Handlers for the "security." namespace */
+static struct gfs2_eattr_operations gfs2_security_eaops = {
+        .eo_get = security_eo_get,
+        .eo_set = security_eo_set,
+        .eo_remove = security_eo_remove,
+        .eo_name = "security",
+};
+/*
+ * Dispatch table; gfs2_ea_get() indexes it with the request's er_type.
+ * The first slot is NULL, so callers must not dispatch through it.
+ * NOTE(review): presumably slot 0 is GFS2_EATYPE_UNUSED and the others
+ * follow the GFS2_EATYPE_USR/SYS/SECURITY numbering - confirm against
+ * gfs2_ondisk.h.
+ */
+struct gfs2_eattr_operations *gfs2_ea_ops[] = {
+        NULL,
+        &gfs2_user_eaops,
+        &gfs2_system_eaops,
+        &gfs2_security_eaops,
+};
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __EAOPS_DOT_H__
+#define __EAOPS_DOT_H__
+struct gfs2_ea_request;
+struct gfs2_inode;
+/*
+ * Per-namespace extended attribute handlers.  Each callback receives the
+ * inode and the request and returns an int (negative errno on failure, as
+ * far as the implementations in eaops.c show).
+ */
+struct gfs2_eattr_operations {
+        int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+        int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+        int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+        char *eo_name; /* namespace name without the trailing dot, e.g. "user" */
+};
+/* Maps a prefixed attribute name to its GFS2_EATYPE_XXX value */
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
+extern struct gfs2_eattr_operations gfs2_system_eaops;
+/* Table of handlers; its first entry is NULL */
+extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+#endif /* __EAOPS_DOT_H__ */
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+/**
+ * ea_calc_size - returns the actual number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp: the filesystem; supplies sd_jbsize, the limit for a stuffed EA
+ * @er: the request being sized
+ * @size: receives the stuffed size when it fits, else the unstuffed size
+ *
+ * Returns: 1 if the EA should be stuffed, 0 if not
+ */
+static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+                        unsigned int *size)
+{
+        unsigned int stuffed = GFS2_EAREQ_SIZE_STUFFED(er);
+        if (stuffed <= sdp->sd_jbsize) {
+                *size = stuffed;
+                return 1;
+        }
+        *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+        return 0;
+}
+/* Reject requests whose data exceeds GFS2_EA_MAX_DATA_LEN, or whose EA
+   record (stuffed or not) would not fit in sd_jbsize bytes.
+   Returns 0 if acceptable, -ERANGE otherwise. */
+static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+{
+        unsigned int needed;
+        if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+                return -ERANGE;
+        ea_calc_size(sdp, er, &needed);
+        /* This can only happen with 512 byte blocks */
+        return (needed > sdp->sd_jbsize) ? -ERANGE : 0;
+}
+/*
+ * ea_call_t - callback invoked by ea_foreach_i() for each EA record.
+ * @prev is the record visited just before @ea in the same buffer (NULL for
+ * the first record); @private is the caller's cookie.  A non-zero return
+ * stops the walk and is handed back to the caller.
+ */
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+                          struct gfs2_ea_header *ea,
+                          struct gfs2_ea_header *prev, void *private);
+/*
+ * ea_foreach_i - walk every EA record held in one EA block
+ * @ip: the inode that owns the block
+ * @bh: the buffer; must be of type GFS2_METATYPE_EA
+ * @ea_call: called once per record that passes validation
+ * @data: passed through to @ea_call
+ *
+ * Returns: 0 when the whole block was walked, the callback's non-zero
+ *          result if it stopped the walk, or -EIO if the block has the
+ *          wrong metadata type or a record is malformed
+ */
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                        ea_call_t ea_call, void *data)
+{
+        struct gfs2_ea_header *ea, *prev = NULL;
+        int error = 0;
+        if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+                return -EIO;
+        for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+                /* A zero record length would never advance the walk */
+                if (!GFS2_EA_REC_LEN(ea))
+                        goto fail;
+                /* The whole record must lie inside the buffer */
+                if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+                                                  bh->b_data + bh->b_size))
+                        goto fail;
+                if (!GFS2_EATYPE_VALID(ea->ea_type))
+                        goto fail;
+                error = ea_call(ip, bh, ea, prev, data);
+                if (error)
+                        return error;
+                if (GFS2_EA_IS_LAST(ea)) {
+                        /* The last record must end exactly at the buffer end */
+                        if ((char *)GFS2_EA2NEXT(ea) !=
+                            bh->b_data + bh->b_size)
+                                goto fail;
+                        break;
+                }
+        }
+        return error;
+fail:
+        /* Malformed record: flag the inode as inconsistent */
+        gfs2_consist_inode(ip);
+        return -EIO;
+}
+/*
+ * ea_foreach - run @ea_call over every EA record of an inode
+ * @ip: the inode; ip->i_di.di_eattr names its top-level EA block
+ * @ea_call: callback applied to each record (see ea_foreach_i())
+ * @data: passed through to @ea_call
+ *
+ * Without GFS2_DIF_EA_INDIRECT the top-level block is itself the EA block.
+ * With it, the block is treated as an indirect block whose pointers (after
+ * the meta header) each name one EA block.
+ *
+ * Returns: 0, a read error, -EIO on a wrong block type, or whatever
+ *          non-zero value ea_foreach_i() returned
+ */
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+        struct buffer_head *bh, *eabh;
+        u64 *eablk, *end;
+        int error;
+        error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+        if (error)
+                return error;
+        if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+                error = ea_foreach_i(ip, bh, ea_call, data);
+                goto out;
+        }
+        if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+                error = -EIO;
+                goto out;
+        }
+        /* Up to sd_inptrs big-endian block pointers follow the meta header */
+        eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+        end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+        for (; eablk < end; eablk++) {
+                u64 bn;
+                /* The first zero pointer ends the list */
+                if (!*eablk)
+                        break;
+                bn = be64_to_cpu(*eablk);
+                error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+                if (error)
+                        break;
+                error = ea_foreach_i(ip, eabh, ea_call, data);
+                brelse(eabh);
+                if (error)
+                        break;
+        }
+out:
+        brelse(bh);
+        return error;
+}
+/* Cookie for ea_find_i(): the request being matched and where to record
+   the match. */
+struct ea_find {
+        struct gfs2_ea_request *ef_er;
+        struct gfs2_ea_location *ef_el;
+};
+/* ea_call_t callback: returns 1 (stopping the walk) once @ea matches the
+   request's type and name, after recording buffer, record and predecessor
+   in the location and taking an extra reference on @bh.  Returns 0 to keep
+   walking. */
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                     void *private)
+{
+        struct ea_find *ef = private;
+        struct gfs2_ea_request *er = ef->ef_er;
+        struct gfs2_ea_location *el;
+        if (ea->ea_type == GFS2_EATYPE_UNUSED)
+                return 0;
+        if (ea->ea_type != er->er_type)
+                return 0;
+        if (ea->ea_name_len != er->er_name_len)
+                return 0;
+        if (memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len))
+                return 0;
+        el = ef->ef_el;
+        get_bh(bh);
+        el->el_bh = bh;
+        el->el_ea = ea;
+        el->el_prev = prev;
+        return 1;
+}
+/* Locate the EA described by @er.  @el is zeroed first, so on a 0 return
+   el->el_ea stays NULL when nothing matched.  Returns 0 or -errno. */
+int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                 struct gfs2_ea_location *el)
+{
+        struct ea_find ef = { .ef_er = er, .ef_el = el };
+        int error;
+        memset(el, 0, sizeof(*el));
+        error = ea_foreach(ip, ea_find_i, &ef);
+        /* A positive value only means "found, stop walking" */
+        return (error > 0) ? 0 : error;
+}
+/**
+ * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
+ * @ip: the inode that owns the EA
+ * @bh: the buffer holding @ea; it is added to the transaction
+ * @ea: the EA record whose data blocks are freed
+ * @prev: the record before @ea in @bh, or NULL
+ * @private: used only as a flag: non-NULL means "leave" the record in
+ *           place (marked unused) instead of merging it into @prev
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Stuffed EAs and EAs with no data pointers are left untouched.
+ *
+ * Returns: errno
+ */
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+                                struct gfs2_ea_header *ea,
+                                struct gfs2_ea_header *prev, void *private)
+{
+        int *leave = private;
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrpd *rgd;
+        struct gfs2_holder rg_gh;
+        struct buffer_head *dibh;
+        u64 *dataptrs, bn = 0;
+        u64 bstart = 0;
+        unsigned int blen = 0;
+        unsigned int blks = 0;
+        unsigned int x;
+        int error;
+        if (GFS2_EA_IS_STUFFED(ea))
+                return 0;
+        /* Count the data blocks; bn ends up as the last non-zero pointer */
+        dataptrs = GFS2_EA2DATAPTRS(ea);
+        for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+                if (*dataptrs) {
+                        blks++;
+                        bn = be64_to_cpu(*dataptrs);
+                }
+        }
+        if (!blks)
+                return 0;
+        /* Only the resource group containing bn is looked up and locked */
+        rgd = gfs2_blk2rgrpd(sdp, bn);
+        if (!rgd) {
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+        if (error)
+                return error;
+        error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
+                                 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+        if (error)
+                goto out_gunlock;
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        /* Free the blocks, coalescing consecutive block numbers into one
+           gfs2_free_meta() call per run; stops at the first zero pointer. */
+        dataptrs = GFS2_EA2DATAPTRS(ea);
+        for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+                if (!*dataptrs)
+                        break;
+                bn = be64_to_cpu(*dataptrs);
+                if (bstart + blen == bn)
+                        blen++;
+                else {
+                        if (bstart)
+                                gfs2_free_meta(ip, bstart, blen);
+                        bstart = bn;
+                        blen = 1;
+                }
+                *dataptrs = 0;
+                /* di_blocks about to go below zero: flag inconsistency */
+                if (!ip->i_di.di_blocks)
+                        gfs2_consist_inode(ip);
+                ip->i_di.di_blocks--;
+        }
+        /* Free the final run */
+        if (bstart)
+                gfs2_free_meta(ip, bstart, blen);
+        if (prev && !leave) {
+                /* Fold this record's space into its predecessor */
+                u32 len;
+                len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+                prev->ea_rec_len = cpu_to_be32(len);
+                if (GFS2_EA_IS_LAST(ea))
+                        prev->ea_flags |= GFS2_EAFLAG_LAST;
+        } else {
+                /* Keep the record but mark it unused */
+                ea->ea_type = GFS2_EATYPE_UNUSED;
+                ea->ea_num_ptrs = 0;
+        }
+        /* Write the updated block count and ctime back to the dinode */
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                ip->i_di.di_ctime = get_seconds();
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        gfs2_trans_end(sdp);
+out_gunlock:
+        gfs2_glock_dq_uninit(&rg_gh);
+        return error;
+}
+/*
+ * ea_remove_unstuffed - free an unstuffed EA's data blocks with the quota
+ * and rindex holds that ea_dealloc_unstuffed() runs under.
+ * @leave: non-zero to keep the record in place (marked unused) rather than
+ *         merging it into @prev
+ *
+ * Returns: errno
+ */
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+                               struct gfs2_ea_header *ea,
+                               struct gfs2_ea_header *prev, int leave)
+{
+        struct gfs2_alloc *al;
+        int error;
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out_alloc;
+        error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+        if (error)
+                goto out_quota;
+        /* ea_dealloc_unstuffed() only tests its last argument for NULL, so
+           any non-NULL pointer (here &error) stands for "leave". */
+        error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+        gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+        gfs2_quota_unhold(ip);
+out_alloc:
+        gfs2_alloc_put(ip);
+        return error;
+}
+/* Cookie for ea_list_i(): the listing request and the bytes used so far */
+struct ea_list {
+        struct gfs2_ea_request *ei_er;
+        unsigned int ei_size;
+};
+/* ea_call_t callback: accounts for one EA name in the listing and, when
+   the request carries a buffer, appends "<prefix><name>\0" to it.
+   Returns 0, or -ERANGE if the buffer is too small. */
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+                     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                     void *private)
+{
+        struct ea_list *ei = private;
+        struct gfs2_ea_request *er = ei->ei_er;
+        unsigned int ea_size = gfs2_ea_strlen(ea);
+        if (ea->ea_type == GFS2_EATYPE_UNUSED)
+                return 0;
+        if (er->er_data_len) {
+                const char *prefix = NULL;
+                unsigned int plen = 0;
+                char *out;
+                if (ei->ei_size + ea_size > er->er_data_len)
+                        return -ERANGE;
+                switch (ea->ea_type) {
+                case GFS2_EATYPE_SECURITY:
+                        prefix = "security.";
+                        plen = 9;
+                        break;
+                case GFS2_EATYPE_SYS:
+                        prefix = "system.";
+                        plen = 7;
+                        break;
+                case GFS2_EATYPE_USR:
+                        prefix = "user.";
+                        plen = 5;
+                        break;
+                }
+                BUG_ON(plen == 0);
+                out = er->er_data + ei->ei_size;
+                memcpy(out, prefix, plen);
+                memcpy(out + plen, GFS2_EA2NAME(ea), ea->ea_name_len);
+                /* The final byte of the entry is the NUL terminator */
+                out[ea_size - 1] = 0;
+        }
+        ei->ei_size += ea_size;
+        return 0;
+}
+/**
+ * gfs2_ea_list - list the names of an inode's extended attributes
+ * @ip: the inode, locked here with a shared glock for the duration
+ * @er: the request; er_data/er_data_len describe the output buffer, and
+ *      an absent or zero-length buffer turns the call into a size query
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+        struct gfs2_holder i_gh;
+        int error;
+        if (!er->er_data || !er->er_data_len) {
+                er->er_data = NULL;
+                er->er_data_len = 0;
+        }
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+        if (error)
+                return error;
+        /* No EA block at all: the list is empty and error is still 0 */
+        if (!ip->i_di.di_eattr)
+                goto out_unlock;
+        error = ea_foreach(ip, ea_list_i, &ei);
+        if (error == 0)
+                error = ei.ei_size;
+out_unlock:
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: Destination buffer; receives GFS2_EA_DATA_LEN(@ea) bytes
+ *
+ * All data blocks are requested first and only then waited for and copied.
+ * NOTE(review): passing 0 instead of DIO_WAIT to gfs2_meta_read()
+ * presumably starts the reads without blocking - confirm in meta_io.c.
+ *
+ * Returns: errno
+ */
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+                            char *data)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head **bh;
+        unsigned int amount = GFS2_EA_DATA_LEN(ea);
+        unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+        u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+        unsigned int x;
+        int error = 0;
+        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+        if (!bh)
+                return -ENOMEM;
+        for (x = 0; x < nptrs; x++) {
+                error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+                                       bh + x);
+                if (error) {
+                        /* Drop the buffers obtained so far */
+                        while (x--)
+                                brelse(bh[x]);
+                        goto out;
+                }
+                dataptrs++;
+        }
+        for (x = 0; x < nptrs; x++) {
+                error = gfs2_meta_wait(sdp, bh[x]);
+                if (error) {
+                        /* Buffers before x were already released below */
+                        for (; x < nptrs; x++)
+                                brelse(bh[x]);
+                        goto out;
+                }
+                if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+                        for (; x < nptrs; x++)
+                                brelse(bh[x]);
+                        error = -EIO;
+                        goto out;
+                }
+                /* Copy a full block's payload, or the remainder on the last */
+                memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+                       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+                /* Wraps on a short final block, but the loop ends there */
+                amount -= sdp->sd_jbsize;
+                data += sdp->sd_jbsize;
+                brelse(bh[x]);
+        }
+out:
+        kfree(bh);
+        return error;
+}
+/* Copy the value of the EA at @el into @data: straight out of the record
+   when it is stuffed, otherwise via ea_get_unstuffed().
+   Returns 0 or -errno. */
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                     char *data)
+{
+        struct gfs2_ea_header *ea = el->el_ea;
+        if (!GFS2_EA_IS_STUFFED(ea))
+                return ea_get_unstuffed(ip, ea, data);
+        memcpy(data, GFS2_EA2DATA(ea), GFS2_EA_DATA_LEN(ea));
+        return 0;
+}
+/**
+ * gfs2_ea_get_i - find an EA and, if a buffer was supplied, copy its value
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * With er_data_len == 0 nothing is copied and only the size is reported.
+ *
+ * Returns: actual size of data on success, -errno on error
+ *          (-ENODATA if the EA does not exist, -ERANGE if the buffer is
+ *          too small)
+ */
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct gfs2_ea_location el;
+        unsigned int ea_len;
+        int error;
+        if (!ip->i_di.di_eattr)
+                return -ENODATA;
+        error = gfs2_ea_find(ip, er, &el);
+        if (error)
+                return error;
+        if (!el.el_ea)
+                return -ENODATA;
+        ea_len = GFS2_EA_DATA_LEN(el.el_ea);
+        if (er->er_data_len) {
+                if (ea_len > er->er_data_len)
+                        error = -ERANGE;
+                else
+                        error = gfs2_ea_get_copy(ip, &el, er->er_data);
+        }
+        if (!error)
+                error = ea_len;
+        /* Drop the reference taken when the EA was located */
+        brelse(el.el_bh);
+        return error;
+}
+/**
+ * gfs2_ea_get - read an extended attribute under a shared inode glock
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * The name length is validated, the handler for er->er_type is looked up
+ * in gfs2_ea_ops[], an absent or zero-length buffer is normalised to a
+ * size query (er_data = NULL, er_data_len = 0), and the request is then
+ * dispatched to the handler's eo_get with the glock held.
+ *
+ * Returns: actual size of data on success, -errno on error
+ *          (-EINVAL for a bad name length, -EOPNOTSUPP when er->er_type
+ *          has no handler in gfs2_ea_ops[])
+ */
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct gfs2_eattr_operations *ops;
+        struct gfs2_holder i_gh;
+        int error;
+        if (!er->er_name_len ||
+            er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+                return -EINVAL;
+        /* gfs2_ea_ops[] has a NULL first slot (presumably the one for
+           GFS2_EATYPE_UNUSED); refuse such a request instead of calling
+           through a NULL pointer. */
+        ops = gfs2_ea_ops[er->er_type];
+        if (!ops)
+                return -EOPNOTSUPP;
+        if (!er->er_data || !er->er_data_len) {
+                er->er_data = NULL;
+                er->er_data_len = 0;
+        }
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+        if (error)
+                return error;
+        error = ops->eo_get(ip, er);
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * The new block is added to the transaction, stamped as an EA metadata
+ * block and cleared after the meta header.  It then holds a single unused
+ * record, flagged as last, whose length is sd_jbsize.  The inode's block
+ * count is incremented.
+ *
+ * Returns: errno (always 0 as written)
+ */
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *new_bh;
+        struct gfs2_ea_header *first;
+        u64 blkno = gfs2_alloc_meta(ip);
+        new_bh = gfs2_meta_new(ip->i_gl, blkno);
+        *bhp = new_bh;
+        gfs2_trans_add_bh(ip->i_gl, new_bh, 1);
+        gfs2_metatype_set(new_bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+        gfs2_buffer_clear_tail(new_bh, sizeof(struct gfs2_meta_header));
+        first = GFS2_EA_BH2FIRST(new_bh);
+        first->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+        first->ea_type = GFS2_EATYPE_UNUSED;
+        first->ea_flags = GFS2_EAFLAG_LAST;
+        first->ea_num_ptrs = 0;
+        ip->i_di.di_blocks++;
+        return 0;
+}
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * A request whose stuffed size fits in sd_jbsize is stored inline after
+ * the name.  Otherwise one new GFS2_METATYPE_ED block is allocated per
+ * sd_jbsize bytes of data and the record stores pointers to those blocks.
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
+ *
+ * returns : errno
+ */
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+                    struct gfs2_ea_request *er)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        ea->ea_data_len = cpu_to_be32(er->er_data_len);
+        ea->ea_name_len = er->er_name_len;
+        ea->ea_type = er->er_type;
+        ea->__pad = 0;
+        memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+        if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+                /* Stuffed: the value lives inside the record itself */
+                ea->ea_num_ptrs = 0;
+                memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+        } else {
+                u64 *dataptr = GFS2_EA2DATAPTRS(ea);
+                const char *data = er->er_data;
+                unsigned int data_len = er->er_data_len;
+                unsigned int copy;
+                unsigned int x;
+                ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+                for (x = 0; x < ea->ea_num_ptrs; x++) {
+                        struct buffer_head *bh;
+                        u64 block;
+                        int mh_size = sizeof(struct gfs2_meta_header);
+                        block = gfs2_alloc_meta(ip);
+                        bh = gfs2_meta_new(ip->i_gl, block);
+                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                        gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+                        ip->i_di.di_blocks++;
+                        /* At most sd_jbsize payload bytes per data block */
+                        copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+                                                           data_len;
+                        memcpy(bh->b_data + mh_size, data, copy);
+                        /* Zero the unused tail of a partly filled block */
+                        if (copy < sdp->sd_jbsize)
+                                memset(bh->b_data + mh_size + copy, 0,
+                                       sdp->sd_jbsize - copy);
+                        *dataptr++ = cpu_to_be64(bh->b_blocknr);
+                        data += copy;
+                        data_len -= copy;
+                        brelse(bh);
+                }
+                /* Every byte of the request must have been written out */
+                gfs2_assert_withdraw(sdp, !data_len);
+        }
+        return 0;
+}
+/*
+ * ea_skeleton_call_t - the operation ea_alloc_skeleton() runs inside its
+ * transaction; @private is the caller's cookie.  Returns errno.
+ */
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+                                   struct gfs2_ea_request *er, void *private);
+/*
+ * ea_alloc_skeleton - common scaffolding for EA operations that allocate
+ * @ip: the inode being modified
+ * @er: the request; if GFS2_ERF_MODE is set, er_mode replaces the inode mode
+ * @blks: number of blocks to reserve
+ * @skeleton_call: the operation to run once everything is set up
+ * @private: passed through to @skeleton_call
+ *
+ * Takes the quota lock, checks quota, reserves @blks blocks and opens a
+ * transaction, then runs @skeleton_call.  On success the dinode is written
+ * back with a fresh ctime.  Everything is released in reverse order.
+ *
+ * Returns: errno
+ */
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                             unsigned int blks,
+                             ea_skeleton_call_t skeleton_call, void *private)
+{
+        struct gfs2_alloc *al;
+        struct buffer_head *dibh;
+        int error;
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out;
+        error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+        if (error)
+                goto out_gunlock_q;
+        al->al_requested = blks;
+        error = gfs2_inplace_reserve(ip);
+        if (error)
+                goto out_gunlock_q;
+        error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+                                 blks + al->al_rgd->rd_ri.ri_length +
+                                 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+        if (error)
+                goto out_ipres;
+        error = skeleton_call(ip, er, private);
+        if (error)
+                goto out_end_trans;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                if (er->er_flags & GFS2_ERF_MODE) {
+                        /* The new mode may change permission bits only,
+                           never the file type */
+                        gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+                                            (ip->i_di.di_mode & S_IFMT) ==
+                                            (er->er_mode & S_IFMT));
+                        ip->i_di.di_mode = er->er_mode;
+                }
+                ip->i_di.di_ctime = get_seconds();
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+out_end_trans:
+        gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+        gfs2_inplace_release(ip);
+out_gunlock_q:
+        gfs2_quota_unlock(ip);
+out:
+        gfs2_alloc_put(ip);
+        return error;
+}
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                     void *private)
+{ /* ea_alloc_skeleton() callback used by ea_init(); @private is not used. */
+        struct buffer_head *bh;
+        int error;
+        error = ea_alloc_blk(ip, &bh);
+        if (error)
+                return error;
+        ip->i_di.di_eattr = bh->b_blocknr; /* the new block becomes the inode's EA block */
+        error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); /* request goes into the first header of the block */
+        brelse(bh);
+        return error;
+}
+/**
+ * ea_init - initializes a new eattr block
+ * @ip: the inode that receives the new eattr block
+ * @er: the request whose attribute is written into the new block
+ *
+ * Returns: errno
+ */
+static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+        unsigned int blks = 1; /* the block allocated by ea_init_i() */
+        if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) /* stuffed size exceeds sd_jbsize: also count one block per sd_jbsize of data */
+                blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+        return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+}
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{ /* Shrink @ea's record to GFS2_EA_SIZE(ea) and return a new header covering the remainder. */
+        u32 ea_size = GFS2_EA_SIZE(ea);
+        struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+                                     ea_size);
+        u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+        int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+        ea->ea_rec_len = cpu_to_be32(ea_size);
+        ea->ea_flags ^= last; /* clears GFS2_EAFLAG_LAST on @ea if it was set */
+        new->ea_rec_len = cpu_to_be32(new_size);
+        new->ea_flags = last; /* the new tail record takes over the LAST flag */
+        return new;
+}
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+                                  struct gfs2_ea_location *el)
+{ /* Retire the entry at @el: merge it into its predecessor, or just mark it unused. */
+        struct gfs2_ea_header *ea = el->el_ea;
+        struct gfs2_ea_header *prev = el->el_prev;
+        u32 len;
+        gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+        if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+                ea->ea_type = GFS2_EATYPE_UNUSED; /* no predecessor, or unstuffed entry: only mark it unused */
+                return;
+        } else if (GFS2_EA2NEXT(prev) != ea) { /* prev is no longer directly before ea: step one record forward */
+                prev = GFS2_EA2NEXT(prev);
+                gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+        }
+        len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); /* prev absorbs ea's record length */
+        prev->ea_rec_len = cpu_to_be32(len);
+        if (GFS2_EA_IS_LAST(ea))
+                prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+struct ea_set { /* state shared between ea_set_i() and the ea_set_simple*() helpers */
+        int ea_split; /* set by ea_set_simple(): 1 = call ea_split_ea() before writing, 0 = write in place */
+        struct gfs2_ea_request *es_er; /* the request being stored */
+        struct gfs2_ea_location *es_el; /* existing entry to retire after the write, or NULL */
+        struct buffer_head *es_bh; /* buffer handed to ea_set_simple_alloc() */
+        struct gfs2_ea_header *es_ea; /* header handed to ea_set_simple_alloc() */
+};
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+                                 struct gfs2_ea_header *ea, struct ea_set *es)
+{ /* Store the request into @ea within @bh under its own transaction, without ea_alloc_skeleton(). */
+        struct gfs2_ea_request *er = es->es_er;
+        struct buffer_head *dibh;
+        int error;
+        error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+        if (error)
+                return error;
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        if (es->ea_split)
+                ea = ea_split_ea(ea); /* write into the tail carved off the existing record */
+        ea_write(ip, ea, er); /* NOTE(review): return value is ignored here, unlike in ea_set_simple_alloc() */
+        if (es->es_el)
+                ea_set_remove_stuffed(ip, es->es_el); /* retire the entry being replaced */
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out;
+        if (er->er_flags & GFS2_ERF_MODE) { /* request also carries a new mode */
+                gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+                        (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
+                ip->i_di.di_mode = er->er_mode;
+        }
+        ip->i_di.di_ctime = get_seconds();
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+out:
+        gfs2_trans_end(GFS2_SB(&ip->i_inode));
+        return error;
+}
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+                               struct gfs2_ea_request *er, void *private)
+{ /* ea_alloc_skeleton() callback used by ea_set_simple(); @private is the struct ea_set. */
+        struct ea_set *es = private;
+        struct gfs2_ea_header *ea = es->es_ea;
+        int error;
+        gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+        if (es->ea_split)
+                ea = ea_split_ea(ea); /* write into the tail carved off the existing record */
+        error = ea_write(ip, ea, er);
+        if (error)
+                return error;
+        if (es->es_el)
+                ea_set_remove_stuffed(ip, es->es_el); /* retire the entry being replaced */
+        return 0;
+}
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+                         struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+                         void *private)
+{ /* ea_foreach() callback: returns 1 once the request was stored in @ea, 0 if @ea has no room, or -errno. */
+        struct ea_set *es = private;
+        unsigned int size;
+        int stuffed;
+        int error;
+        stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+        if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+                if (GFS2_EA_REC_LEN(ea) < size)
+                        return 0; /* unused record is too small */
+                if (!GFS2_EA_IS_STUFFED(ea)) { /* unused record still has data pointers: release them first */
+                        error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+                        if (error)
+                                return error;
+                }
+                es->ea_split = 0;
+        } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) /* in-use record has enough slack after its payload */
+                es->ea_split = 1;
+        else
+                return 0;
+        if (stuffed) {
+                error = ea_set_simple_noalloc(ip, bh, ea, es);
+                if (error)
+                        return error;
+        } else { /* not stuffed: go through ea_alloc_skeleton() with a block reservation */
+                unsigned int blks;
+                es->es_bh = bh;
+                es->es_ea = ea;
+                blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+                                        GFS2_SB(&ip->i_inode)->sd_jbsize);
+                error = ea_alloc_skeleton(ip, es->es_er, blks,
+                                          ea_set_simple_alloc, es);
+                if (error)
+                        return error;
+        }
+        return 1; /* positive result is treated as "done" by ea_set_i() */
+}
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                        void *private)
+{ /* ea_alloc_skeleton() callback: write the request into a new EA block linked from the indirect block. */
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head *indbh, *newbh;
+        u64 *eablk;
+        int error;
+        int mh_size = sizeof(struct gfs2_meta_header);
+        if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+                u64 *end;
+                error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+                                       &indbh);
+                if (error)
+                        return error;
+                if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+                        error = -EIO;
+                        goto out;
+                }
+                eablk = (u64 *)(indbh->b_data + mh_size);
+                end = eablk + sdp->sd_inptrs;
+                for (; eablk < end; eablk++) /* find the first zero pointer slot */
+                        if (!*eablk)
+                                break;
+                if (eablk == end) { /* all sd_inptrs slots are occupied */
+                        error = -ENOSPC;
+                        goto out;
+                }
+                gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+        } else { /* no indirect block yet: create one whose first slot is the current di_eattr */
+                u64 blk;
+                blk = gfs2_alloc_meta(ip);
+                indbh = gfs2_meta_new(ip->i_gl, blk);
+                gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+                gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+                gfs2_buffer_clear_tail(indbh, mh_size);
+                eablk = (u64 *)(indbh->b_data + mh_size);
+                *eablk = cpu_to_be64(ip->i_di.di_eattr);
+                ip->i_di.di_eattr = blk; /* the inode now points at the indirect block */
+                ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+                ip->i_di.di_blocks++;
+                eablk++; /* the second slot receives the new EA block below */
+        }
+        error = ea_alloc_blk(ip, &newbh);
+        if (error)
+                goto out;
+        *eablk = cpu_to_be64((u64)newbh->b_blocknr);
+        error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+        brelse(newbh);
+        if (error)
+                goto out;
+        if (private) /* the gfs2_ea_location passed in by ea_set_i(), or NULL */
+                ea_set_remove_stuffed(ip, private);
+out:
+        brelse(indbh);
+        return error;
+}
+static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+                    struct gfs2_ea_location *el)
+{ /* Store @er: first try existing records via ea_set_simple(), otherwise add a new EA block. */
+        struct ea_set es;
+        unsigned int blks = 2;
+        int error;
+        memset(&es, 0, sizeof(struct ea_set));
+        es.es_er = er;
+        es.es_el = el;
+        error = ea_foreach(ip, ea_set_simple, &es);
+        if (error > 0) /* ea_set_simple() returned 1: the request has been stored */
+                return 0;
+        if (error)
+                return error;
+        if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+                blks++; /* ea_set_block() will also allocate the indirect block */
+        if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+                blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+        return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+}
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+                                   struct gfs2_ea_location *el)
+{ /* Remove the old unstuffed entry at @el after a replacing set; returns the ea_remove_unstuffed() result. */
+        if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { /* el_prev no longer directly precedes el_ea: advance it */
+                el->el_prev = GFS2_EA2NEXT(el->el_prev);
+                gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+                                     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+        }
+        return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{ /* Create or replace the attribute named in @er, honouring XATTR_CREATE / XATTR_REPLACE; returns errno. */
+        struct gfs2_ea_location el;
+        int error;
+        if (!ip->i_di.di_eattr) { /* inode has no EA block yet */
+                if (er->er_flags & XATTR_REPLACE)
+                        return -ENODATA;
+                return ea_init(ip, er);
+        }
+        error = gfs2_ea_find(ip, er, &el);
+        if (error)
+                return error;
+        if (el.el_ea) { /* attribute already exists */
+                if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+                        brelse(el.el_bh);
+                        return -EPERM;
+                }
+                error = -EEXIST; /* result when XATTR_CREATE was requested */
+                if (!(er->er_flags & XATTR_CREATE)) {
+                        int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); /* sampled before ea_set_i() modifies the entry */
+                        error = ea_set_i(ip, er, &el);
+                        if (!error && unstuffed)
+                                ea_set_remove_unstuffed(ip, &el); /* NOTE(review): return value is ignored */
+                }
+                brelse(el.el_bh);
+        } else {
+                error = -ENODATA; /* result when XATTR_REPLACE was requested */
+                if (!(er->er_flags & XATTR_REPLACE))
+                        error = ea_set_i(ip, er, NULL);
+        }
+        return error;
+}
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{ /* Validate @er, take the inode glock exclusively and dispatch to the eo_set op for er_type; returns errno. */
+        struct gfs2_holder i_gh;
+        int error;
+        if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+                return -EINVAL;
+        if (!er->er_data || !er->er_data_len) { /* normalise "no data" to NULL pointer and zero length */
+                er->er_data = NULL;
+                er->er_data_len = 0;
+        }
+        error = ea_check_size(GFS2_SB(&ip->i_inode), er);
+        if (error)
+                return error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                return error;
+        if (IS_IMMUTABLE(&ip->i_inode))
+                error = -EPERM;
+        else
+                error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); /* er_type indexes gfs2_ea_ops without a range check here */
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{ /* Remove the stuffed entry at @el in its own transaction and update the dinode ctime; returns errno. */
+        struct gfs2_ea_header *ea = el->el_ea;
+        struct gfs2_ea_header *prev = el->el_prev;
+        struct buffer_head *dibh;
+        int error;
+        error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+        if (error)
+                return error;
+        gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+        if (prev) { /* merge the record into its predecessor */
+                u32 len;
+                len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+                prev->ea_rec_len = cpu_to_be32(len);
+                if (GFS2_EA_IS_LAST(ea))
+                        prev->ea_flags |= GFS2_EAFLAG_LAST;
+        } else
+                ea->ea_type = GFS2_EATYPE_UNUSED; /* no predecessor: keep the record but mark it unused */
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                ip->i_di.di_ctime = get_seconds();
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        gfs2_trans_end(GFS2_SB(&ip->i_inode));
+        return error;
+}
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{ /* Find the attribute named in @er and remove it; -ENODATA if the inode has no EA block or no such entry. */
+        struct gfs2_ea_location el;
+        int error;
+        if (!ip->i_di.di_eattr)
+                return -ENODATA;
+        error = gfs2_ea_find(ip, er, &el);
+        if (error)
+                return error;
+        if (!el.el_ea)
+                return -ENODATA;
+        if (GFS2_EA_IS_STUFFED(el.el_ea))
+                error = ea_remove_stuffed(ip, &el);
+        else
+                error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
+                                            0);
+        brelse(el.el_bh); /* drop the buffer reference returned through el by gfs2_ea_find() */
+        return error;
+}
+/**
+ * gfs2_ea_remove - removes an extended attribute
+ * @ip: pointer to the inode of the target file
+ * @er: request information
+ *
+ * Returns: errno
+ */
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+        struct gfs2_holder i_gh;
+        int error;
+        if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+                return -EINVAL;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                return error;
+        if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) /* unlike gfs2_ea_set(), append-only is rejected here too */
+                error = -EPERM;
+        else
+                error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); /* dispatch on er_type */
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+                                  struct gfs2_ea_header *ea, char *data)
+{ /* Overwrite the data blocks of unstuffed @ea with @data; on success the transaction stays open for the caller. */
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct buffer_head **bh;
+        unsigned int amount = GFS2_EA_DATA_LEN(ea);
+        unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+        u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+        unsigned int x;
+        int error;
+        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+        if (!bh)
+                return -ENOMEM;
+        error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+        if (error)
+                goto out; /* no transaction was opened, so skip gfs2_trans_end() */
+        for (x = 0; x < nptrs; x++) { /* pass 1: start a read of every data block (flags 0; waited on in pass 2) */
+                error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+                                       bh + x);
+                if (error) {
+                        while (x--) /* release the buffers obtained so far */
+                                brelse(bh[x]);
+                        goto fail;
+                }
+                dataptrs++;
+        }
+        for (x = 0; x < nptrs; x++) { /* pass 2: wait, verify type, copy, release */
+                error = gfs2_meta_wait(sdp, bh[x]);
+                if (error) {
+                        for (; x < nptrs; x++) /* buffers before x were already released below */
+                                brelse(bh[x]);
+                        goto fail;
+                }
+                if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+                        for (; x < nptrs; x++)
+                                brelse(bh[x]);
+                        error = -EIO;
+                        goto fail;
+                }
+                gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+                memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+                       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+                amount -= sdp->sd_jbsize; /* may wrap on the final block; not read again after the loop */
+                data += sdp->sd_jbsize;
+                brelse(bh[x]);
+        }
+out: /* reached on success and when gfs2_trans_begin() failed */
+        kfree(bh);
+        return error;
+fail: /* error after the transaction was opened: end it here */
+        gfs2_trans_end(sdp);
+        kfree(bh);
+        return error;
+}
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                      struct iattr *attr, char *data)
+{ /* Overwrite the EA payload at @el with @data and apply @attr to the inode in one transaction; returns errno. */
+        struct buffer_head *dibh;
+        int error;
+        if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+                error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+                if (error)
+                        return error;
+                gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+                memcpy(GFS2_EA2DATA(el->el_ea), data,
+                       GFS2_EA_DATA_LEN(el->el_ea)); /* copies exactly the existing data length */
+        } else
+                error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); /* opens the transaction itself; ends it only on failure */
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                error = inode_setattr(&ip->i_inode, attr);
+                gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+                gfs2_inode_attr_out(ip);
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        gfs2_trans_end(GFS2_SB(&ip->i_inode)); /* closes the transaction opened on either branch above */
+        return error;
+}
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{ /* Free every block referenced from the EA indirect block and clear GFS2_DIF_EA_INDIRECT; returns errno. */
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrp_list rlist;
+        struct buffer_head *indbh, *dibh;
+        u64 *eablk, *end;
+        unsigned int rg_blocks = 0;
+        u64 bstart = 0;
+        unsigned int blen = 0;
+        unsigned int blks = 0;
+        unsigned int x;
+        int error;
+        memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+        error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+        if (error)
+                return error;
+        if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+                error = -EIO;
+                goto out;
+        }
+        eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+        end = eablk + sdp->sd_inptrs;
+        for (; eablk < end; eablk++) { /* pass 1: group pointers into contiguous runs and add each run's start to rlist */
+                u64 bn;
+                if (!*eablk) /* a zero pointer ends the list */
+                        break;
+                bn = be64_to_cpu(*eablk);
+                if (bstart + blen == bn)
+                        blen++;
+                else {
+                        if (bstart)
+                                gfs2_rlist_add(sdp, &rlist, bstart);
+                        bstart = bn;
+                        blen = 1;
+                }
+                blks++;
+        }
+        if (bstart)
+                gfs2_rlist_add(sdp, &rlist, bstart); /* the final run */
+        else
+                goto out; /* no pointers found: nothing to free, error is still 0 */
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        for (x = 0; x < rlist.rl_rgrps; x++) { /* sum ri_length over the listed resource groups for the transaction size */
+                struct gfs2_rgrpd *rgd;
+                rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+                rg_blocks += rgd->rd_ri.ri_length;
+        }
+        error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+        if (error)
+                goto out_rlist_free;
+        error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+                                 RES_STATFS + RES_QUOTA, blks);
+        if (error)
+                goto out_gunlock;
+        gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+        eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+        bstart = 0;
+        blen = 0;
+        for (; eablk < end; eablk++) { /* pass 2: same run detection, now freeing each run and zeroing the slots */
+                u64 bn;
+                if (!*eablk)
+                        break;
+                bn = be64_to_cpu(*eablk);
+                if (bstart + blen == bn)
+                        blen++;
+                else {
+                        if (bstart)
+                                gfs2_free_meta(ip, bstart, blen);
+                        bstart = bn;
+                        blen = 1;
+                }
+                *eablk = 0;
+                if (!ip->i_di.di_blocks)
+                        gfs2_consist_inode(ip);
+                ip->i_di.di_blocks--; /* decremented even when the check above fired */
+        }
+        if (bstart)
+                gfs2_free_meta(ip, bstart, blen); /* the final run */
+        ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        gfs2_trans_end(sdp);
+out_gunlock:
+        gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+        gfs2_rlist_free(&rlist);
+out:
+        brelse(indbh);
+        return error;
+}
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{ /* Free the single block referenced by di_eattr and zero di_eattr; returns errno. */
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_rgrpd *rgd;
+        struct buffer_head *dibh;
+        int error;
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+        if (!rgd) { /* no resource group found for di_eattr */
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+                                   &al->al_rgd_gh);
+        if (error)
+                return error;
+        error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+                                 RES_QUOTA, 1);
+        if (error)
+                goto out_gunlock;
+        gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+        ip->i_di.di_eattr = 0;
+        if (!ip->i_di.di_blocks)
+                gfs2_consist_inode(ip);
+        ip->i_di.di_blocks--; /* decremented even when the check above fired */
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!error) {
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        gfs2_trans_end(sdp);
+out_gunlock:
+        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+        return error;
+}
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+        struct gfs2_alloc *al;
+        int error;
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out_alloc;
+        error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+        if (error)
+                goto out_quota;
+        error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); /* step 1: per-entry pass with ea_dealloc_unstuffed() */
+        if (error)
+                goto out_rindex;
+        if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { /* step 2: blocks listed in the indirect block */
+                error = ea_dealloc_indirect(ip);
+                if (error)
+                        goto out_rindex;
+        }
+        error = ea_dealloc_block(ip); /* step 3: the block di_eattr itself points to */
+out_rindex:
+        gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+        gfs2_quota_unhold(ip);
+out_alloc:
+        gfs2_alloc_put(ip);
+        return error;
+}
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+struct gfs2_inode;
+struct iattr;
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+                                  (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
+      sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+#define GFS2_EA2DATAPTRS(ea) \
+((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+#define GFS2_ERF_MODE 0x80000000
+struct gfs2_ea_request { /* one get/set/remove/list request for an extended attribute */
+        const char *er_name;
+        char *er_data; /* gfs2_ea_set() resets this to NULL when there is no data */
+        unsigned int er_name_len; /* gfs2_ea_set()/gfs2_ea_remove() require 1..GFS2_EA_MAX_NAME_LEN */
+        unsigned int er_data_len;
+        unsigned int er_type; /* GFS2_EATYPE_... */
+        int er_flags; /* tested against XATTR_CREATE, XATTR_REPLACE and GFS2_ERF_MODE */
+        mode_t er_mode; /* copied to di_mode when er_flags has GFS2_ERF_MODE */
+};
+struct gfs2_ea_location { /* result of gfs2_ea_find() */
+        struct buffer_head *el_bh; /* buffer holding the entry; callers brelse() it when el_ea is set */
+        struct gfs2_ea_header *el_ea; /* the matching header; callers treat NULL as "not found" */
+        struct gfs2_ea_header *el_prev; /* header preceding el_ea, or NULL */
+};
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
+/* Exported to acl.c */
+int gfs2_ea_find(struct gfs2_inode *ip,
+                 struct gfs2_ea_request *er,
+                 struct gfs2_ea_location *el);
+int gfs2_ea_get_copy(struct gfs2_inode *ip,
+                     struct gfs2_ea_location *el,
+                     char *data);
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+                      struct iattr *attr, char *data);
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{ /* Length contribution of @ea by type: a fixed per-type constant + name length + 1; 0 for other types. */
+        switch (ea->ea_type) {
+        case GFS2_EATYPE_USR:
+                return 5 + ea->ea_name_len + 1; /* presumably strlen("user.") plus a terminator - confirm */
+        case GFS2_EATYPE_SYS:
+                return 7 + ea->ea_name_len + 1; /* presumably strlen("system.") plus a terminator - confirm */
+        case GFS2_EATYPE_SECURITY:
+                return 9 + ea->ea_name_len + 1; /* presumably strlen("security.") plus a terminator - confirm */
+        default:
+                return 0;
+        }
+}
+#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+enum {
+        NO_CREATE = 0,
+        CREATE = 1,
+};
+enum {
+        NO_WAIT = 0,
+        WAIT = 1,
+};
+enum {
+        NO_FORCE = 0,
+        FORCE = 1,
+};
+#define GFS2_FAST_NAME_SIZE 8
+#endif /* __GFS2_DOT_H__ */
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/list.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+#include "util.h"
+/* A holder bundled with a work item.  rq_greedy() recovers and frees the
+   enclosing structure from the embedded holder via container_of(). */
+struct greedy {
+        struct gfs2_holder gr_gh;
+        struct work_struct gr_work;
+};
+/* One chain of the global glock hash table. */
+struct gfs2_gl_hash_bucket {
+        struct hlist_head hb_list;
+};
+/* Callback type that is handed a single glock. */
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
+static int dump_glock(struct gfs2_glock *gl);
+static int dump_inode(struct gfs2_inode *ip);
+/* The glock hash table has 2^GFS2_GL_HASH_SHIFT chains; gl_hash() reduces
+   its result with GFS2_GL_HASH_MASK to index gl_hash_table[]. */
+#define GFS2_GL_HASH_SHIFT      15
+#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
+static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+/*
+ * Despite what you might think, the numbers below are not arbitrary :-)
+ * They are taken from the ipv4 routing hash code, which is well tested
+ * and thus should be nearly optimal. Later on we might tweak the numbers
+ * but for now this should be fine.
+ *
+ * The reason for putting the locks in a separate array from the list heads
+ * is that we can have fewer locks than list heads and save memory. We use
+ * the same hash function for both, but with a different hash mask.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+        defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_LOCKDEP
+# define GL_HASH_LOCK_SZ        256
+#else
+# if NR_CPUS >= 32
+#  define GL_HASH_LOCK_SZ       4096
+# elif NR_CPUS >= 16
+#  define GL_HASH_LOCK_SZ       2048
+# elif NR_CPUS >= 8
+#  define GL_HASH_LOCK_SZ       1024
+# elif NR_CPUS >= 4
+#  define GL_HASH_LOCK_SZ       512
+# else
+#  define GL_HASH_LOCK_SZ       256
+# endif
+#endif
+/* We never want more locks than chains */
+#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
+# undef GL_HASH_LOCK_SZ
+# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
+#endif
+static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
+/* Pick the rwlock that guards the hash chain for hash value @x; several
+   chains share one lock, selected by the low bits of the hash. */
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+        const unsigned int slot = x & (GL_HASH_LOCK_SZ - 1);
+        return &gl_hash_locks[slot];
+}
+#else /* not SMP, so no spinlocks required */
+/* No lock array exists in this configuration, so there is no address to
+   hand out.  The parameter is typed explicitly so the signature matches the
+   SMP variant instead of relying on an implicit-int declaration.
+   NOTE(review): callers pass the result straight to read_lock()/write_lock();
+   this relies on the UP lock macros not dereferencing it - confirm. */
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+        return NULL;
+}
+#endif
+/**
+ * relaxed_state_ok - can a request be satisfied by the lock mode we hold?
+ * @actual: the state the lock is currently in
+ * @requested: the state the caller asked for
+ * @flags: the caller's modifier flags
+ *
+ * An identical state always matches.  Otherwise, unless GL_EXACT is set,
+ * a shared request is satisfied by an exclusive lock, and LM_FLAG_ANY
+ * accepts any state other than unlocked.
+ *
+ * Returns: 1 if the locks are compatible, 0 otherwise
+ */
+static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
+                                   int flags)
+{
+        if (actual == requested)
+                return 1;
+        if (flags & GL_EXACT)
+                return 0;
+        return (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) ||
+               (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY));
+}
+/**
+ * gl_hash() - Turn a glock name into a hash bucket number
+ * @sdp: The GFS2 superblock the glock belongs to
+ * @name: The lock name (number and type)
+ *
+ * The lock number, the lock type and the superblock pointer value are
+ * chained through jhash() and the result is reduced to the table size.
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+static unsigned int gl_hash(const struct gfs2_sbd *sdp,
+                            const struct lm_lockname *name)
+{
+        unsigned int hash = jhash(&name->ln_number, sizeof(u64), 0);
+        hash = jhash(&name->ln_type, sizeof(unsigned int), hash);
+        /* Mix in the pointer itself, not the superblock contents. */
+        hash = jhash(&sdp, sizeof(struct gfs2_sbd *), hash);
+        return hash & GFS2_GL_HASH_MASK;
+}
+/**
+ * glock_free() - Release a struct gfs2_glock and what hangs off it
+ * @gl: The glock to release
+ *
+ * Hands the lock module's structure for this glock back, drops the
+ * address-space inode if the glock has one, and returns the glock to
+ * its slab cache.
+ *
+ */
+static void glock_free(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sbd = gl->gl_sbd;
+        struct inode *mapping_inode = gl->gl_aspace;
+        gfs2_lm_put_lock(sbd, gl->gl_lock);
+        if (mapping_inode != NULL)
+                gfs2_aspace_put(mapping_inode);
+        kmem_cache_free(gfs2_glock_cachep, gl);
+}
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ * Atomically bumps gl_ref; gfs2_glock_put() is the call that decrements it.
+ *
+ */
+void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+        atomic_inc(&gl->gl_ref);
+}
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * When the count reaches zero the glock is removed from its hash chain,
+ * checked for leftover state and freed.
+ *
+ * Returns: 1 if the glock was freed, 0 if references remain
+ */
+int gfs2_glock_put(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        rwlock_t *chain_lock = gl_lock_addr(gl->gl_hash);
+        write_lock(chain_lock);
+        if (!atomic_dec_and_test(&gl->gl_ref)) {
+                write_unlock(chain_lock);
+                return 0;
+        }
+        /* Last reference: unhash while the chain lock is still held so
+           that lookups can no longer find the glock. */
+        hlist_del(&gl->gl_list);
+        write_unlock(chain_lock);
+        /* A glock that is going away must be idle in every respect. */
+        BUG_ON(spin_is_locked(&gl->gl_spin));
+        gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+        gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+        gfs2_assert(sdp, list_empty(&gl->gl_holders));
+        gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
+        gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
+        gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+        glock_free(gl);
+        return 1;
+}
+/**
+ * queue_empty - test one of a glock's queues for emptiness under gl_spin
+ * @gl: the glock
+ * @head: the head of the queue to check
+ *
+ * The glmutex normally keeps processes from working on the same glock
+ * at once, but a process adding a second holder for itself ("recursive"
+ * locking) does not take glmutex.  The list is therefore sampled with
+ * the spin lock held.
+ *
+ * Returns: 1 if the queue is empty
+ */
+static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
+{
+        int is_empty;
+        spin_lock(&gl->gl_spin);
+        is_empty = list_empty(head);
+        spin_unlock(&gl->gl_spin);
+        return is_empty;
+}
+/**
+ * search_bucket() - Find struct gfs2_glock by lock name in one hash chain
+ * @hash: index of the hash chain to search
+ * @sdp: the superblock the glock must belong to
+ * @name: The lock name
+ *
+ * A glock that is found has its reference count incremented before it
+ * is returned.  The function takes no locks itself; the callers in this
+ * file hold the chain's rwlock around the call.
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested name
+ */
+static struct gfs2_glock *search_bucket(unsigned int hash,
+                                        const struct gfs2_sbd *sdp,
+                                        const struct lm_lockname *name)
+{
+        struct gfs2_glock *cand;
+        struct hlist_node *pos;
+        hlist_for_each_entry(cand, pos, &gl_hash_table[hash].hb_list, gl_list) {
+                if (lm_name_equal(&cand->gl_name, name) &&
+                    cand->gl_sbd == sdp) {
+                        atomic_inc(&cand->gl_ref);
+                        return cand;
+                }
+        }
+        return NULL;
+}
+/**
+ * gfs2_glock_find() - Find glock by lock name
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * Looks the name up in its hash chain under the chain's read lock.  A
+ * glock that is found comes back with an extra reference.
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested name
+ */
+static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
+                                          const struct lm_lockname *name)
+{
+        const unsigned int hash = gl_hash(sdp, name);
+        rwlock_t *chain_lock = gl_lock_addr(hash);
+        struct gfs2_glock *found;
+        read_lock(chain_lock);
+        found = search_bucket(hash, sdp, name);
+        read_unlock(chain_lock);
+        return found;
+}
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ * On success *@glp is a glock carrying a reference for the caller, or NULL
+ * if none exists and @create is 0.  On error *@glp is not written.
+ *
+ * Returns: errno
+ */
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+                   const struct gfs2_glock_operations *glops, int create,
+                   struct gfs2_glock **glp)
+{
+        struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+        struct gfs2_glock *gl, *tmp;
+        unsigned int hash = gl_hash(sdp, &name);
+        int error;
+        /* Fast path: look for an existing glock under the read lock. */
+        read_lock(gl_lock_addr(hash));
+        gl = search_bucket(hash, sdp, &name);
+        read_unlock(gl_lock_addr(hash));
+        if (gl || !create) {
+                *glp = gl;
+                return 0;
+        }
+        /* Not found: build a new glock outside of any hash lock. */
+        gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+        if (!gl)
+                return -ENOMEM;
+        gl->gl_flags = 0;
+        gl->gl_name = name;
+        atomic_set(&gl->gl_ref, 1);
+        gl->gl_state = LM_ST_UNLOCKED;
+        gl->gl_hash = hash;
+        gl->gl_owner = NULL;
+        gl->gl_ip = 0;
+        gl->gl_ops = glops;
+        gl->gl_req_gh = NULL;
+        gl->gl_req_bh = NULL;
+        gl->gl_vn = 0;
+        gl->gl_stamp = jiffies;
+        gl->gl_object = NULL;
+        gl->gl_sbd = sdp;
+        gl->gl_aspace = NULL;
+        lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+        /* If this glock protects actual on-disk data or metadata blocks,
+           create a VFS inode to manage the pages/buffers holding them. */
+        if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
+                gl->gl_aspace = gfs2_aspace_get(sdp);
+                if (!gl->gl_aspace) {
+                        error = -ENOMEM;
+                        goto fail;
+                }
+        }
+        error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
+        if (error)
+                goto fail_aspace;
+        /* The read lock was dropped above, so somebody else may have
+           inserted the same glock meanwhile: search again under the write
+           lock and throw our copy away if we lost the race. */
+        write_lock(gl_lock_addr(hash));
+        tmp = search_bucket(hash, sdp, &name);
+        if (tmp) {
+                write_unlock(gl_lock_addr(hash));
+                glock_free(gl);
+                gl = tmp;
+        } else {
+                hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
+                write_unlock(gl_lock_addr(hash));
+        }
+        *glp = gl;
+        return 0;
+        /* Error unwinding: release only what was set up so far. */
+fail_aspace:
+        if (gl->gl_aspace)
+                gfs2_aspace_put(gl->gl_aspace);
+fail:
+        kmem_cache_free(gfs2_glock_cachep, gl);
+        return error;
+}
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Takes a reference on @gl; gfs2_holder_uninit() is the call that drops it.
+ *
+ */
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+                      struct gfs2_holder *gh)
+{
+        INIT_LIST_HEAD(&gh->gh_list);
+        gh->gh_gl = gl;
+        /* Remember who initialised the holder (return address and task). */
+        gh->gh_ip = (unsigned long)__builtin_return_address(0);
+        gh->gh_owner = current;
+        gh->gh_state = state;
+        gh->gh_flags = flags;
+        gh->gh_error = 0;
+        gh->gh_iflags = 0;
+        init_completion(&gh->gh_wait);
+        /* An exclusive request always gets GL_LOCAL_EXCL as well. */
+        if (gh->gh_state == LM_ST_EXCLUSIVE)
+                gh->gh_flags |= GL_LOCAL_EXCL;
+        gfs2_glock_hold(gl);
+}
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock: gh_gl and the reference it carries are left
+ * as they are.
+ *
+ */
+void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+{
+        gh->gh_state = state;
+        gh->gh_flags = flags;
+        if (gh->gh_state == LM_ST_EXCLUSIVE)
+                gh->gh_flags |= GL_LOCAL_EXCL;
+        /* Clear every internal flag except HIF_ALLOCED. */
+        gh->gh_iflags &= 1 << HIF_ALLOCED;
+        gh->gh_ip = (unsigned long)__builtin_return_address(0);
+}
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ * Drops the holder's glock reference and clears gh_gl and gh_ip.  The
+ * holder's memory itself is not freed here.
+ *
+ */
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+        gfs2_glock_put(gh->gh_gl);
+        gh->gh_gl = NULL;
+        gh->gh_ip = 0;
+}
+/**
+ * gfs2_holder_get - get a struct gfs2_holder structure
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gfp_flags: allocation flags handed to kmalloc()
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
+ * 2) Leave it like it is
+ *
+ * Returns: the holder structure, NULL on ENOMEM
+ */
+static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
+                                           unsigned int state,
+                                           int flags, gfp_t gfp_flags)
+{
+        struct gfs2_holder *gh;
+        gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
+        if (!gh)
+                return NULL;
+        gfs2_holder_init(gl, state, flags, gh);
+        /* Mark the holder as heap allocated. */
+        set_bit(HIF_ALLOCED, &gh->gh_iflags);
+        /* Overwrite the address recorded by gfs2_holder_init() with the
+           return address of this function. */
+        gh->gh_ip = (unsigned long)__builtin_return_address(0);
+        return gh;
+}
+/**
+ * gfs2_holder_put - get rid of a struct gfs2_holder structure
+ * @gh: the holder structure
+ *
+ * Uninitializes the holder (dropping its glock reference) and kfree()s it,
+ * so @gh must have been allocated with kmalloc() as gfs2_holder_get() does.
+ *
+ */
+static void gfs2_holder_put(struct gfs2_holder *gh)
+{
+        gfs2_holder_uninit(gh);
+        kfree(gh);
+}
+/**
+ * rq_mutex - process a mutex request in the queue
+ * @gh: the glock holder
+ *
+ * Dequeues the holder, marks the glock locked (GLF_LOCK) and wakes the
+ * waiter.
+ *
+ * Returns: 1 if the queue is blocked (always, since GLF_LOCK is now set)
+ */
+static int rq_mutex(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        list_del_init(&gh->gh_list);
+        /*  gh->gh_error never examined.  */
+        set_bit(GLF_LOCK, &gl->gl_flags);
+        complete(&gh->gh_wait);
+        return 1;
+}
+/**
+ * rq_promote - process a promote request in the queue
+ * @gh: the glock holder
+ *
+ * Acquire a new inter-node lock, or change a lock state to more restrictive.
+ * Entered with gl_spin held; the lock is dropped and retaken around the call
+ * into the glock operations.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+static int rq_promote(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        /* The current state cannot satisfy the request: if nobody holds the
+           glock, start a state change; either way the queue is blocked. */
+        if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+                if (list_empty(&gl->gl_holders)) {
+                        gl->gl_req_gh = gh;
+                        set_bit(GLF_LOCK, &gl->gl_flags);
+                        spin_unlock(&gl->gl_spin);
+                        /* Over the reclaim limit: reclaim two glocks first,
+                           unless this is a priority request. */
+                        if (atomic_read(&sdp->sd_reclaim_count) >
+                            gfs2_tune_get(sdp, gt_reclaim_limit) &&
+                            !(gh->gh_flags & LM_FLAG_PRIORITY)) {
+                                gfs2_reclaim_glock(sdp);
+                                gfs2_reclaim_glock(sdp);
+                        }
+                        glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+                        spin_lock(&gl->gl_spin);
+                }
+                return 1;
+        }
+        /* The state is compatible.  The first holder also takes GLF_LOCK;
+           later holders are refused if either side is GL_LOCAL_EXCL. */
+        if (list_empty(&gl->gl_holders)) {
+                set_bit(HIF_FIRST, &gh->gh_iflags);
+                set_bit(GLF_LOCK, &gl->gl_flags);
+        } else {
+                struct gfs2_holder *next_gh;
+                if (gh->gh_flags & GL_LOCAL_EXCL)
+                        return 1;
+                next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
+                                     gh_list);
+                if (next_gh->gh_flags & GL_LOCAL_EXCL)
+                         return 1;
+        }
+        /* Grant: move the request onto the holders list and wake it. */
+        list_move_tail(&gh->gh_list, &gl->gl_holders);
+        gh->gh_error = 0;
+        set_bit(HIF_HOLDER, &gh->gh_iflags);
+        complete(&gh->gh_wait);
+        return 0;
+}
+/**
+ * rq_demote - process a demote request in the queue
+ * @gh: the glock holder
+ *
+ * Entered with gl_spin held; the lock is dropped and retaken around the
+ * completion and around the calls into the glock operations.
+ *
+ * Returns: 1 if the queue is blocked (the glock still has holders), else 0.
+ * When a state change is started GLF_LOCK is set, which is what stops
+ * run_queue() on its next iteration.
+ */
+static int rq_demote(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        /* Nothing can be demoted while the glock is held. */
+        if (!list_empty(&gl->gl_holders))
+                return 1;
+        if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
+                /* Already in the wanted state (or unlocked): the request
+                   is complete without calling the lock module. */
+                list_del_init(&gh->gh_list);
+                gh->gh_error = 0;
+                spin_unlock(&gl->gl_spin);
+                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                        gfs2_holder_put(gh);
+                else
+                        complete(&gh->gh_wait);
+                spin_lock(&gl->gl_spin);
+        } else {
+                /* Start the state change: a full drop when unlocking or when
+                   the glock is not held exclusively, otherwise a change to
+                   the requested state. */
+                gl->gl_req_gh = gh;
+                set_bit(GLF_LOCK, &gl->gl_flags);
+                spin_unlock(&gl->gl_spin);
+                if (gh->gh_state == LM_ST_UNLOCKED ||
+                    gl->gl_state != LM_ST_EXCLUSIVE)
+                        glops->go_drop_th(gl);
+                else
+                        glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+                spin_lock(&gl->gl_spin);
+        }
+        return 0;
+}
+/**
+ * rq_greedy - process a queued request to drop greedy status
+ * @gh: the glock holder, embedded in a struct greedy
+ *
+ * Clears GLF_GREEDY and frees the struct greedy that contains @gh.
+ * Entered with gl_spin held; the lock is dropped while the holder is torn
+ * down and retaken before returning.
+ *
+ * Returns: 1 if the queue is blocked (never the case here: always 0)
+ */
+static int rq_greedy(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        list_del_init(&gh->gh_list);
+        /*  gh->gh_error never examined.  */
+        clear_bit(GLF_GREEDY, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        gfs2_holder_uninit(gh);
+        kfree(container_of(gh, struct greedy, gr_gh));
+        spin_lock(&gl->gl_spin);
+        return 0;
+}
+/**
+ * run_queue - process holder structures on a glock
+ * @gl: the glock
+ *
+ * Handles the request at the head of the first non-empty queue, in the
+ * order gl_waiters1 (mutex requests), gl_waiters2 (demote/greedy requests,
+ * skipped while GLF_SKIP_WAITERS2 is set) and gl_waiters3 (promote
+ * requests).  It repeats until GLF_LOCK is set, all usable queues are
+ * empty, or a handler reports the queue as blocked.  The handlers drop and
+ * retake gl_spin, so callers hold it across the call.
+ *
+ */
+static void run_queue(struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        int blocked = 1;
+        for (;;) {
+                if (test_bit(GLF_LOCK, &gl->gl_flags))
+                        break;
+                if (!list_empty(&gl->gl_waiters1)) {
+                        gh = list_entry(gl->gl_waiters1.next,
+                                        struct gfs2_holder, gh_list);
+                        if (test_bit(HIF_MUTEX, &gh->gh_iflags))
+                                blocked = rq_mutex(gh);
+                        else
+                                gfs2_assert_warn(gl->gl_sbd, 0);
+                } else if (!list_empty(&gl->gl_waiters2) &&
+                           !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
+                        gh = list_entry(gl->gl_waiters2.next,
+                                        struct gfs2_holder, gh_list);
+                        if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
+                                blocked = rq_demote(gh);
+                        else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
+                                blocked = rq_greedy(gh);
+                        else
+                                gfs2_assert_warn(gl->gl_sbd, 0);
+                } else if (!list_empty(&gl->gl_waiters3)) {
+                        gh = list_entry(gl->gl_waiters3.next,
+                                        struct gfs2_holder, gh_list);
+                        if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
+                                blocked = rq_promote(gh);
+                        else
+                                gfs2_assert_warn(gl->gl_sbd, 0);
+                } else
+                        break;
+                if (blocked)
+                        break;
+        }
+}
+/**
+ * gfs2_glmutex_lock - acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Gives caller exclusive access to manipulate a glock structure.
+ * Sleeps until GLF_LOCK has been obtained.
+ */
+static void gfs2_glmutex_lock(struct gfs2_glock *gl)
+{
+        /* On-stack holder, used only to wait our turn. */
+        struct gfs2_holder gh;
+        gfs2_holder_init(gl, 0, 0, &gh);
+        set_bit(HIF_MUTEX, &gh.gh_iflags);
+        spin_lock(&gl->gl_spin);
+        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+                /* Already locked: queue up; rq_mutex() completes gh_wait
+                   when this request reaches the head of the queue. */
+                list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+        } else {
+                /* Got it at once: record the owner and complete our own
+                   completion so the wait below returns immediately. */
+                gl->gl_owner = current;
+                gl->gl_ip = (unsigned long)__builtin_return_address(0);
+                complete(&gh.gh_wait);
+        }
+        spin_unlock(&gl->gl_spin);
+        wait_for_completion(&gh.gh_wait);
+        gfs2_holder_uninit(&gh);
+}
+/**
+ * gfs2_glmutex_trylock - try to acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Never sleeps: if GLF_LOCK is already set the attempt simply fails.
+ * On success the calling task and its return address are recorded as
+ * the owner.
+ *
+ * Returns: 1 if the glock is acquired
+ */
+static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
+{
+        int got_it;
+        spin_lock(&gl->gl_spin);
+        got_it = !test_and_set_bit(GLF_LOCK, &gl->gl_flags);
+        if (got_it) {
+                gl->gl_owner = current;
+                gl->gl_ip = (unsigned long)__builtin_return_address(0);
+        }
+        spin_unlock(&gl->gl_spin);
+        return got_it;
+}
+/**
+ * gfs2_glmutex_unlock - release a local lock on a glock
+ * @gl: the glock
+ *
+ * Clears GLF_LOCK and the owner bookkeeping, then runs the queue so that
+ * the next waiting request can make progress.
+ *
+ */
+static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
+{
+        spin_lock(&gl->gl_spin);
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        gl->gl_owner = NULL;
+        gl->gl_ip = 0;
+        run_queue(gl);
+        /* The queue handlers drop and retake gl_spin; check that it came
+           back held.  assert_spin_locked() replaces an open-coded
+           BUG_ON(!spin_is_locked()), because spin_is_locked() is constant 0
+           on UP builds without spinlock debugging and the open-coded check
+           would then fire on every call. */
+        assert_spin_locked(&gl->gl_spin);
+        spin_unlock(&gl->gl_spin);
+}
+/**
+ * handle_callback - add a demote request to a lock's queue
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * If a demote request that is not currently being processed is already
+ * queued, it is reused: when it asks for a different state it is turned
+ * into a request to unlock.  Otherwise a new self-freeing (HIF_DEALLOC)
+ * demote holder is queued.
+ *
+ * Note: This may fail silently if we are out of memory.
+ */
+static void handle_callback(struct gfs2_glock *gl, unsigned int state)
+{
+        struct gfs2_holder *gh, *new_gh = NULL;
+restart:
+        spin_lock(&gl->gl_spin);
+        list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+                if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
+                    gl->gl_req_gh != gh) {
+                        if (gh->gh_state != state)
+                                gh->gh_state = LM_ST_UNLOCKED;
+                        goto out;
+                }
+        }
+        if (new_gh) {
+                /* Second pass: still no usable request, queue ours. */
+                list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
+                new_gh = NULL;
+        } else {
+                /* First pass: allocate with the spin lock dropped, then
+                   rescan because the queue may have changed meanwhile. */
+                spin_unlock(&gl->gl_spin);
+                new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
+                if (!new_gh)
+                        return;
+                set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
+                set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+                goto restart;
+        }
+out:
+        spin_unlock(&gl->gl_spin);
+        /* An allocated holder that turned out to be unnecessary is freed. */
+        if (new_gh)
+                gfs2_holder_put(new_gh);
+}
+/**
+ * gfs2_glock_inode_squish - demote an inode's glock to unlocked and wait
+ * @inode: the inode whose glock (GFS2_I(inode)->i_gl) is to be dropped
+ *
+ * Queues an on-stack demote request for LM_ST_UNLOCKED, runs the queue and
+ * sleeps until the request has completed.  The glock must have no holders.
+ */
+void gfs2_glock_inode_squish(struct inode *inode)
+{
+        struct gfs2_holder gh;
+        struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+        gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
+        set_bit(HIF_DEMOTE, &gh.gh_iflags);
+        spin_lock(&gl->gl_spin);
+        gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
+        list_add_tail(&gh.gh_list, &gl->gl_waiters2);
+        run_queue(gl);
+        spin_unlock(&gl->gl_spin);
+        wait_for_completion(&gh.gh_wait);
+        gfs2_holder_uninit(&gh);
+}
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ * A glock in any state other than LM_ST_UNLOCKED carries one extra
+ * reference.  The reference is taken on the transition away from
+ * unlocked and dropped on the transition back to it.
+ *
+ */
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+        const int was_held = (gl->gl_state != LM_ST_UNLOCKED);
+        const int will_hold = (new_state != LM_ST_UNLOCKED);
+        if (!was_held && will_hold)
+                gfs2_glock_hold(gl);
+        else if (was_held && !will_hold)
+                gfs2_glock_put(gl);
+        gl->gl_state = new_state;
+}
+/**
+ * xmote_bh - Called after the lock module is done acquiring a lock
+ * @gl: The glock in question
+ * @ret: the int returned from the lock module
+ *
+ * Records the new state, invalidates cached data where required, settles
+ * the fate of the request that triggered the change (gl_req_gh, if any),
+ * drops one glock reference and finally wakes or frees the requesting
+ * holder.
+ *
+ */
+static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh = gl->gl_req_gh;
+        int prev_state = gl->gl_state;
+        /* Stays 1 unless the request becomes a holder; in that one case
+           GLF_LOCK is left set and the queue is not run below. */
+        int op_done = 1;
+        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
+        state_change(gl, ret & LM_OUT_ST_MASK);
+        if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
+                if (glops->go_inval)
+                        glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+        } else if (gl->gl_state == LM_ST_DEFERRED) {
+                /* We might not want to do this here.
+                   Look at moving to the inode glops. */
+                if (glops->go_inval)
+                        glops->go_inval(gl, DIO_DATA);
+        }
+        /*  Deal with each possible exit condition  */
+        if (!gh)
+                /* No request behind this change: just timestamp it. */
+                gl->gl_stamp = jiffies;
+        else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+                /* Filesystem is shut down: fail the request. */
+                spin_lock(&gl->gl_spin);
+                list_del_init(&gh->gh_list);
+                gh->gh_error = -EIO;
+                spin_unlock(&gl->gl_spin);
+        } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
+                /* Demote request: success if the wanted state (or unlocked)
+                   was reached, otherwise report a failed try. */
+                spin_lock(&gl->gl_spin);
+                list_del_init(&gh->gh_list);
+                if (gl->gl_state == gh->gh_state ||
+                    gl->gl_state == LM_ST_UNLOCKED) {
+                        gh->gh_error = 0;
+                } else {
+                        if (gfs2_assert_warn(sdp, gh->gh_flags &
+                                        (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
+                                fs_warn(sdp, "ret = 0x%.8X\n", ret);
+                        gh->gh_error = GLR_TRYFAILED;
+                }
+                spin_unlock(&gl->gl_spin);
+                /* A cancelled demote is re-requested as a full unlock. */
+                if (ret & LM_OUT_CANCELED)
+                        handle_callback(gl, LM_ST_UNLOCKED);
+        } else if (ret & LM_OUT_CANCELED) {
+                spin_lock(&gl->gl_spin);
+                list_del_init(&gh->gh_list);
+                gh->gh_error = GLR_CANCELED;
+                spin_unlock(&gl->gl_spin);
+        } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+                /* The request is satisfied: it becomes the first holder. */
+                spin_lock(&gl->gl_spin);
+                list_move_tail(&gh->gh_list, &gl->gl_holders);
+                gh->gh_error = 0;
+                set_bit(HIF_HOLDER, &gh->gh_iflags);
+                spin_unlock(&gl->gl_spin);
+                set_bit(HIF_FIRST, &gh->gh_iflags);
+                op_done = 0;
+        } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+                spin_lock(&gl->gl_spin);
+                list_del_init(&gh->gh_list);
+                gh->gh_error = GLR_TRYFAILED;
+                spin_unlock(&gl->gl_spin);
+        } else {
+                /* None of the expected outcomes matched. */
+                if (gfs2_assert_withdraw(sdp, 0) == -1)
+                        fs_err(sdp, "ret = 0x%.8X\n", ret);
+        }
+        if (glops->go_xmote_bh)
+                glops->go_xmote_bh(gl);
+        if (op_done) {
+                spin_lock(&gl->gl_spin);
+                gl->gl_req_gh = NULL;
+                gl->gl_req_bh = NULL;
+                clear_bit(GLF_LOCK, &gl->gl_flags);
+                run_queue(gl);
+                spin_unlock(&gl->gl_spin);
+        }
+        /* NOTE(review): presumably pairs with the gfs2_glock_hold() taken in
+           gfs2_glock_xmote_th() - confirm. */
+        gfs2_glock_put(gl);
+        if (gh) {
+                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                        gfs2_holder_put(gh);
+                else
+                        complete(&gh->gh_wait);
+        }
+}
+/**
+ * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
+ * @gl: The glock in question
+ * @state: the requested state
+ * @flags: modifier flags to the lock call
+ *
+ * Expects GLF_LOCK to be set and the glock to have no holders.  If the lock
+ * module answers synchronously, xmote_bh() is run directly; if it answers
+ * LM_OUT_ASYNC, the bottom half stored in gl_req_bh is left for later.
+ *
+ */
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        /* Only these flags are passed on to the lock module. */
+        int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
+                                 LM_FLAG_NOEXP | LM_FLAG_ANY |
+                                 LM_FLAG_PRIORITY);
+        unsigned int lck_ret;
+        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
+        gfs2_assert_warn(sdp, state != gl->gl_state);
+        /* Leaving the exclusive state: sync before the lock changes. */
+        if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+                glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+        /* Reference for the duration of the request; xmote_bh() ends with a
+           gfs2_glock_put(). */
+        gfs2_glock_hold(gl);
+        gl->gl_req_bh = xmote_bh;
+        lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
+        if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
+                return;
+        if (lck_ret & LM_OUT_ASYNC)
+                gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
+        else
+                xmote_bh(gl, lck_ret);
+}
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Marks the glock unlocked, invalidates cached data, clears GLF_LOCK and
+ * runs the queue.  It then drops one glock reference and, if a request
+ * (gl_req_gh) was behind the unlock, frees that holder when it is
+ * HIF_DEALLOC or otherwise wakes the process waiting on it.
+ *
+ */
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        struct gfs2_holder *gh = gl->gl_req_gh;
+        clear_bit(GLF_PREFETCH, &gl->gl_flags);
+        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, !ret);
+        state_change(gl, LM_ST_UNLOCKED);
+        if (glops->go_inval)
+                glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+        if (gh) {
+                /* The request is done: dequeue it and report success. */
+                spin_lock(&gl->gl_spin);
+                list_del_init(&gh->gh_list);
+                gh->gh_error = 0;
+                spin_unlock(&gl->gl_spin);
+        }
+        if (glops->go_drop_bh)
+                glops->go_drop_bh(gl);
+        spin_lock(&gl->gl_spin);
+        gl->gl_req_gh = NULL;
+        gl->gl_req_bh = NULL;
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        run_queue(gl);
+        spin_unlock(&gl->gl_spin);
+        /* NOTE(review): presumably pairs with the gfs2_glock_hold() taken in
+           gfs2_glock_drop_th() - confirm. */
+        gfs2_glock_put(gl);
+        if (gh) {
+                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                        gfs2_holder_put(gh);
+                else
+                        complete(&gh->gh_wait);
+        }
+}
+/**
+ * gfs2_glock_drop_th - call into the lock module to unlock a lock
+ * @gl: the glock
+ *
+ * Expects GLF_LOCK to be set, no holders and a state other than unlocked.
+ * If the lock module returns 0, drop_bh() is run directly; the only other
+ * accepted non-error result is LM_OUT_ASYNC, in which case the bottom half
+ * stored in gl_req_bh is left for later.
+ *
+ */
+void gfs2_glock_drop_th(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        unsigned int ret;
+        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
+        /* Leaving the exclusive state: sync before the lock is dropped. */
+        if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+                glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+        /* Reference for the duration of the request; drop_bh() ends with a
+           gfs2_glock_put(). */
+        gfs2_glock_hold(gl);
+        gl->gl_req_bh = drop_bh;
+        ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
+        if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
+                return;
+        if (!ret)
+                drop_bh(gl, ret);
+        else
+                gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
+}
+/**
+ * do_cancels - cancel requests for locks stuck waiting on an expire flag
+ * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
+ *
+ * Polls every 100ms for as long as @gh is still queued, is not yet a
+ * holder and is not the request currently being processed.  On each
+ * round an outstanding lock module request is cancelled, unless it
+ * belongs to a holder flagged GL_NOCANCEL.
+ */
+static void do_cancels(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        int cancel;
+        spin_lock(&gl->gl_spin);
+        while (gl->gl_req_gh != gh &&
+               !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+               !list_empty(&gh->gh_list)) {
+                /* Decide under the spin lock, act with it dropped. */
+                cancel = gl->gl_req_bh &&
+                         !(gl->gl_req_gh &&
+                           (gl->gl_req_gh->gh_flags & GL_NOCANCEL));
+                spin_unlock(&gl->gl_spin);
+                if (cancel)
+                        gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+                msleep(100);
+                spin_lock(&gl->gl_spin);
+        }
+        spin_unlock(&gl->gl_spin);
+}
+/**
+ * glock_wait_internal - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * A LM_FLAG_TRY or LM_FLAG_TRY_1CB holder that is still queued, is not the
+ * glock's current request and has not been granted is dequeued at once and
+ * fails with GLR_TRYFAILED.  Otherwise this waits for gh_wait to complete
+ * (an LM_FLAG_PRIORITY holder runs do_cancels() first).  A holder marked
+ * HIF_FIRST calls go_lock, if the glock type has one, and then clears
+ * GLF_LOCK and reruns the queue.
+ *
+ * Returns: 0 on success, -EIO if the holder was aborted, GLR_TRYFAILED for
+ *          a failed try request, or the error left in gh->gh_error
+ */
+static int glock_wait_internal(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        if (test_bit(HIF_ABORTED, &gh->gh_iflags))
+                return -EIO;
+        if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+                spin_lock(&gl->gl_spin);
+                if (gl->gl_req_gh != gh &&
+                    !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+                    !list_empty(&gh->gh_list)) {
+                        list_del_init(&gh->gh_list);
+                        gh->gh_error = GLR_TRYFAILED;
+                        run_queue(gl);
+                        spin_unlock(&gl->gl_spin);
+                        return gh->gh_error;
+                }
+                spin_unlock(&gl->gl_spin);
+        }
+        if (gh->gh_flags & LM_FLAG_PRIORITY)
+                do_cancels(gh);
+        wait_for_completion(&gh->gh_wait);
+        if (gh->gh_error)
+                return gh->gh_error;
+        gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
+        gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
+                                                   gh->gh_flags));
+        if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
+                gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+                if (glops->go_lock) {
+                        gh->gh_error = glops->go_lock(gh);
+                        if (gh->gh_error) {
+                                spin_lock(&gl->gl_spin);
+                                list_del_init(&gh->gh_list);
+                                spin_unlock(&gl->gl_spin);
+                        }
+                }
+                spin_lock(&gl->gl_spin);
+                gl->gl_req_gh = NULL;
+                gl->gl_req_bh = NULL;
+                clear_bit(GLF_LOCK, &gl->gl_flags);
+                run_queue(gl);
+                spin_unlock(&gl->gl_spin);
+        }
+        return gh->gh_error;
+}
+/**
+ * find_holder_by_owner - look for a holder belonging to a given task
+ * @head: the list of holders to search (linked through gh_list)
+ * @owner: the task to look for
+ *
+ * Returns: the first holder on @head owned by @owner, or NULL if none
+ */
+static inline struct gfs2_holder *
+find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+{
+        struct gfs2_holder *pos;
+        list_for_each_entry(pos, head, gh_list) {
+                if (pos->gh_owner != owner)
+                        continue;
+                return pos;
+        }
+        return NULL;
+}
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ * BUGs if @gh has no owner, or if the owning task already has a holder on
+ * the glock's gl_holders or gl_waiters3 list (after printing where both
+ * holders came from).  LM_FLAG_PRIORITY holders go to the head of
+ * gl_waiters3, all others to the tail.  gfs2_glock_nq() calls this with
+ * gl_spin held.
+ */
+static void add_to_queue(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_holder *existing;
+        BUG_ON(!gh->gh_owner);
+        existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+        if (existing) {
+                print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+                printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+                printk(KERN_INFO "lock type : %d lock state : %d\n",
+                                existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
+                print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+                printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+                printk(KERN_INFO "lock type : %d lock state : %d\n",
+                                gl->gl_name.ln_type, gl->gl_state);
+                BUG();
+        }
+        existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+        if (existing) {
+                print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+                print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+                BUG();
+        }
+        if (gh->gh_flags & LM_FLAG_PRIORITY)
+                list_add(&gh->gh_list, &gl->gl_waiters3);
+        else
+                list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+}
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), the request is only queued and this does
+ * not wait for it; the only error returned in that case is -EIO, when the
+ * filesystem has been shut down (the holder is then marked HIF_ABORTED).
+ *
+ * A synchronous request that comes back as GLR_CANCELED is queued again
+ * after a 100ms sleep.  GLF_PREFETCH is cleared on the glock before
+ * returning, and the glock is dumped if a GL_DUMP request ends in
+ * GLR_TRYFAILED.
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        int error = 0;
+restart:
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+                set_bit(HIF_ABORTED, &gh->gh_iflags);
+                return -EIO;
+        }
+        set_bit(HIF_PROMOTE, &gh->gh_iflags);
+        spin_lock(&gl->gl_spin);
+        add_to_queue(gh);
+        run_queue(gl);
+        spin_unlock(&gl->gl_spin);
+        if (!(gh->gh_flags & GL_ASYNC)) {
+                error = glock_wait_internal(gh);
+                if (error == GLR_CANCELED) {
+                        msleep(100);
+                        goto restart;
+                }
+        }
+        clear_bit(GLF_PREFETCH, &gl->gl_flags);
+        if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
+                dump_glock(gl);
+        return error;
+}
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * The request counts as ready once the holder has been granted
+ * (HIF_HOLDER) or has been taken off its queue.  A holder that was taken
+ * off its queue with GLR_CANCELED is queued again here after a 100ms
+ * sleep; that case reports 1 only if the requeue itself fails.
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        int ready = 0;
+        spin_lock(&gl->gl_spin);
+        if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                ready = 1;
+        else if (list_empty(&gh->gh_list)) {
+                if (gh->gh_error == GLR_CANCELED) {
+                        spin_unlock(&gl->gl_spin);
+                        msleep(100);
+                        if (gfs2_glock_nq(gh))
+                                return 1;
+                        return 0;
+                } else
+                        ready = 1;
+        }
+        spin_unlock(&gl->gl_spin);
+        return ready;
+}
+/**
+ * gfs2_glock_wait - wait for a lock acquisition that was queued with GL_ASYNC
+ * @gh: the holder structure
+ *
+ * If the request turns out to have been cancelled (GLR_CANCELED), GL_ASYNC
+ * is cleared from the holder and the request is queued again,
+ * synchronously, after a 100ms sleep.
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+        int error;
+        error = glock_wait_internal(gh);
+        if (error == GLR_CANCELED) {
+                msleep(100);
+                gh->gh_flags &= ~GL_ASYNC;
+                error = gfs2_glock_nq(gh);
+        }
+        return error;
+}
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ * A GL_NOCACHE holder first asks for the glock to be demoted to
+ * LM_ST_UNLOCKED.  The holder is then unlinked under the glock mutex; if
+ * that leaves gl_holders empty, go_unlock is called (when the glock type
+ * has one) and gl_stamp is set to the current jiffies.  Finally GLF_LOCK
+ * is cleared and the queue is run.
+ */
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        if (gh->gh_flags & GL_NOCACHE)
+                handle_callback(gl, LM_ST_UNLOCKED);
+        gfs2_glmutex_lock(gl);
+        spin_lock(&gl->gl_spin);
+        list_del_init(&gh->gh_list);
+        if (list_empty(&gl->gl_holders)) {
+                spin_unlock(&gl->gl_spin);
+                if (glops->go_unlock)
+                        glops->go_unlock(gh);
+                gl->gl_stamp = jiffies;
+                spin_lock(&gl->gl_spin);
+        }
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        run_queue(gl);
+        spin_unlock(&gl->gl_spin);
+}
+/**
+ * gfs2_glock_prefetch - Try to prefetch a glock
+ * @gl: the glock
+ * @state: the state to prefetch in
+ * @flags: flags passed to go_xmote_th()
+ *
+ * Does nothing if the glock is locked (GLF_LOCK), has any holders or
+ * waiters, or is already in a state that relaxed_state_ok() accepts for
+ * @state and @flags.  Otherwise GLF_PREFETCH and GLF_LOCK are set and the
+ * state change is started through go_xmote_th().
+ */
+static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
+                                int flags)
+{
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        spin_lock(&gl->gl_spin);
+        if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
+            !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
+            !list_empty(&gl->gl_waiters3) ||
+            relaxed_state_ok(gl->gl_state, state, flags)) {
+                spin_unlock(&gl->gl_spin);
+                return;
+        }
+        set_bit(GLF_PREFETCH, &gl->gl_flags);
+        set_bit(GLF_LOCK, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        glops->go_xmote_th(gl, state, flags);
+}
+/**
+ * greedy_work - delayed work function queued by gfs2_glock_be_greedy()
+ * @data: the struct greedy allocated by gfs2_glock_be_greedy()
+ *
+ * Clears GLF_SKIP_WAITERS2 and calls go_greedy if the glock type has one.
+ * With nothing on gl_waiters2 the greedy state is dropped and @data is
+ * freed; otherwise the embedded holder is appended to gl_waiters2 and the
+ * queue is run.
+ */
+static void greedy_work(void *data)
+{
+        struct greedy *work = data;
+        struct gfs2_holder *gh = &work->gr_gh;
+        struct gfs2_glock *gl = gh->gh_gl;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+        if (glops->go_greedy)
+                glops->go_greedy(gl);
+        spin_lock(&gl->gl_spin);
+        if (!list_empty(&gl->gl_waiters2)) {
+                /* Somebody is waiting: queue our holder behind them */
+                gfs2_glock_hold(gl);
+                list_add_tail(&gh->gh_list, &gl->gl_waiters2);
+                run_queue(gl);
+                spin_unlock(&gl->gl_spin);
+                gfs2_glock_put(gl);
+                return;
+        }
+        clear_bit(GLF_GREEDY, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        gfs2_holder_uninit(gh);
+        kfree(work);
+}
+/**
+ * gfs2_glock_be_greedy - arrange for a deferred go_greedy call on a glock
+ * @gl: the glock
+ * @time: the delay handed to schedule_delayed_work()
+ *
+ * Nothing is scheduled when @time is 0, when sd_args.ar_localcaching is
+ * set, when GLF_GREEDY is already set on @gl, or when the allocation of
+ * the work item fails.  Otherwise GLF_SKIP_WAITERS2 is set on the glock
+ * and greedy_work() is scheduled.
+ *
+ * Returns: 0 if go_greedy will be called, 1 otherwise
+ */
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
+{
+        struct greedy *gr;
+        struct gfs2_holder *gh;
+        if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
+            test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
+                return 1;
+        gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
+        if (!gr) {
+                clear_bit(GLF_GREEDY, &gl->gl_flags);
+                return 1;
+        }
+        gh = &gr->gr_gh;
+        gfs2_holder_init(gl, 0, 0, gh);
+        set_bit(HIF_GREEDY, &gh->gh_iflags);
+        INIT_WORK(&gr->gr_work, greedy_work, gr);
+        set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+        schedule_delayed_work(&gr->gr_work, time);
+        return 0;
+}
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
+ * @gh: the holder structure
+ *
+ * Shorthand for gfs2_glock_dq() followed by gfs2_holder_uninit().
+ */
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+        gfs2_glock_dq(gh);
+        gfs2_holder_uninit(gh);
+}
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Gets the glock with gfs2_glock_get() (passing CREATE), initializes and
+ * enqueues @gh on it with gfs2_glock_nq_init(), and then puts the glock
+ * reference obtained from gfs2_glock_get().
+ *
+ * Returns: errno
+ */
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+                      const struct gfs2_glock_operations *glops,
+                      unsigned int state, int flags, struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl;
+        int error;
+        error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+        if (!error) {
+                error = gfs2_glock_nq_init(gl, state, flags, gh);
+                gfs2_glock_put(gl);
+        }
+        return error;
+}
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: pointer to the first struct gfs2_holder pointer
+ * @arg_b: pointer to the second struct gfs2_holder pointer
+ *
+ * Used as the sort() callback in nq_m_sync().  Holders are ordered by lock
+ * number and then by lock type, so that two different glocks never
+ * compare equal.  Holders that still tie are ordered exclusive before
+ * shared, then GL_LOCAL_EXCL before the rest.
+ *
+ * Every test is paired with its mirror image so that swapping the
+ * arguments always negates the result; sort() needs that to produce a
+ * well-defined order.
+ *
+ * Returns: -1, 0 or 1 if a sorts before, the same as, or after b
+ */
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+        const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+        const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+        const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+        const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+        int local_a, local_b;
+        if (a->ln_number > b->ln_number)
+                return 1;
+        if (a->ln_number < b->ln_number)
+                return -1;
+        /* Same number: the lock type tells different glocks apart */
+        if (a->ln_type > b->ln_type)
+                return 1;
+        if (a->ln_type < b->ln_type)
+                return -1;
+        if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+                return 1;
+        if (gh_a->gh_state == LM_ST_EXCLUSIVE && gh_b->gh_state == LM_ST_SHARED)
+                return -1;
+        local_a = !!(gh_a->gh_flags & GL_LOCAL_EXCL);
+        local_b = !!(gh_b->gh_flags & GL_LOCAL_EXCL);
+        if (!local_a && local_b)
+                return 1;
+        if (local_a && !local_b)
+                return -1;
+        return 0;
+}
+/**
+ * nq_m_sync - synchronously acquire more than one glock in deadlock free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ * @p: scratch array with room for @num_gh holder pointers; it is filled
+ *     with pointers to the entries of @ghs and sorted with glock_compare()
+ *
+ * LM_FLAG_TRY and GL_ASYNC are cleared from each holder before it is
+ * enqueued.  When one acquisition fails, the glocks acquired before it
+ * are dequeued again.
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+                     struct gfs2_holder **p)
+{
+        unsigned int x;
+        int error = 0;
+        for (x = 0; x < num_gh; x++)
+                p[x] = &ghs[x];
+        sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+        for (x = 0; x < num_gh; x++) {
+                p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+                error = gfs2_glock_nq(p[x]);
+                if (error) {
+                        while (x--)
+                                gfs2_glock_dq(p[x]);
+                        break;
+                }
+        }
+        return error;
+}
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * A single holder is simply enqueued synchronously.  Otherwise every
+ * holder is first queued with LM_FLAG_TRY | GL_ASYNC and then waited for.
+ * If any of them fails, the ones that were granted are dequeued again;
+ * when the only failures were GLR_TRYFAILED or GLR_CANCELED the holders
+ * are reinitialized and taken one at a time, in sorted order, by
+ * nq_m_sync().
+ *
+ * The scratch array e is allocated with pointer-sized elements because it
+ * serves twice: first it records the per-holder results as ints, then it
+ * is handed to nq_m_sync() as that function's pointer array.
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace this code with code that calls gfs2_glock_prefetch()
+ * 2) Forget async stuff and just call nq_m_sync()
+ * 3) Leave it like it is
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+        int *e;
+        unsigned int x;
+        int borked = 0, serious = 0;
+        int error = 0;
+        if (!num_gh)
+                return 0;
+        if (num_gh == 1) {
+                ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+                return gfs2_glock_nq(ghs);
+        }
+        e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+        if (!e)
+                return -ENOMEM;
+        for (x = 0; x < num_gh; x++) {
+                ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
+                error = gfs2_glock_nq(&ghs[x]);
+                if (error) {
+                        borked = 1;
+                        serious = error;
+                        num_gh = x;
+                        break;
+                }
+        }
+        for (x = 0; x < num_gh; x++) {
+                error = e[x] = glock_wait_internal(&ghs[x]);
+                if (error) {
+                        borked = 1;
+                        if (error != GLR_TRYFAILED && error != GLR_CANCELED)
+                                serious = error;
+                }
+        }
+        if (!borked) {
+                kfree(e);
+                return 0;
+        }
+        for (x = 0; x < num_gh; x++)
+                if (!e[x])
+                        gfs2_glock_dq(&ghs[x]);
+        if (serious)
+                error = serious;
+        else {
+                for (x = 0; x < num_gh; x++)
+                        gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
+                                          &ghs[x]);
+                error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
+        }
+        kfree(e);
+        return error;
+}
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Calls gfs2_glock_dq() on each of the first @num_gh entries of @ghs, in
+ * array order.
+ */
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+        unsigned int i = 0;
+        while (i < num_gh) {
+                gfs2_glock_dq(ghs + i);
+                i++;
+        }
+}
+/**
+ * gfs2_glock_dq_uninit_m - release multiple glocks and uninitialize the holders
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Calls gfs2_glock_dq_uninit() on each of the first @num_gh entries of
+ * @ghs, in array order.
+ */
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+        unsigned int i = 0;
+        while (i < num_gh) {
+                gfs2_glock_dq_uninit(ghs + i);
+                i++;
+        }
+}
+/**
+ * gfs2_glock_prefetch_num - prefetch a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ *
+ * Only attempted while sd_reclaim_count is below the gt_reclaim_limit
+ * tunable.  Nothing is returned: a failure to get the glock is silently
+ * ignored.
+ */
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+                             const struct gfs2_glock_operations *glops,
+                             unsigned int state, int flags)
+{
+        struct gfs2_glock *gl;
+        int error;
+        if (atomic_read(&sdp->sd_reclaim_count) <
+            gfs2_tune_get(sdp, gt_reclaim_limit)) {
+                error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+                if (!error) {
+                        gfs2_glock_prefetch(gl, state, flags);
+                        gfs2_glock_put(gl);
+                }
+        }
+}
+/**
+ * gfs2_lvb_hold - attach a LVB from a glock
+ * @gl: The glock in question
+ *
+ * The first hold (gl_lvb_count is zero) obtains the LVB from the lock
+ * module with gfs2_lm_hold_lvb() and takes a reference on the glock;
+ * every successful call increments gl_lvb_count.  Runs under the glock
+ * mutex.
+ *
+ * Returns: 0 on success, or the error from gfs2_lm_hold_lvb()
+ */
+int gfs2_lvb_hold(struct gfs2_glock *gl)
+{
+        int error;
+        gfs2_glmutex_lock(gl);
+        if (!atomic_read(&gl->gl_lvb_count)) {
+                error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
+                if (error) {
+                        gfs2_glmutex_unlock(gl);
+                        return error;
+                }
+                gfs2_glock_hold(gl);
+        }
+        atomic_inc(&gl->gl_lvb_count);
+        gfs2_glmutex_unlock(gl);
+        return 0;
+}
+/**
+ * gfs2_lvb_unhold - detach a LVB from a glock
+ * @gl: The glock in question
+ *
+ * Holds an extra glock reference for the duration of the call.  When
+ * gl_lvb_count drops to zero the LVB is given back through
+ * gfs2_lm_unhold_lvb(), gl_lvb is cleared and one further glock
+ * reference is put.  Runs under the glock mutex.
+ */
+void gfs2_lvb_unhold(struct gfs2_glock *gl)
+{
+        gfs2_glock_hold(gl);
+        gfs2_glmutex_lock(gl);
+        gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
+        if (atomic_dec_and_test(&gl->gl_lvb_count)) {
+                gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+                gl->gl_lvb = NULL;
+                gfs2_glock_put(gl);
+        }
+        gfs2_glmutex_unlock(gl);
+        gfs2_glock_put(gl);
+}
+/**
+ * blocking_cb - act on a blocking callback for one glock
+ * @sdp: the filesystem
+ * @name: the lock name, looked up with gfs2_glock_find()
+ * @state: the state handed to go_callback and handle_callback()
+ *
+ * A callback for a lock name that gfs2_glock_find() does not return a
+ * glock for is ignored.
+ */
+static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                        unsigned int state)
+{
+        struct gfs2_glock *gl = gfs2_glock_find(sdp, name);
+        const struct gfs2_glock_operations *glops;
+        if (!gl)
+                return;
+        glops = gl->gl_ops;
+        if (glops->go_callback)
+                glops->go_callback(gl, state);
+        handle_callback(gl, state);
+        spin_lock(&gl->gl_spin);
+        run_queue(gl);
+        spin_unlock(&gl->gl_spin);
+        gfs2_glock_put(gl);
+}
+/**
+ * gfs2_glock_cb - Callback used by locking module
+ * @cb_data: the struct gfs2_sbd of the filesystem the callback is for
+ * @type: Type of callback
+ * @data: Type dependent data pointer
+ *
+ * Called by the locking module when it wants to tell us something.
+ * Either we need to drop a lock, one of our ASYNC requests completed, or
+ * a journal from another client needs to be recovered.
+ *
+ * LM_CB_NEED_E, LM_CB_NEED_D and LM_CB_NEED_S are passed to blocking_cb()
+ * with LM_ST_UNLOCKED, LM_ST_DEFERRED and LM_ST_SHARED respectively.
+ * LM_CB_DROPLOCKS clears the glock hash without waiting and scans the
+ * quota data.  An unknown @type only triggers a warning.
+ */
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+{
+        struct gfs2_sbd *sdp = cb_data;
+        switch (type) {
+        case LM_CB_NEED_E:
+                blocking_cb(sdp, data, LM_ST_UNLOCKED);
+                return;
+        case LM_CB_NEED_D:
+                blocking_cb(sdp, data, LM_ST_DEFERRED);
+                return;
+        case LM_CB_NEED_S:
+                blocking_cb(sdp, data, LM_ST_SHARED);
+                return;
+        case LM_CB_ASYNC: {
+                struct lm_async_cb *async = data;
+                struct gfs2_glock *gl;
+                gl = gfs2_glock_find(sdp, &async->lc_name);
+                if (gfs2_assert_warn(sdp, gl))
+                        return;
+                if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+                        gl->gl_req_bh(gl, async->lc_ret);
+                gfs2_glock_put(gl);
+                return;
+        }
+        case LM_CB_NEED_RECOVERY:
+                gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
+                if (sdp->sd_recoverd_process)
+                        wake_up_process(sdp->sd_recoverd_process);
+                return;
+        case LM_CB_DROPLOCKS:
+                gfs2_gl_hash_clear(sdp, NO_WAIT);
+                gfs2_quota_scan(sdp);
+                return;
+        default:
+                gfs2_assert_warn(sdp, 0);
+                return;
+        }
+}
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * A GLF_STICKY glock is never demoted.  A GLF_PREFETCH glock may be
+ * demoted once gt_prefetch_secs seconds have passed since gl_stamp.
+ * Otherwise the decision is left to go_demote_ok if the glock type has
+ * one, and defaults to yes.
+ *
+ * Returns: 1 if it's ok
+ */
+static int demote_ok(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        if (test_bit(GLF_STICKY, &gl->gl_flags))
+                return 0;
+        if (test_bit(GLF_PREFETCH, &gl->gl_flags))
+                return time_after_eq(jiffies, gl->gl_stamp +
+                                     gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
+        if (glops->go_demote_ok)
+                return glops->go_demote_ok(gl);
+        return 1;
+}
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ * Unless the glock is already queued (gl_reclaim is not empty), a glock
+ * reference is taken, the glock is added to sd_reclaim_list and
+ * sd_reclaim_count is incremented.  sd_reclaim_wq is woken up either way.
+ */
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        spin_lock(&sdp->sd_reclaim_lock);
+        if (list_empty(&gl->gl_reclaim)) {
+                gfs2_glock_hold(gl);
+                list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+                atomic_inc(&sdp->sd_reclaim_count);
+        }
+        spin_unlock(&sdp->sd_reclaim_lock);
+        wake_up(&sdp->sd_reclaim_wq);
+}
+/**
+ * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
+ * @sdp: the filesystem
+ *
+ * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
+ * different glock and we notice that there are a lot of glocks in the
+ * reclaim list.
+ *
+ * Takes the first glock off sd_reclaim_list (returning at once if the
+ * list is empty).  If the glock mutex can be taken without waiting and
+ * the glock has no holders, is not already unlocked and passes
+ * demote_ok(), a demote to LM_ST_UNLOCKED is requested.  A glock
+ * reference is put at the end.
+ */
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+{
+        struct gfs2_glock *gl;
+        spin_lock(&sdp->sd_reclaim_lock);
+        if (list_empty(&sdp->sd_reclaim_list)) {
+                spin_unlock(&sdp->sd_reclaim_lock);
+                return;
+        }
+        gl = list_entry(sdp->sd_reclaim_list.next,
+                        struct gfs2_glock, gl_reclaim);
+        list_del_init(&gl->gl_reclaim);
+        spin_unlock(&sdp->sd_reclaim_lock);
+        atomic_dec(&sdp->sd_reclaim_count);
+        atomic_inc(&sdp->sd_reclaimed);
+        if (gfs2_glmutex_trylock(gl)) {
+                if (queue_empty(gl, &gl->gl_holders) &&
+                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+                        handle_callback(gl, LM_ST_UNLOCKED);
+                gfs2_glmutex_unlock(gl);
+        }
+        gfs2_glock_put(gl);
+}
+/**
+ * examine_bucket - Call a function for glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem
+ * @hash: the index of the bucket in gl_hash_table
+ *
+ * Only glocks belonging to @sdp are examined.  The bucket's read lock is
+ * dropped around each call to @examiner; a reference is held on the glock
+ * being examined and is not put until the walk has moved on to the next
+ * examined glock (or has finished).
+ *
+ * Returns: 1 if the bucket has entries
+ */
+static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+                          unsigned int hash)
+{
+        struct gfs2_glock *gl, *prev = NULL;
+        int has_entries = 0;
+        struct hlist_head *head = &gl_hash_table[hash].hb_list;
+        read_lock(gl_lock_addr(hash));
+        /* Can't use hlist_for_each_entry - don't want prefetch here */
+        if (hlist_empty(head))
+                goto out;
+        gl = list_entry(head->first, struct gfs2_glock, gl_list);
+        while(1) {
+                if (gl->gl_sbd == sdp) {
+                        gfs2_glock_hold(gl);
+                        read_unlock(gl_lock_addr(hash));
+                        if (prev)
+                                gfs2_glock_put(prev);
+                        prev = gl;
+                        examiner(gl);
+                        has_entries = 1;
+                        read_lock(gl_lock_addr(hash));
+                }
+                if (gl->gl_list.next == NULL)
+                        break;
+                gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
+        }
+out:
+        read_unlock(gl_lock_addr(hash));
+        if (prev)
+                gfs2_glock_put(prev);
+        return has_entries;
+}
+/**
+ * scan_glock - look at a glock and see if we can reclaim it
+ * @gl: the glock to look at
+ *
+ * Inode glocks are skipped.  Any other glock is scheduled for reclaim if
+ * its mutex can be taken without waiting and it has no holders, is not
+ * already unlocked and passes demote_ok().
+ */
+static void scan_glock(struct gfs2_glock *gl)
+{
+        int reclaim;
+        if (gl->gl_ops == &gfs2_inode_glops)
+                return;
+        if (!gfs2_glmutex_trylock(gl))
+                return;
+        reclaim = queue_empty(gl, &gl->gl_holders) &&
+                  gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl);
+        gfs2_glmutex_unlock(gl);
+        /* Queue for reclaim only after the glock mutex has been released */
+        if (reclaim)
+                gfs2_glock_schedule_for_reclaim(gl);
+}
+/**
+ * gfs2_scand_internal - Look for glocks and inodes to toss from memory
+ * @sdp: the filesystem
+ *
+ * Runs scan_glock() over every bucket of the glock hash table for the
+ * glocks that belong to @sdp.
+ */
+void gfs2_scand_internal(struct gfs2_sbd *sdp)
+{
+        unsigned int x;
+        for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+                examine_bucket(scan_glock, sdp, x);
+}
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ * Takes the glock off the reclaim list if it is on it, putting one glock
+ * reference (which is asserted not to be the last one).  Then, if the
+ * glock mutex can be taken without waiting and the glock has no holders
+ * and is not already unlocked, a demote to LM_ST_UNLOCKED is requested.
+ */
+static void clear_glock(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        int released;
+        spin_lock(&sdp->sd_reclaim_lock);
+        if (!list_empty(&gl->gl_reclaim)) {
+                list_del_init(&gl->gl_reclaim);
+                atomic_dec(&sdp->sd_reclaim_count);
+                spin_unlock(&sdp->sd_reclaim_lock);
+                released = gfs2_glock_put(gl);
+                gfs2_assert(sdp, !released);
+        } else {
+                spin_unlock(&sdp->sd_reclaim_lock);
+        }
+        if (gfs2_glmutex_trylock(gl)) {
+                if (queue_empty(gl, &gl->gl_holders) &&
+                    gl->gl_state != LM_ST_UNLOCKED)
+                        handle_callback(gl, LM_ST_UNLOCKED);
+                gfs2_glmutex_unlock(gl);
+        }
+}
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem, or when inter-node lock manager
+ * requests DROPLOCKS because it is running out of capacity.
+ *
+ * Each pass runs clear_glock() over every hash bucket.  Without @wait a
+ * single pass is made.  With @wait, passes are repeated until no bucket
+ * holds a glock of @sdp any more, calling invalidate_inodes() and sleeping
+ * 10ms in between; the lock state is dumped each time gt_stall_secs
+ * seconds go by without the table emptying.
+ */
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+{
+        unsigned long t;
+        unsigned int x;
+        int cont;
+        t = jiffies;
+        for (;;) {
+                cont = 0;
+                for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+                        if (examine_bucket(clear_glock, sdp, x))
+                                cont = 1;
+                }
+                if (!wait || !cont)
+                        break;
+                if (time_after_eq(jiffies,
+                                  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
+                        fs_warn(sdp, "Unmount seems to be stalled. "
+                                     "Dumping lock state...\n");
+                        gfs2_dump_lockstate(sdp);
+                        t = jiffies;
+                }
+                invalidate_inodes(sdp->sd_vfs);
+                msleep(10);
+        }
+}
+/*
+ *  Diagnostic routines to help debug distributed deadlock
+ */
+/**
+ * dump_holder - print information about a glock holder
+ * @str: a string naming the type of holder
+ * @gh: the glock holder
+ *
+ * Prints the owner's pid (-1 if the holder has no owner), the requested
+ * state, the numbers of the bits set in gh_flags and gh_iflags, the
+ * holder's error and the place where the holder was initialized.
+ *
+ * Returns: 0 (everything goes to printk(), so there is no buffer to run
+ *          out of)
+ */
+static int dump_holder(char *str, struct gfs2_holder *gh)
+{
+        unsigned int x;
+        printk(KERN_INFO "  %s\n", str);
+        printk(KERN_INFO "    owner = %ld\n",
+                   (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
+        printk(KERN_INFO "    gh_state = %u\n", gh->gh_state);
+        printk(KERN_INFO "    gh_flags =");
+        /* Use 1U: shifting a signed 1 into bit 31 is undefined behaviour */
+        for (x = 0; x < 32; x++)
+                if (gh->gh_flags & (1U << x))
+                        printk(" %u", x);
+        printk(" \n");
+        printk(KERN_INFO "    error = %d\n", gh->gh_error);
+        printk(KERN_INFO "    gh_iflags =");
+        for (x = 0; x < 32; x++)
+                if (test_bit(x, &gh->gh_iflags))
+                        printk(" %u", x);
+        printk(" \n");
+        print_symbol(KERN_INFO "    initialized at: %s\n", gh->gh_ip);
+        return 0;
+}
+/**
+ * dump_inode - print information about an inode
+ * @ip: the inode
+ *
+ * Prints the inode's formal number and address, its type and the numbers
+ * of the bits set in i_flags.
+ *
+ * Returns: 0
+ */
+static int dump_inode(struct gfs2_inode *ip)
+{
+        unsigned int bit;
+        printk(KERN_INFO "  Inode:\n");
+        printk(KERN_INFO "    num = %llu %llu\n",
+                    (unsigned long long)ip->i_num.no_formal_ino,
+                    (unsigned long long)ip->i_num.no_addr);
+        printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_di.di_mode));
+        printk(KERN_INFO "    i_flags =");
+        for (bit = 0; bit < 32; bit++) {
+                if (!test_bit(bit, &ip->i_flags))
+                        continue;
+                printk(" %u", bit);
+        }
+        printk(" \n");
+        return 0;
+}
+/**
+ * dump_glock - print information about a glock
+ * @gl: the glock
+ *
+ * Prints the glock's fields, its current request and every holder and
+ * waiter with printk(), all while holding gl_spin.  For an inode glock
+ * with an object attached, the inode is dumped too, provided the glock is
+ * neither locked (GLF_LOCK) nor held; otherwise it is reported as busy.
+ *
+ * Returns: 0 on success, or the error returned by dump_holder() or
+ *          dump_inode()
+ */
+static int dump_glock(struct gfs2_glock *gl)
+{
+        struct gfs2_holder *gh;
+        unsigned int x;
+        int error = 0;
+        spin_lock(&gl->gl_spin);
+        printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+               (unsigned long long)gl->gl_name.ln_number);
+        printk(KERN_INFO "  gl_flags =");
+        for (x = 0; x < 32; x++) {
+                if (test_bit(x, &gl->gl_flags))
+                        printk(" %u", x);
+        }
+        printk(" \n");
+        printk(KERN_INFO "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
+        printk(KERN_INFO "  gl_state = %u\n", gl->gl_state);
+        /* Nothing here guarantees gl_owner is set: don't oops on NULL */
+        printk(KERN_INFO "  gl_owner = %s\n",
+               (gl->gl_owner) ? gl->gl_owner->comm : "(none)");
+        print_symbol(KERN_INFO "  gl_ip = %s\n", gl->gl_ip);
+        printk(KERN_INFO "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+        printk(KERN_INFO "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+        printk(KERN_INFO "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+        printk(KERN_INFO "  object = %s\n", (gl->gl_object) ? "yes" : "no");
+        printk(KERN_INFO "  le = %s\n",
+                   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
+        printk(KERN_INFO "  reclaim = %s\n",
+                    (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+        if (gl->gl_aspace)
+                printk(KERN_INFO "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+                       gl->gl_aspace->i_mapping->nrpages);
+        else
+                printk(KERN_INFO "  aspace = no\n");
+        printk(KERN_INFO "  ail = %d\n", atomic_read(&gl->gl_ail_count));
+        if (gl->gl_req_gh) {
+                error = dump_holder("Request", gl->gl_req_gh);
+                if (error)
+                        goto out;
+        }
+        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                error = dump_holder("Holder", gh);
+                if (error)
+                        goto out;
+        }
+        list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+                error = dump_holder("Waiter1", gh);
+                if (error)
+                        goto out;
+        }
+        list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+                error = dump_holder("Waiter2", gh);
+                if (error)
+                        goto out;
+        }
+        list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
+                error = dump_holder("Waiter3", gh);
+                if (error)
+                        goto out;
+        }
+        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
+                if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
+                    list_empty(&gl->gl_holders))
+                        error = dump_inode(gl->gl_object);
+                else
+                        printk(KERN_INFO "  Inode: busy\n");
+        }
+out:
+        spin_unlock(&gl->gl_spin);
+        return error;
+}
+/**
+ * gfs2_dump_lockstate - print out the current lockstate
+ * @sdp: the filesystem
+ *
+ * Walks every bucket of the glock hash table under its read lock and
+ * calls dump_glock() on each glock that belongs to @sdp, stopping at the
+ * first error.
+ *
+ * Returns: 0 on success, or the error from dump_glock()
+ */
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
+{
+        struct gfs2_glock *gl;
+        struct hlist_node *h;
+        unsigned int x;
+        int error = 0;
+        for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+                read_lock(gl_lock_addr(x));
+                hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
+                        if (gl->gl_sbd != sdp)
+                                continue;
+                        error = dump_glock(gl);
+                        if (error)
+                                break;
+                }
+                read_unlock(gl_lock_addr(x));
+                if (error)
+                        break;
+        }
+        return error;
+}
+/**
+ * gfs2_glock_init - set up the glock hash table
+ *
+ * Initializes the list head of every bucket in gl_hash_table and, when
+ * GL_HASH_LOCK_SZ is defined, every rwlock in gl_hash_locks.
+ *
+ * Returns: 0
+ */
+int __init gfs2_glock_init(void)
+{
+        unsigned int n;
+        for (n = 0; n < GFS2_GL_HASH_SIZE; n++)
+                INIT_HLIST_HEAD(&gl_hash_table[n].hb_list);
+#ifdef GL_HASH_LOCK_SZ
+        for (n = 0; n < GL_HASH_LOCK_SZ; n++)
+                rwlock_init(&gl_hash_locks[n]);
+#endif
+        return 0;
+}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+#include "incore.h"
+/* Flags for lock requests; used in gfs2_holder gh_flag field.
+   From lm_interface.h:
+#define LM_FLAG_TRY             0x00000001
+#define LM_FLAG_TRY_1CB         0x00000002
+#define LM_FLAG_NOEXP           0x00000004
+#define LM_FLAG_ANY             0x00000008
+#define LM_FLAG_PRIORITY        0x00000010 */
+#define GL_LOCAL_EXCL           0x00000020
+#define GL_ASYNC                0x00000040
+#define GL_EXACT                0x00000080
+#define GL_SKIP                 0x00000100
+#define GL_ATIME                0x00000200
+#define GL_NOCACHE              0x00000400
+#define GL_NOCANCEL             0x00001000
+#define GL_AOP                  0x00004000
+#define GL_DUMP                 0x00008000
+#define GLR_TRYFAILED           13
+#define GLR_CANCELED            14
+/*
+ * Report whether the current task owns one of the holders queued on the
+ * glock's gl_holders list.  The list is walked under gl_spin.
+ * Returns 1 if such a holder is found, 0 otherwise.
+ */
+static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+        struct gfs2_holder *holder;
+        int mine = 0;
+        spin_lock(&gl->gl_spin);
+        list_for_each_entry(holder, &gl->gl_holders, gh_list) {
+                mine = (holder->gh_owner == current);
+                if (mine)
+                        break;
+        }
+        spin_unlock(&gl->gl_spin);
+        return mine;
+}
+/* Non-zero when the glock's gl_state is LM_ST_EXCLUSIVE (read without gl_spin). */
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+        const unsigned int state = gl->gl_state;
+        return state == LM_ST_EXCLUSIVE;
+}
+/* Non-zero when the glock's gl_state is LM_ST_DEFERRED (read without gl_spin). */
+static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
+{
+        const unsigned int state = gl->gl_state;
+        return state == LM_ST_DEFERRED;
+}
+/* Non-zero when the glock's gl_state is LM_ST_SHARED (read without gl_spin). */
+static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
+{
+        const unsigned int state = gl->gl_state;
+        return state == LM_ST_SHARED;
+}
+/*
+ * Non-zero when, sampled under gl_spin, either the gl_waiters2 or the
+ * gl_waiters3 list of the glock has at least one entry.
+ */
+static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
+{
+        int blocking;
+        spin_lock(&gl->gl_spin);
+        blocking = !(list_empty(&gl->gl_waiters2) &&
+                     list_empty(&gl->gl_waiters3));
+        spin_unlock(&gl->gl_spin);
+        return blocking;
+}
+int gfs2_glock_get(struct gfs2_sbd *sdp,
+                   u64 number, const struct gfs2_glock_operations *glops,
+                   int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+int gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+                      struct gfs2_holder *gh);
+void gfs2_holder_reinit(unsigned int state, unsigned flags,
+                        struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
+void gfs2_glock_drop_th(struct gfs2_glock *gl);
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
+                      u64 number, const struct gfs2_glock_operations *glops,
+                      unsigned int state, int flags, struct gfs2_holder *gh);
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+                             const struct gfs2_glock_operations *glops,
+                             unsigned int state, int flags);
+void gfs2_glock_inode_squish(struct inode *inode);
+/**
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * If the enqueue fails the holder is uninitialized again before returning,
+ * so the caller only owns an initialized @gh when 0 is returned.
+ *
+ * Returns: 0, GLR_*, or errno
+ */
+static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
+                                     unsigned int state, int flags,
+                                     struct gfs2_holder *gh)
+{
+        int ret;
+        gfs2_holder_init(gl, state, flags, gh);
+        ret = gfs2_glock_nq(gh);
+        if (!ret)
+                return 0;
+        gfs2_holder_uninit(gh);
+        return ret;
+}
+/*  Lock Value Block functions  */
+int gfs2_lvb_hold(struct gfs2_glock *gl);
+void gfs2_lvb_unhold(struct gfs2_glock *gl);
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+void gfs2_scand_internal(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+int __init gfs2_glock_init(void);
+#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "util.h"
+#include "trans.h"
+/**
+ * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
+ * @gl: the glock
+ *
+ * None of the buffers should be dirty, locked, or pinned.
+ *
+ * Every entry on @gl's gl_ail_list is unlinked and a revoke for its block
+ * number is added to a transaction opened here; the log is flushed once
+ * the list is empty.
+ */
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        unsigned int blocks;
+        struct list_head *head = &gl->gl_ail_list;
+        struct gfs2_bufdata *bd;
+        struct buffer_head *bh;
+        u64 blkno;
+        int error;
+        blocks = atomic_read(&gl->gl_ail_count);
+        /* Nothing counted on the AIL for this glock: no transaction needed. */
+        if (!blocks)
+                return;
+        /* NOTE(review): the third argument looks like the revoke
+           reservation, one per AIL entry counted above -- confirm against
+           gfs2_trans_begin(). */
+        error = gfs2_trans_begin(sdp, 0, blocks);
+        if (gfs2_assert_withdraw(sdp, !error))
+                return;
+        gfs2_log_lock(sdp);
+        while (!list_empty(head)) {
+                bd = list_entry(head->next, struct gfs2_bufdata,
+                                bd_ail_gl_list);
+                bh = bd->bd_bh;
+                /* Save the block number before the buffer is released. */
+                blkno = bh->b_blocknr;
+                gfs2_assert_withdraw(sdp, !buffer_busy(bh));
+                bd->bd_ail = NULL;
+                list_del(&bd->bd_ail_st_list);
+                list_del(&bd->bd_ail_gl_list);
+                atomic_dec(&gl->gl_ail_count);
+                brelse(bh);
+                /* The log lock is dropped around gfs2_trans_add_revoke()
+                   and retaken before the list is examined again. */
+                gfs2_log_unlock(sdp);
+                gfs2_trans_add_revoke(sdp, blkno);
+                gfs2_log_lock(sdp);
+        }
+        /* The list and the counter must reach empty/zero together. */
+        gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+        gfs2_log_unlock(sdp);
+        gfs2_trans_end(sdp);
+        gfs2_log_flush(sdp, NULL);
+}
+/**
+ * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
+ * @gl: the glock
+ *
+ * Does nothing unless @gl has an attached inode (gl_object) that is a
+ * regular file with GIF_PAGED set.  Otherwise the shared mappings of the
+ * inode are unmapped; if GIF_SW_PAGED was set the glock is marked
+ * GLF_DIRTY, and GIF_SW_PAGED is cleared.
+ */
+static void gfs2_pte_inval(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = gl->gl_object;
+        struct inode *inode;
+        if (!ip || !S_ISREG(ip->i_di.di_mode))
+                return;
+        if (!test_bit(GIF_PAGED, &ip->i_flags))
+                return;
+        /* Form the member address only once ip is known to be non-NULL. */
+        inode = &ip->i_inode;
+        unmap_shared_mapping_range(inode->i_mapping, 0, 0);
+        if (test_bit(GIF_SW_PAGED, &ip->i_flags))
+                set_bit(GLF_DIRTY, &gl->gl_flags);
+        clear_bit(GIF_SW_PAGED, &ip->i_flags);
+}
+/**
+ * gfs2_page_inval - Invalidate all pages associated with a glock
+ * @gl: the glock
+ *
+ * Does nothing unless @gl has an attached inode (gl_object) that is a
+ * regular file.  Otherwise every page of the inode's mapping is truncated,
+ * the mapping is asserted to be empty, and GIF_PAGED is cleared.
+ */
+static void gfs2_page_inval(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = gl->gl_object;
+        struct inode *inode;
+        if (!ip || !S_ISREG(ip->i_di.di_mode))
+                return;
+        /* Form the member address only once ip is known to be non-NULL. */
+        inode = &ip->i_inode;
+        truncate_inode_pages(inode->i_mapping, 0);
+        gfs2_assert_withdraw(GFS2_SB(inode), !inode->i_mapping->nrpages);
+        clear_bit(GIF_PAGED, &ip->i_flags);
+}
+/**
+ * gfs2_page_wait - Wait for writeback of data
+ * @gl: the glock
+ *
+ * Syncs data (not metadata) for a regular file.
+ * No-op for all other types, and when no inode is attached to @gl
+ * (gl_object is NULL), matching the checks in gfs2_pte_inval() and
+ * gfs2_page_inval().
+ */
+static void gfs2_page_wait(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = gl->gl_object;
+        struct address_space *mapping;
+        int error;
+        if (!ip || !S_ISREG(ip->i_di.di_mode))
+                return;
+        mapping = ip->i_inode.i_mapping;
+        error = filemap_fdatawait(mapping);
+        /* Put back any errors cleared by filemap_fdatawait()
+           so they can be caught by someone who can pass them
+           up to user space. */
+        if (error == -ENOSPC)
+                set_bit(AS_ENOSPC, &mapping->flags);
+        else if (error)
+                set_bit(AS_EIO, &mapping->flags);
+}
+/**
+ * gfs2_page_writeback - Start writeback of data
+ * @gl: the glock
+ *
+ * Calls filemap_fdatawrite() on the mapping of the regular file attached
+ * to @gl; its return value is not examined here.  No-op for all other
+ * types, and when no inode is attached to @gl (gl_object is NULL),
+ * matching the checks in gfs2_pte_inval() and gfs2_page_inval().
+ */
+static void gfs2_page_writeback(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = gl->gl_object;
+        if (!ip || !S_ISREG(ip->i_di.di_mode))
+                return;
+        filemap_fdatawrite(ip->i_inode.i_mapping);
+}
+/**
+ * meta_go_sync - sync out the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_*
+ *
+ * Called when demoting or unlocking an EX glock.  We must flush
+ * to disk all dirty buffers/pages relating to this glock, and must
+ * not return to the caller to demote/unlock the glock until I/O is
+ * complete.
+ *
+ * Nothing is done unless DIO_METADATA is requested and GLF_DIRTY was set
+ * (the bit is cleared as it is tested).  With DIO_RELEASE the glock's AIL
+ * entries are emptied as well.
+ */
+static void meta_go_sync(struct gfs2_glock *gl, int flags)
+{
+        const int want_meta = flags & DIO_METADATA;
+        if (!want_meta)
+                return;
+        if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+                return;
+        gfs2_log_flush(gl->gl_sbd, gl);
+        gfs2_meta_sync(gl);
+        if (flags & DIO_RELEASE)
+                gfs2_ail_empty_gl(gl);
+}
+/**
+ * meta_go_inval - invalidate the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_* flags; only DIO_METADATA is examined
+ *
+ * When DIO_METADATA is set, the glock's metadata is invalidated and its
+ * version number (gl_vn) is incremented.
+ */
+static void meta_go_inval(struct gfs2_glock *gl, int flags)
+{
+        if (flags & DIO_METADATA) {
+                gfs2_meta_inval(gl);
+                gl->gl_vn++;
+        }
+}
+/**
+ * inode_go_xmote_th - promote/demote a glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags: passed through to gfs2_glock_xmote_th()
+ *
+ * PTEs are invalidated first unless the glock is currently unlocked.
+ */
+static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+                              int flags)
+{
+        const int not_unlocked = (gl->gl_state != LM_ST_UNLOCKED);
+        if (not_unlocked)
+                gfs2_pte_inval(gl);
+        gfs2_glock_xmote_th(gl, state, flags);
+}
+/**
+ * inode_go_xmote_bh - After promoting/demoting a glock
+ * @gl: the glock
+ *
+ * Unless the glock ended up unlocked, or the requesting holder asked for
+ * GL_SKIP, the metadata block numbered by the glock's lock name is read
+ * and released again straight away.  A read error is ignored here.
+ */
+static void inode_go_xmote_bh(struct gfs2_glock *gl)
+{
+        struct gfs2_holder *req = gl->gl_req_gh;
+        struct buffer_head *bh;
+        if (gl->gl_state == LM_ST_UNLOCKED)
+                return;
+        if (req && (req->gh_flags & GL_SKIP))
+                return;
+        if (gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh) == 0)
+                brelse(bh);
+}
+/**
+ * inode_go_drop_th - unlock a glock
+ * @gl: the glock
+ *
+ * Invoked from rq_demote().
+ * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
+ * is being purged from our node's glock cache; we're dropping lock.
+ *
+ * PTEs are invalidated before the generic drop handler is called.
+ */
+static void inode_go_drop_th(struct gfs2_glock *gl)
+{
+        gfs2_pte_inval(gl);
+        gfs2_glock_drop_th(gl);
+}
+/**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ * @flags: DIO_METADATA and/or DIO_DATA select what is synced;
+ *         DIO_RELEASE additionally empties the glock's AIL entries
+ *
+ * Nothing happens unless GLF_DIRTY is set.  The dirty bit is cleared only
+ * when both metadata and data were synced.
+ */
+static void inode_go_sync(struct gfs2_glock *gl, int flags)
+{
+        const int want_meta = (flags & DIO_METADATA);
+        const int want_data = (flags & DIO_DATA);
+        if (!test_bit(GLF_DIRTY, &gl->gl_flags))
+                return;
+        if (want_meta && want_data) {
+                /* Start data writeback, sync metadata, then wait for data. */
+                gfs2_page_writeback(gl);
+                gfs2_log_flush(gl->gl_sbd, gl);
+                gfs2_meta_sync(gl);
+                gfs2_page_wait(gl);
+                clear_bit(GLF_DIRTY, &gl->gl_flags);
+        } else if (want_meta) {
+                gfs2_log_flush(gl->gl_sbd, gl);
+                gfs2_meta_sync(gl);
+        } else if (want_data) {
+                gfs2_page_writeback(gl);
+                gfs2_page_wait(gl);
+        }
+        if (flags & DIO_RELEASE)
+                gfs2_ail_empty_gl(gl);
+}
+/**
+ * inode_go_inval - prepare an inode glock to be released
+ * @gl: the glock
+ * @flags: DIO_METADATA invalidates metadata and bumps gl_vn;
+ *         DIO_DATA invalidates the data pages
+ */
+static void inode_go_inval(struct gfs2_glock *gl, int flags)
+{
+        if (flags & DIO_METADATA) {
+                gfs2_meta_inval(gl);
+                gl->gl_vn++;
+        }
+        if (flags & DIO_DATA)
+                gfs2_page_inval(gl);
+}
+/**
+ * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
+ * @gl: the glock
+ *
+ * It is ok when no object is attached and the glock's address space holds
+ * no pages, or, with localcaching off, once gt_demote_secs seconds have
+ * passed since gl_stamp.
+ *
+ * Returns: 1 if it's ok
+ */
+static int inode_go_demote_ok(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
+                return 1;
+        if (sdp->sd_args.ar_localcaching)
+                return 0;
+        if (time_after_eq(jiffies, gl->gl_stamp +
+                          gfs2_tune_get(sdp, gt_demote_secs) * HZ))
+                return 1;
+        return 0;
+}
+/**
+ * inode_go_lock - operation done after an inode lock is locked by a process
+ * @gh: the holder that has just been granted
+ *
+ * If an inode is attached to the glock and its cached version (i_vn)
+ * differs from the glock's (gl_vn), the inode is refreshed.  A truncate
+ * flagged as in progress is resumed when the glock is held EXCLUSIVE and
+ * the holder carries GL_LOCAL_EXCL.
+ *
+ * Returns: errno
+ */
+static int inode_go_lock(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_inode *ip = gl->gl_object;
+        int error;
+        if (ip == NULL)
+                return 0;
+        if (ip->i_vn != gl->gl_vn) {
+                error = gfs2_inode_refresh(ip);
+                if (error)
+                        return error;
+                gfs2_inode_attr_in(ip);
+        }
+        if (!(ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG))
+                return 0;
+        if (gl->gl_state != LM_ST_EXCLUSIVE)
+                return 0;
+        if (!(gh->gh_flags & GL_LOCAL_EXCL))
+                return 0;
+        return gfs2_truncatei_resume(ip);
+}
+/**
+ * inode_go_unlock - operation done before an inode lock is unlocked by a
+ *                   process
+ * @gh: the holder being released
+ *
+ * With an inode attached to the glock: gfs2_inode_attr_in() is called if
+ * the glock is dirty, then the inode's metadata cache is flushed.
+ */
+static void inode_go_unlock(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_inode *ip = gl->gl_object;
+        if (ip) {
+                if (test_bit(GLF_DIRTY, &gl->gl_flags))
+                        gfs2_inode_attr_in(ip);
+                gfs2_meta_cache_flush(ip);
+        }
+}
+/**
+ * inode_greedy - adjust the greedy time of the inode behind a glock
+ * @gl: the glock
+ *
+ * Under i_spin, i_greedy grows by gt_greedy_quantum (capped at
+ * gt_greedy_max) when i_last_pfault + quantum is still after jiffies;
+ * otherwise it shrinks by the quantum.  The arithmetic is unsigned, so a
+ * result of zero or one above the maximum (e.g. after wrap-around) is
+ * replaced by 1.  An inode reference is dropped with iput() at the end.
+ */
+static void inode_greedy(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_inode *ip = gl->gl_object;
+        unsigned int step = gfs2_tune_get(sdp, gt_greedy_quantum);
+        unsigned int cap = gfs2_tune_get(sdp, gt_greedy_max);
+        unsigned int next;
+        spin_lock(&ip->i_spin);
+        if (time_after(ip->i_last_pfault + step, jiffies)) {
+                next = ip->i_greedy + step;
+                next = (next > cap) ? cap : next;
+        } else {
+                next = ip->i_greedy - step;
+                next = (next == 0 || next > cap) ? 1 : next;
+        }
+        ip->i_greedy = next;
+        spin_unlock(&ip->i_spin);
+        iput(&ip->i_inode);
+}
+/**
+ * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
+ * @gl: the glock
+ *
+ * Returns: 1 if the glock's address space holds no pages, otherwise 0
+ */
+static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+{
+        return gl->gl_aspace->i_mapping->nrpages == 0;
+}
+/**
+ * rgrp_go_lock - operation done after an rgrp lock is locked by
+ *    a first holder on this node.
+ * @gh: the holder; its glock's gl_object is passed to gfs2_rgrp_bh_get()
+ *
+ * Returns: errno
+ */
+static int rgrp_go_lock(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        return gfs2_rgrp_bh_get(gl->gl_object);
+}
+/**
+ * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
+ *    a last holder on this node.
+ * @gh: the holder; its glock's gl_object is passed to gfs2_rgrp_bh_put()
+ */
+static void rgrp_go_unlock(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        gfs2_rgrp_bh_put(gl->gl_object);
+}
+/**
+ * trans_go_xmote_th - promote/demote the transaction glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags: passed through to gfs2_glock_xmote_th()
+ *
+ * If the glock is not currently unlocked and the journal is live, the
+ * filesystem metadata is synced and the log shut down before the state
+ * change is requested.
+ */
+static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+                              int flags)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const int not_unlocked = (gl->gl_state != LM_ST_UNLOCKED);
+        if (not_unlocked && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+                gfs2_meta_syncfs(sdp);
+                gfs2_log_shutdown(sdp);
+        }
+        gfs2_glock_xmote_th(gl, state, flags);
+}
+/**
+ * trans_go_xmote_bh - After promoting/demoting the transaction glock
+ * @gl: the glock
+ *
+ * When the glock ended up in a locked state and the journal is live, the
+ * journal inode's cached metadata and data are invalidated, the journal
+ * head is looked up again and, unless the filesystem has been shut down,
+ * the in-core log sequence and pointers are re-initialised from it.
+ *
+ * A failed journal head lookup, or a head without GFS2_LOG_HEAD_UNMOUNT,
+ * is reported through gfs2_consist().
+ */
+static void trans_go_xmote_bh(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+        struct gfs2_glock *j_gl = ip->i_gl;
+        struct gfs2_log_header head;
+        int error;
+        if (gl->gl_state != LM_ST_UNLOCKED &&
+            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+                gfs2_meta_cache_flush(ip);
+                j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+                error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+                if (error) {
+                        /* head was not filled in; do not read it. */
+                        gfs2_consist(sdp);
+                        return;
+                }
+                if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
+                        gfs2_consist(sdp);
+                /*  Initialize some head of the log stuff  */
+                if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+                        sdp->sd_log_sequence = head.lh_sequence + 1;
+                        gfs2_log_pointers_init(sdp, head.lh_blkno);
+                }
+        }
+}
+/**
+ * trans_go_drop_th - unlock the transaction glock
+ * @gl: the glock
+ *
+ * We want to sync the device even with localcaching.  Remember
+ * that localcaching journal replay only marks buffers dirty.
+ *
+ * With a live journal the metadata is synced and the log shut down
+ * before the generic drop handler runs.
+ */
+static void trans_go_drop_th(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        const int journal_live = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+        if (journal_live) {
+                gfs2_meta_syncfs(sdp);
+                gfs2_log_shutdown(sdp);
+        }
+        gfs2_glock_drop_th(gl);
+}
+/**
+ * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
+ * @gl: the glock
+ *
+ * Returns: 1 if the glock's gl_lvb_count is zero, otherwise 0
+ */
+static int quota_go_demote_ok(struct gfs2_glock *gl)
+{
+        return atomic_read(&gl->gl_lvb_count) == 0;
+}
+/* Operations for LM_TYPE_META glocks: only the generic xmote/drop
+   handlers are installed; every other hook is left NULL. */
+const struct gfs2_glock_operations gfs2_meta_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_type = LM_TYPE_META,
+};
+/* Operations for LM_TYPE_INODE glocks, using the inode_* handlers
+   defined in this file.  go_drop_bh and go_callback are left NULL. */
+const struct gfs2_glock_operations gfs2_inode_glops = {
+        .go_xmote_th = inode_go_xmote_th,
+        .go_xmote_bh = inode_go_xmote_bh,
+        .go_drop_th = inode_go_drop_th,
+        .go_sync = inode_go_sync,
+        .go_inval = inode_go_inval,
+        .go_demote_ok = inode_go_demote_ok,
+        .go_lock = inode_go_lock,
+        .go_unlock = inode_go_unlock,
+        .go_greedy = inode_greedy,
+        .go_type = LM_TYPE_INODE,
+};
+/* Operations for LM_TYPE_RGRP glocks: generic xmote/drop handlers, the
+   meta_* sync/invalidate handlers and the rgrp_* lock hooks. */
+const struct gfs2_glock_operations gfs2_rgrp_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_sync = meta_go_sync,
+        .go_inval = meta_go_inval,
+        .go_demote_ok = rgrp_go_demote_ok,
+        .go_lock = rgrp_go_lock,
+        .go_unlock = rgrp_go_unlock,
+        .go_type = LM_TYPE_RGRP,
+};
+/* Operations for the transaction glock, using the trans_* handlers in
+   this file.  Its lock type is LM_TYPE_NONDISK. */
+const struct gfs2_glock_operations gfs2_trans_glops = {
+        .go_xmote_th = trans_go_xmote_th,
+        .go_xmote_bh = trans_go_xmote_bh,
+        .go_drop_th = trans_go_drop_th,
+        .go_type = LM_TYPE_NONDISK,
+};
+/* Operations for LM_TYPE_IOPEN glocks: only the generic xmote/drop
+   handlers are installed. */
+const struct gfs2_glock_operations gfs2_iopen_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_type = LM_TYPE_IOPEN,
+};
+/* Operations for LM_TYPE_FLOCK glocks: only the generic xmote/drop
+   handlers are installed. */
+const struct gfs2_glock_operations gfs2_flock_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_type = LM_TYPE_FLOCK,
+};
+/* Operations for LM_TYPE_NONDISK glocks: only the generic xmote/drop
+   handlers are installed. */
+const struct gfs2_glock_operations gfs2_nondisk_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_type = LM_TYPE_NONDISK,
+};
+/* Operations for LM_TYPE_QUOTA glocks: generic xmote/drop handlers plus
+   quota_go_demote_ok() as the demote check. */
+const struct gfs2_glock_operations gfs2_quota_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_demote_ok = quota_go_demote_ok,
+        .go_type = LM_TYPE_QUOTA,
+};
+/* Operations for LM_TYPE_JOURNAL glocks: only the generic xmote/drop
+   handlers are installed. */
+const struct gfs2_glock_operations gfs2_journal_glops = {
+        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_drop_th = gfs2_glock_drop_th,
+        .go_type = LM_TYPE_JOURNAL,
+};
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __GLOPS_DOT_H__
+#define __GLOPS_DOT_H__
+#include "incore.h"
+extern const struct gfs2_glock_operations gfs2_meta_glops;
+extern const struct gfs2_glock_operations gfs2_inode_glops;
+extern const struct gfs2_glock_operations gfs2_rgrp_glops;
+extern const struct gfs2_glock_operations gfs2_trans_glops;
+extern const struct gfs2_glock_operations gfs2_iopen_glops;
+extern const struct gfs2_glock_operations gfs2_flock_glops;
+extern const struct gfs2_glock_operations gfs2_nondisk_glops;
+extern const struct gfs2_glock_operations gfs2_quota_glops;
+extern const struct gfs2_glock_operations gfs2_journal_glops;
+#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __INCORE_DOT_H__
+#define __INCORE_DOT_H__
+#include <linux/fs.h>
+#define DIO_WAIT        0x00000010
+#define DIO_METADATA    0x00000020
+#define DIO_DATA        0x00000040
+#define DIO_RELEASE     0x00000080
+#define DIO_ALL         0x00000100
+struct gfs2_log_operations;
+struct gfs2_log_element;
+struct gfs2_holder;
+struct gfs2_glock;
+struct gfs2_quota_data;
+struct gfs2_trans;
+struct gfs2_ail;
+struct gfs2_jdesc;
+struct gfs2_sbd;
+typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
+/*
+ * Structure of operations that are associated with each
+ * type of element in the log.
+ *
+ * The lo_add/lo_*_commit hooks take the superblock data (struct gfs2_sbd);
+ * the lo_*_scan hooks take a journal descriptor and a pass number.
+ * lo_name is a string naming the set of operations.
+ */
+struct gfs2_log_operations {
+        void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
+        void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+        void (*lo_before_commit) (struct gfs2_sbd *sdp);
+        void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
+        void (*lo_before_scan) (struct gfs2_jdesc *jd,
+                                struct gfs2_log_header *head, int pass);
+        int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
+                                 struct gfs2_log_descriptor *ld, __be64 *ptr,
+                                 int pass);
+        void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
+        const char *lo_name;
+};
+/*
+ * A list linkage (le_list) paired with the log operations that apply to
+ * the element (le_ops).  Embedded in gfs2_rgrpd, gfs2_bufdata, gfs2_glock
+ * and gfs2_revoke in this header.
+ */
+struct gfs2_log_element {
+        struct list_head le_list;
+        const struct gfs2_log_operations *le_ops;
+};
+/*
+ * One bitmap: its buffer_head (bi_bh), a char buffer bi_clone, and the
+ * bi_offset/bi_start/bi_len values.
+ * NOTE(review): the units of the three u32 fields are not visible in this
+ * header -- confirm against the rgrp code.
+ */
+struct gfs2_bitmap {
+        struct buffer_head *bi_bh;
+        char *bi_clone;
+        u32 bi_offset;
+        u32 bi_start;
+        u32 bi_len;
+};
+/*
+ * In-core state for one resource group (rgrp): list linkages, the glock
+ * for the rgrp, copies of its gfs2_rindex and gfs2_rgrp structures, its
+ * array of bitmaps, and allocation bookkeeping.  rd_mutex is embedded
+ * here; what it serialises is not visible in this header.
+ */
+struct gfs2_rgrpd {
+        struct list_head rd_list;       /* Link with superblock */
+        struct list_head rd_list_mru;
+        struct list_head rd_recent;     /* Recently used rgrps */
+        struct gfs2_glock *rd_gl;       /* Glock for this rgrp */
+        struct gfs2_rindex rd_ri;
+        struct gfs2_rgrp rd_rg;
+        u64 rd_rg_vn;
+        struct gfs2_bitmap *rd_bits;
+        unsigned int rd_bh_count;
+        struct mutex rd_mutex;
+        u32 rd_free_clone;
+        struct gfs2_log_element rd_le;
+        u32 rd_last_alloc_data;
+        u32 rd_last_alloc_meta;
+        struct gfs2_sbd *rd_sbd;
+};
+/*
+ * GFS2-private buffer_head state bits, numbered from BH_PrivateStart.
+ * The BUFFER_FNS/TAS_BUFFER_FNS invocations below are presumably what
+ * generate the per-bit accessor helpers (see linux/buffer_head.h).
+ */
+enum gfs2_state_bits {
+        BH_Pinned = BH_PrivateStart,
+        BH_Escaped = BH_PrivateStart + 1,
+};
+BUFFER_FNS(Pinned, pinned)
+TAS_BUFFER_FNS(Pinned, pinned)
+BUFFER_FNS(Escaped, escaped)
+TAS_BUFFER_FNS(Escaped, escaped)
+/*
+ * Per-buffer bookkeeping: the buffer_head (bd_bh), the glock it belongs
+ * to (bd_gl), a transaction list linkage, a log element, and the AIL
+ * entry (bd_ail) with its two list linkages.  gfs2_ail_empty_gl() walks
+ * a glock's gl_ail_list through bd_ail_gl_list.
+ */
+struct gfs2_bufdata {
+        struct buffer_head *bd_bh;
+        struct gfs2_glock *bd_gl;
+        struct list_head bd_list_tr;
+        struct gfs2_log_element bd_le;
+        struct gfs2_ail *bd_ail;
+        struct list_head bd_ail_st_list;
+        struct list_head bd_ail_gl_list;
+};
+/*
+ * Per-lock-type hooks for a glock; instances are the gfs2_*_glops tables.
+ * Hooks may be left NULL (several tables set only go_xmote_th and
+ * go_drop_th).  go_lock/go_unlock take the holder, all other hooks take
+ * the glock.  go_type holds the LM_TYPE_* value of the lock type.
+ */
+struct gfs2_glock_operations {
+        void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
+                             int flags);
+        void (*go_xmote_bh) (struct gfs2_glock * gl);
+        void (*go_drop_th) (struct gfs2_glock * gl);
+        void (*go_drop_bh) (struct gfs2_glock * gl);
+        void (*go_sync) (struct gfs2_glock * gl, int flags);
+        void (*go_inval) (struct gfs2_glock * gl, int flags);
+        int (*go_demote_ok) (struct gfs2_glock * gl);
+        int (*go_lock) (struct gfs2_holder * gh);
+        void (*go_unlock) (struct gfs2_holder * gh);
+        void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
+        void (*go_greedy) (struct gfs2_glock * gl);
+        const int go_type;
+};
+/*
+ * HIF_* holder flag numbers.
+ * NOTE(review): presumably bit numbers for gfs2_holder.gh_iflags --
+ * confirm.  Value 8 is unused.
+ */
+enum {
+        /* Actions */
+        HIF_MUTEX               = 0,
+        HIF_PROMOTE             = 1,
+        HIF_DEMOTE              = 2,
+        HIF_GREEDY              = 3,
+        /* States */
+        HIF_ALLOCED             = 4,
+        HIF_DEALLOC             = 5,
+        HIF_HOLDER              = 6,
+        HIF_FIRST               = 7,
+        HIF_ABORTED             = 9,
+};
+/*
+ * A request for, or hold on, a glock: list linkage, the glock (gh_gl),
+ * the owning task (gh_owner), the requested state and modifier flags
+ * (see gfs2_holder_init()), an error slot, and a completion (gh_wait).
+ * NOTE(review): gh_iflags presumably holds HIF_* bits and gh_ip a caller
+ * address -- confirm.
+ */
+struct gfs2_holder {
+        struct list_head gh_list;
+        struct gfs2_glock *gh_gl;
+        struct task_struct *gh_owner;
+        unsigned int gh_state;
+        unsigned gh_flags;
+        int gh_error;
+        unsigned long gh_iflags;
+        struct completion gh_wait;
+        unsigned long gh_ip;
+};
+/* Bit numbers for gfs2_glock.gl_flags (used with test_bit/set_bit).
+   Values 0 and 4 are unused. */
+enum {
+        GLF_LOCK                = 1,
+        GLF_STICKY              = 2,
+        GLF_PREFETCH            = 3,
+        GLF_DIRTY               = 5,
+        GLF_SKIP_WAITERS2       = 6,
+        GLF_GREEDY              = 7,
+};
+/*
+ * In-core glock.  Linked into a hash bucket through gl_list; gl_spin is
+ * taken around accesses to the holder and waiter lists by the inline
+ * helpers in glock.h.  gl_ops selects the per-type hooks, gl_object
+ * points at the object attached to the lock (a gfs2_inode for inode
+ * glocks), and gl_vn is a version number bumped on metadata
+ * invalidation.
+ */
+struct gfs2_glock {
+        struct hlist_node gl_list;
+        unsigned long gl_flags;         /* GLF_... */
+        struct lm_lockname gl_name;
+        atomic_t gl_ref;
+        spinlock_t gl_spin;
+        unsigned int gl_state;
+        unsigned int gl_hash;
+        struct task_struct *gl_owner;
+        unsigned long gl_ip;
+        struct list_head gl_holders;
+        struct list_head gl_waiters1;   /* HIF_MUTEX */
+        struct list_head gl_waiters2;   /* HIF_DEMOTE, HIF_GREEDY */
+        struct list_head gl_waiters3;   /* HIF_PROMOTE */
+        const struct gfs2_glock_operations *gl_ops;
+        struct gfs2_holder *gl_req_gh;
+        gfs2_glop_bh_t gl_req_bh;
+        void *gl_lock;
+        char *gl_lvb;
+        atomic_t gl_lvb_count;
+        u64 gl_vn;
+        unsigned long gl_stamp;
+        void *gl_object;
+        struct list_head gl_reclaim;
+        struct gfs2_sbd *gl_sbd;
+        struct inode *gl_aspace;
+        struct gfs2_log_element gl_le;
+        struct list_head gl_ail_list;
+        atomic_t gl_ail_count;
+};
+/*
+ * Allocation state embedded in each gfs2_inode (i_alloc): quota data and
+ * holders, the requested/allocated block counts, and the holders and
+ * rgrp filled in by gfs2_inplace_reserve().
+ * NOTE(review): al_line/al_file look like a recorded call site -- confirm.
+ */
+struct gfs2_alloc {
+        /* Quota stuff */
+        struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
+        struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
+        unsigned int al_qd_num;
+        u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+        u32 al_alloced; /* Filled in by gfs2_alloc_*() */
+        /* Filled in by gfs2_inplace_reserve() */
+        unsigned int al_line;
+        char *al_file;
+        struct gfs2_holder al_ri_gh;
+        struct gfs2_holder al_rgd_gh;
+        struct gfs2_rgrpd *al_rgd;
+};
+/* Bit numbers for gfs2_inode.i_flags (used with test_bit/set_bit). */
+enum {
+        GIF_QD_LOCKED           = 1,
+        GIF_PAGED               = 2,
+        GIF_SW_PAGED            = 3,
+};
+/*
+ * In-core GFS2 inode.  The VFS inode is embedded as the first member
+ * (i_inode), which GFS2_I() relies on.  i_vn is compared against the
+ * glock's gl_vn to decide whether the inode must be refreshed; i_spin
+ * is taken around the i_greedy update in inode_greedy().
+ */
+struct gfs2_inode {
+        struct inode i_inode;
+        struct gfs2_inum i_num;
+        unsigned long i_flags;          /* GIF_... */
+        u64 i_vn;
+        struct gfs2_dinode i_di; /* To be replaced by ref to block */
+        struct gfs2_glock *i_gl; /* Move into i_gh? */
+        struct gfs2_holder i_iopen_gh;
+        struct gfs2_holder i_gh; /* for prepare/commit_write only */
+        struct gfs2_alloc i_alloc;
+        u64 i_last_rg_alloc;
+        spinlock_t i_spin;
+        struct rw_semaphore i_rw_mutex;
+        unsigned int i_greedy;
+        unsigned long i_last_pfault;
+        struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
+};
+/*
+ * Map a VFS inode to the gfs2_inode that embeds it.  Because i_inode is
+ * the first member of struct gfs2_inode, this is effectively a cast.
+ */
+static inline struct gfs2_inode *GFS2_I(struct inode *inode)
+{
+        struct gfs2_inode *ip = container_of(inode, struct gfs2_inode, i_inode);
+        return ip;
+}
+/* To be removed? */
+/* Fetch the s_fs_info pointer of the super_block that owns the inode. */
+static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
+{
+        struct super_block *sb = inode->i_sb;
+        return sb->s_fs_info;
+}
+/* Bit numbers for gfs2_file.f_flags. */
+enum {
+        GFF_DID_DIRECT_ALLOC    = 0,
+        GFF_EXLOCK = 1,
+};
+/*
+ * Per-file state: GFF_* flag bits, a mutex and a glock holder.
+ * NOTE(review): the f_fl_ prefix suggests flock handling -- confirm.
+ */
+struct gfs2_file {
+        unsigned long f_flags;          /* GFF_... */
+        struct mutex f_fl_mutex;
+        struct gfs2_holder f_fl_gh;
+};
+/* A revoke: a log element plus the block number it applies to. */
+struct gfs2_revoke {
+        struct gfs2_log_element rv_le;
+        u64 rv_blkno;
+};
+/*
+ * List entry recording a block number (rr_blkno) and a position
+ * (rr_where).
+ * NOTE(review): presumably built while scanning the journal for replay;
+ * the meaning of rr_where is not visible here -- confirm.
+ */
+struct gfs2_revoke_replay {
+        struct list_head rr_list;
+        u64 rr_blkno;
+        unsigned int rr_where;
+};
+/* Bit numbers for gfs2_quota_data.qd_flags. */
+enum {
+        QDF_USER                = 0,
+        QDF_CHANGE              = 1,
+        QDF_LOCKED              = 2,
+};
+/*
+ * Quota values with big-endian (__be32/__be64) members and an explicit
+ * pad word.
+ * NOTE(review): the name suggests this is the layout stored in a quota
+ * glock's lock value block -- confirm.
+ */
+struct gfs2_quota_lvb {
+        __be32 qb_magic;
+        u32 __pad;
+        __be64 qb_limit;      /* Hard limit of # blocks to alloc */
+        __be64 qb_warn;       /* Warn user when alloc is above this # */
+        __be64 qb_value;       /* Current # blocks allocated */
+};
+/*
+ * In-core quota record for one id (qd_id): list linkage, counters, QDF_*
+ * flags, pending change values, slot bookkeeping, a buffer_head with a
+ * pointer to a gfs2_quota_change, the quota glock, and a gfs2_quota_lvb
+ * copy (qd_qb).
+ * NOTE(review): units of qd_last_warn/qd_last_touched are not visible
+ * here (presumably jiffies) -- confirm.
+ */
+struct gfs2_quota_data {
+        struct list_head qd_list;
+        unsigned int qd_count;
+        u32 qd_id;
+        unsigned long qd_flags;         /* QDF_... */
+        s64 qd_change;
+        s64 qd_change_sync;
+        unsigned int qd_slot;
+        unsigned int qd_slot_count;
+        struct buffer_head *qd_bh;
+        struct gfs2_quota_change *qd_bh_qc;
+        unsigned int qd_bh_count;
+        struct gfs2_glock *qd_gl;
+        struct gfs2_quota_lvb qd_qb;
+        u64 qd_sync_gen;
+        unsigned long qd_last_warn;
+        unsigned long qd_last_touched;
+};
+/*
+ * struct gfs2_log_buf - list entry pairing two buffer heads.
+ * NOTE(review): lb_bh is presumably the buffer written to the log and
+ * lb_real the in-place buffer it shadows -- confirm in log.c.
+ */
+struct gfs2_log_buf {
+        struct list_head lb_list;
+        struct buffer_head *lb_bh;
+        struct buffer_head *lb_real;
+};
+/*
+ * struct gfs2_trans - bookkeeping for one transaction: requested block and
+ * revoke counts, the reservation, a glock holder, and running counts of
+ * buffers and revokes (going by the field names).
+ */
+struct gfs2_trans {
+        unsigned long tr_ip;
+        unsigned int tr_blocks;
+        unsigned int tr_revokes;
+        unsigned int tr_reserved;
+        struct gfs2_holder tr_t_gh;
+        int tr_touched;
+        unsigned int tr_num_buf;
+        unsigned int tr_num_buf_new;
+        unsigned int tr_num_buf_rm;
+        struct list_head tr_list_buf;
+        unsigned int tr_num_revoke;
+        unsigned int tr_num_revoke_rm;
+};
+/*
+ * struct gfs2_ail - one AIL entry with its own ail1 and ail2 sub-lists.
+ * NOTE(review): presumably chained on gfs2_sbd.sd_ail1_list/sd_ail2_list
+ * through ai_list -- confirm in log.c.
+ */
+struct gfs2_ail {
+        struct list_head ai_list;
+        unsigned int ai_first;
+        struct list_head ai_ail1_list;
+        struct list_head ai_ail2_list;
+        u64 ai_sync_gen;
+};
+/*
+ * struct gfs2_jdesc - descriptor for one journal: its inode, journal id,
+ * dirty flag and size in blocks.
+ * NOTE(review): presumably linked on gfs2_sbd.sd_jindex_list via jd_list.
+ */
+struct gfs2_jdesc {
+        struct list_head jd_list;
+        struct inode *jd_inode;
+        unsigned int jd_jid;
+        int jd_dirty;
+        unsigned int jd_blocks;
+};
+/* Number of glockd threads: default, and the size of sd_glockd_process[] */
+#define GFS2_GLOCKD_DEFAULT     1
+#define GFS2_GLOCKD_MAX         16
+/*
+ * Values for gfs2_args.ar_quota (off/account/on).  The *_DEFAULT macros
+ * name a constant defined further down; that is fine because a macro is
+ * only expanded where it is used.
+ */
+#define GFS2_QUOTA_DEFAULT      GFS2_QUOTA_OFF
+#define GFS2_QUOTA_OFF          0
+#define GFS2_QUOTA_ACCOUNT      1
+#define GFS2_QUOTA_ON           2
+/* Values for gfs2_args.ar_data (ordered/writeback) */
+#define GFS2_DATA_DEFAULT       GFS2_DATA_ORDERED
+#define GFS2_DATA_WRITEBACK     1
+#define GFS2_DATA_ORDERED       2
+/* struct gfs2_args - mount arguments (embedded in struct gfs2_sbd as sd_args) */
+struct gfs2_args {
+        char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
+        char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
+        char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
+        int ar_spectator; /* Don't get a journal because we're always RO */
+        int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
+        int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
+        int ar_localcaching; /* Local-style caching (dangerous on multihost) */
+        int ar_debug; /* Oops on errors instead of trying to be graceful */
+        int ar_upgrade; /* Upgrade ondisk/multihost format */
+        unsigned int ar_num_glockd; /* Number of glockd threads */
+        int ar_posix_acl; /* Enable posix acls */
+        int ar_quota; /* off/account/on */
+        int ar_suiddir; /* suiddir support */
+        int ar_data; /* ordered/writeback */
+};
+/*
+ * struct gfs2_tune - filesystem tuning parameters (embedded in struct
+ * gfs2_sbd as sd_tune).
+ * NOTE(review): gt_spin presumably serialises access to the values below
+ * -- confirm against the tune get/set helpers.
+ */
+struct gfs2_tune {
+        spinlock_t gt_spin;
+        unsigned int gt_ilimit;
+        unsigned int gt_ilimit_tries;
+        unsigned int gt_ilimit_min;
+        unsigned int gt_demote_secs; /* Cache retention for unheld glock */
+        unsigned int gt_incore_log_blocks;
+        unsigned int gt_log_flush_secs;
+        unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
+        unsigned int gt_scand_secs;
+        unsigned int gt_recoverd_secs;
+        unsigned int gt_logd_secs;
+        unsigned int gt_quotad_secs;
+        unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
+        unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
+        unsigned int gt_quota_scale_num; /* Numerator */
+        unsigned int gt_quota_scale_den; /* Denominator */
+        unsigned int gt_quota_cache_secs;
+        unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
+        unsigned int gt_atime_quantum; /* Min secs between atime updates */
+        unsigned int gt_new_files_jdata;
+        unsigned int gt_new_files_directio;
+        unsigned int gt_max_atomic_write; /* Split big writes into this size */
+        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
+        unsigned int gt_lockdump_size;
+        unsigned int gt_stall_secs; /* Detects trouble! */
+        unsigned int gt_complain_secs;
+        unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
+        unsigned int gt_entries_per_readdir;
+        unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
+        unsigned int gt_greedy_default;
+        unsigned int gt_greedy_quantum;
+        unsigned int gt_greedy_max;
+        unsigned int gt_statfs_quantum;
+        unsigned int gt_statfs_slow;
+};
+/*
+ * SDF_...: values for gfs2_sbd.sd_flags.
+ * NOTE(review): presumably bit numbers rather than masks -- confirm.
+ */
+enum {
+        SDF_JOURNAL_CHECKED     = 0,
+        SDF_JOURNAL_LIVE        = 1,
+        SDF_SHUTDOWN            = 2,
+        SDF_NOATIME             = 3,
+};
+/* Size of the sd_fsname, sd_table_name and sd_proto_name buffers */
+#define GFS2_FSNAME_LEN         256
+/*
+ * struct gfs2_sbd - in-core state of one mounted GFS2 filesystem.
+ *
+ * This is what the VFS super_block's s_fs_info points at (see GFS2_SB()).
+ * The members are grouped by subsystem; each group is introduced by a
+ * short comment below.
+ */
+struct gfs2_sbd {
+        struct super_block *sd_vfs;
+        struct super_block *sd_vfs_meta;
+        struct kobject sd_kobj;
+        unsigned long sd_flags; /* SDF_... */
+        struct gfs2_sb sd_sb;
+        /* Constants computed on mount */
+        u32 sd_fsb2bb;
+        u32 sd_fsb2bb_shift;
+        u32 sd_diptrs;  /* Number of pointers in a dinode */
+        u32 sd_inptrs;  /* Number of pointers in a indirect block */
+        u32 sd_jbsize;  /* Size of a journaled data block */
+        u32 sd_hash_bsize;      /* sizeof(exhash block) */
+        u32 sd_hash_bsize_shift;
+        u32 sd_hash_ptrs;       /* Number of pointers in a hash block */
+        u32 sd_qc_per_block;
+        u32 sd_max_dirres;      /* Max blocks needed to add a directory entry */
+        u32 sd_max_height;      /* Max height of a file's metadata tree */
+        u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+        u32 sd_max_jheight; /* Max height of journaled file's meta tree */
+        u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+        struct gfs2_args sd_args;       /* Mount arguments */
+        struct gfs2_tune sd_tune;       /* Filesystem tuning structure */
+        /* Lock Stuff */
+        struct lm_lockstruct sd_lockstruct;
+        struct list_head sd_reclaim_list;
+        spinlock_t sd_reclaim_lock;
+        wait_queue_head_t sd_reclaim_wq;
+        atomic_t sd_reclaim_count;
+        struct gfs2_holder sd_live_gh;
+        struct gfs2_glock *sd_rename_gl;
+        struct gfs2_glock *sd_trans_gl;
+        /* Inode Stuff */
+        struct inode *sd_master_dir;
+        struct inode *sd_jindex;
+        struct inode *sd_inum_inode;
+        struct inode *sd_statfs_inode;
+        struct inode *sd_ir_inode;
+        struct inode *sd_sc_inode;
+        struct inode *sd_qc_inode;
+        struct inode *sd_rindex;
+        struct inode *sd_quota_inode;
+        /* Inum stuff */
+        struct mutex sd_inum_mutex;
+        /* StatFS stuff */
+        spinlock_t sd_statfs_spin;
+        struct mutex sd_statfs_mutex;
+        struct gfs2_statfs_change sd_statfs_master;
+        struct gfs2_statfs_change sd_statfs_local;
+        unsigned long sd_statfs_sync_time;
+        /* Resource group stuff */
+        u64 sd_rindex_vn;
+        spinlock_t sd_rindex_spin;
+        struct mutex sd_rindex_mutex;
+        struct list_head sd_rindex_list;
+        struct list_head sd_rindex_mru_list;
+        struct list_head sd_rindex_recent_list;
+        struct gfs2_rgrpd *sd_rindex_forward;
+        unsigned int sd_rgrps;
+        /* Journal index stuff */
+        struct list_head sd_jindex_list;
+        spinlock_t sd_jindex_spin;
+        struct mutex sd_jindex_mutex;
+        unsigned int sd_journals;
+        unsigned long sd_jindex_refresh_time;
+        struct gfs2_jdesc *sd_jdesc;
+        struct gfs2_holder sd_journal_gh;
+        struct gfs2_holder sd_jinode_gh;
+        struct gfs2_holder sd_ir_gh;
+        struct gfs2_holder sd_sc_gh;
+        struct gfs2_holder sd_qc_gh;
+        /* Daemon stuff */
+        struct task_struct *sd_scand_process;
+        struct task_struct *sd_recoverd_process;
+        struct task_struct *sd_logd_process;
+        struct task_struct *sd_quotad_process;
+        struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
+        unsigned int sd_glockd_num;
+        /* Quota stuff */
+        struct list_head sd_quota_list;
+        atomic_t sd_quota_count;
+        spinlock_t sd_quota_spin;
+        struct mutex sd_quota_mutex;
+        unsigned int sd_quota_slots;
+        unsigned int sd_quota_chunks;
+        unsigned char **sd_quota_bitmap;
+        u64 sd_quota_sync_gen;
+        unsigned long sd_quota_sync_time;
+        /* Log stuff */
+        spinlock_t sd_log_lock;
+        unsigned int sd_log_blks_reserved;
+        unsigned int sd_log_commited_buf;
+        unsigned int sd_log_commited_revoke;
+        unsigned int sd_log_num_gl;
+        unsigned int sd_log_num_buf;
+        unsigned int sd_log_num_revoke;
+        unsigned int sd_log_num_rg;
+        unsigned int sd_log_num_databuf;
+        unsigned int sd_log_num_jdata;
+        unsigned int sd_log_num_hdrs;
+        struct list_head sd_log_le_gl;
+        struct list_head sd_log_le_buf;
+        struct list_head sd_log_le_revoke;
+        struct list_head sd_log_le_rg;
+        struct list_head sd_log_le_databuf;
+        unsigned int sd_log_blks_free;
+        struct mutex sd_log_reserve_mutex;
+        u64 sd_log_sequence;
+        unsigned int sd_log_head;
+        unsigned int sd_log_tail;
+        int sd_log_idle;
+        unsigned long sd_log_flush_time;
+        struct rw_semaphore sd_log_flush_lock;
+        struct list_head sd_log_flush_list;
+        unsigned int sd_log_flush_head;
+        u64 sd_log_flush_wrapped;
+        struct list_head sd_ail1_list;
+        struct list_head sd_ail2_list;
+        u64 sd_ail_sync_gen;
+        /* Replay stuff */
+        struct list_head sd_revoke_list;
+        unsigned int sd_replay_tail;
+        unsigned int sd_found_blocks;
+        unsigned int sd_found_revokes;
+        unsigned int sd_replayed_blocks;
+        /* For quiescing the filesystem */
+        struct gfs2_holder sd_freeze_gh;
+        struct mutex sd_freeze_lock;
+        unsigned int sd_freeze_count;
+        /* Counters */
+        atomic_t sd_glock_count;
+        atomic_t sd_glock_held_count;
+        atomic_t sd_inode_count;
+        atomic_t sd_reclaimed;
+        char sd_fsname[GFS2_FSNAME_LEN];
+        char sd_table_name[GFS2_FSNAME_LEN];
+        char sd_proto_name[GFS2_FSNAME_LEN];
+        /* Debugging crud */
+        unsigned long sd_last_warning;
+        struct vfsmount *sd_gfs2mnt;
+};
+#endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <linux/security.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eattr.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "ops_file.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+/**
+ * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
+ * @ip: The GFS2 inode (with embedded disk inode data)
+ *
+ * The target is the VFS inode embedded in @ip.  Timestamps are copied at
+ * one-second resolution; the nanosecond parts are set to zero.
+ */
+void gfs2_inode_attr_in(struct gfs2_inode *ip)
+{
+        struct gfs2_dinode *di = &ip->i_di;
+        struct inode *inode = &ip->i_inode;
+        inode->i_ino = ip->i_num.no_addr;
+        /* Only block and character special files carry a device number */
+        if (S_ISBLK(di->di_mode) || S_ISCHR(di->di_mode))
+                inode->i_rdev = MKDEV(di->di_major, di->di_minor);
+        else
+                inode->i_rdev = 0;
+        inode->i_mode = di->di_mode;
+        inode->i_nlink = di->di_nlink;
+        inode->i_uid = di->di_uid;
+        inode->i_gid = di->di_gid;
+        i_size_write(inode, di->di_size);
+        inode->i_atime.tv_sec = di->di_atime;
+        inode->i_atime.tv_nsec = 0;
+        inode->i_mtime.tv_sec = di->di_mtime;
+        inode->i_mtime.tv_nsec = 0;
+        inode->i_ctime.tv_sec = di->di_ctime;
+        inode->i_ctime.tv_nsec = 0;
+        /* Convert filesystem blocks into basic blocks for i_blocks */
+        inode->i_blocks = di->di_blocks <<
+                (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+        if (di->di_flags & GFS2_DIF_IMMUTABLE) {
+                inode->i_flags |= S_IMMUTABLE;
+        } else {
+                inode->i_flags &= ~S_IMMUTABLE;
+        }
+        if (di->di_flags & GFS2_DIF_APPENDONLY) {
+                inode->i_flags |= S_APPEND;
+        } else {
+                inode->i_flags &= ~S_APPEND;
+        }
+}
+/**
+ * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
+ * @ip: The GFS2 inode
+ *
+ * Only copy out the attributes that we want the VFS layer
+ * to be able to modify: mode, uid, gid and the three timestamps
+ * (seconds only).  Link count, size, block count and flags are
+ * not touched here.
+ */
+void gfs2_inode_attr_out(struct gfs2_inode *ip)
+{
+        struct inode *inode = &ip->i_inode;
+        struct gfs2_dinode *di = &ip->i_di;
+        /* The file type bits must agree between the two copies */
+        gfs2_assert_withdraw(GFS2_SB(inode),
+                (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
+        di->di_mode = inode->i_mode;
+        di->di_uid = inode->i_uid;
+        di->di_gid = inode->i_gid;
+        di->di_atime = inode->i_atime.tv_sec;
+        di->di_mtime = inode->i_mtime.tv_sec;
+        di->di_ctime = inode->i_ctime.tv_sec;
+}
+/*
+ * Match callback handed to ilookup5()/iget5_locked(): returns 1 when the
+ * candidate inode has the same block address as the wanted inode number.
+ */
+static int iget_test(struct inode *inode, void *opaque)
+{
+        const struct gfs2_inum *wanted = opaque;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        return (ip && ip->i_num.no_addr == wanted->no_addr) ? 1 : 0;
+}
+/*
+ * Init callback handed to iget5_locked(): stores the inode number in the
+ * gfs2_inode.  Always returns 0.
+ */
+static int iget_set(struct inode *inode, void *opaque)
+{
+        const struct gfs2_inum *src = opaque;
+        GFS2_I(inode)->i_num = *src;
+        return 0;
+}
+/*
+ * Find an inode already in the inode cache.  The formal inode number is the
+ * hash value and iget_test() does the comparison.  Returns whatever
+ * ilookup5() returns.
+ */
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
+{
+        unsigned long hashval = (unsigned long)inum->no_formal_ino;
+        return ilookup5(sb, hashval, iget_test, inum);
+}
+/*
+ * Find or create the inode for @inum through iget5_locked().  The formal
+ * inode number is the hash value; iget_test() matches and iget_set()
+ * initialises a new inode.
+ */
+static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
+{
+        unsigned long hashval = (unsigned long)inum->no_formal_ino;
+        return iget5_locked(sb, hashval, iget_test, iget_set, inum);
+}
+/**
+ * gfs2_inode_lookup - Lookup an inode
+ * @sb: The super block
+ * @inum: The inode number
+ * @type: The type of the inode
+ *
+ * A newly created inode (I_NEW) gets its operation vectors from @type, an
+ * inode glock and a shared iopen glock before it is unlocked.  An inode
+ * that was already set up is returned as is.
+ *
+ * Returns: A VFS inode, or an error
+ */
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
+{
+        struct inode *inode = gfs2_iget(sb, inum);
+        struct gfs2_inode *ip;
+        struct gfs2_glock *io_gl;
+        int error;
+        /* iget5_locked() returns NULL when no inode could be allocated */
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        ip = GFS2_I(inode);
+        if (inode->i_state & I_NEW) {
+                struct gfs2_sbd *sdp = GFS2_SB(inode);
+                umode_t mode = DT2IF(type);
+                inode->i_private = ip;
+                inode->i_mode = mode;
+                if (S_ISREG(mode)) {
+                        inode->i_op = &gfs2_file_iops;
+                        inode->i_fop = &gfs2_file_fops;
+                        inode->i_mapping->a_ops = &gfs2_file_aops;
+                } else if (S_ISDIR(mode)) {
+                        inode->i_op = &gfs2_dir_iops;
+                        inode->i_fop = &gfs2_dir_fops;
+                } else if (S_ISLNK(mode)) {
+                        inode->i_op = &gfs2_symlink_iops;
+                } else {
+                        inode->i_op = &gfs2_dev_iops;
+                }
+                error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+                if (unlikely(error))
+                        goto fail;
+                ip->i_gl->gl_object = ip;
+                error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+                if (unlikely(error))
+                        goto fail_put;
+                /* Make i_vn differ from the glock's gl_vn */
+                ip->i_vn = ip->i_gl->gl_vn - 1;
+                error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+                if (unlikely(error))
+                        goto fail_iopen;
+                /* Drop the reference taken by gfs2_glock_get() above */
+                gfs2_glock_put(io_gl);
+                unlock_new_inode(inode);
+        }
+        return inode;
+fail_iopen:
+        gfs2_glock_put(io_gl);
+fail_put:
+        ip->i_gl->gl_object = NULL;
+        gfs2_glock_put(ip->i_gl);
+fail:
+        iput(inode);
+        return ERR_PTR(error);
+}
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Reads the dinode block, checks its metadata type and unpacks it into
+ * @ip->i_di.  The block address must match the incore inode number (-EIO
+ * otherwise) and so must the formal inode number (-ESTALE otherwise).  On
+ * success i_vn is set to the glock's gl_vn.
+ *
+ * Returns: errno
+ */
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+        struct gfs2_dinode *di = &ip->i_di;
+        struct buffer_head *bh;
+        int ret = gfs2_meta_inode_buffer(ip, &bh);
+        if (ret)
+                return ret;
+        if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_DI)) {
+                brelse(bh);
+                return -EIO;
+        }
+        gfs2_dinode_in(di, bh->b_data);
+        brelse(bh);
+        if (ip->i_num.no_addr != di->di_num.no_addr) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(di);
+                return -EIO;
+        }
+        if (ip->i_num.no_formal_ino != di->di_num.no_formal_ino)
+                return -ESTALE;
+        ip->i_vn = ip->i_gl->gl_vn;
+        return 0;
+}
+/**
+ * gfs2_dinode_dealloc - Free the dinode block of an inode
+ * @ip: The GFS2 inode
+ *
+ * The inode must own exactly one block (the dinode itself); anything else
+ * is reported as an inconsistency.  Resources are taken in the order
+ * quota hold, rindex hold, exclusive resource group glock, transaction,
+ * and released in reverse.
+ *
+ * Returns: errno
+ */
+int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al;
+        struct gfs2_rgrpd *rgd;
+        int error;
+        if (ip->i_di.di_blocks != 1) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(&ip->i_di);
+                return -EIO;
+        }
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out;
+        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+        if (error)
+                goto out_qs;
+        /* Find the resource group that contains the dinode block */
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+        if (!rgd) {
+                gfs2_consist_inode(ip);
+                error = -EIO;
+                goto out_rindex_relse;
+        }
+        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+                                   &al->al_rgd_gh);
+        if (error)
+                goto out_rindex_relse;
+        error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
+        if (error)
+                goto out_rg_gunlock;
+        gfs2_trans_add_gl(ip->i_gl);
+        gfs2_free_di(rgd, ip);
+        gfs2_trans_end(sdp);
+        clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+        /* The success path falls through the unwind labels as well */
+out_rg_gunlock:
+        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+out_rindex_relse:
+        gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_qs:
+        gfs2_quota_unhold(ip);
+out:
+        gfs2_alloc_put(ip);
+        return error;
+}
+/**
+ * gfs2_change_nlink - Change nlink count on inode
+ * @ip: The GFS2 inode
+ * @diff: The change in the nlink count required
+ *
+ * Updates the link count in the incore dinode, the VFS inode and the
+ * dinode buffer, and sets ctime to the current time.  When the count
+ * reaches zero the inode is additionally marked unlinked while the
+ * rindex and its resource group glock are held.
+ *
+ * Returns: errno
+ */
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
+{
+        struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
+        struct buffer_head *dibh;
+        u32 nlink;
+        int error;
+        /* The dinode and the VFS inode must agree before we start */
+        BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
+        nlink = ip->i_di.di_nlink + diff;
+        /* If we are reducing the nlink count, but the new value ends up being
+           bigger than the old one, we must have underflowed. */
+        if (diff < 0 && nlink > ip->i_di.di_nlink) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(&ip->i_di);
+                return -EIO;
+        }
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        ip->i_di.di_nlink = nlink;
+        ip->i_di.di_ctime = get_seconds();
+        ip->i_inode.i_nlink = nlink;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        mark_inode_dirty(&ip->i_inode);
+        if (ip->i_di.di_nlink == 0) {
+                struct gfs2_rgrpd *rgd;
+                struct gfs2_holder ri_gh, rg_gh;
+                error = gfs2_rindex_hold(sdp, &ri_gh);
+                if (error)
+                        goto out;
+                /* Preset the result for a missing resource group */
+                error = -EIO;
+                rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+                if (!rgd)
+                        goto out_norgrp;
+                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+                if (error)
+                        goto out_norgrp;
+                clear_nlink(&ip->i_inode);
+                gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
+                gfs2_glock_dq_uninit(&rg_gh);
+out_norgrp:
+                gfs2_glock_dq_uninit(&ri_gh);
+        }
+out:
+        return error;
+}
+/*
+ * Look up the C string @name in directory @dip.  This is gfs2_lookupi()
+ * called with is_root set to 1 and no nameidata.
+ */
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+{
+        struct qstr qname;
+        gfs2_str2qstr(&qname, name);
+        return gfs2_lookupi(dip, &qname, 1, NULL);
+}
+/**
+ * gfs2_lookupi - Look up a filename in a directory and return its inode
+ * @dir: The directory to search
+ * @name: The name of the inode to look for
+ * @is_root: If 1, ignore the caller's permissions
+ * @nd: The nameidata of the lookup (not used by this function)
+ *
+ * "." and, in the root directory, ".." resolve to @dir itself.  Otherwise
+ * the directory glock is held shared while the entry is searched for.
+ *
+ * Returns: the inode, NULL if @dir has no entry called @name, or an
+ *          ERR_PTR() for any other failure
+ */
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+                           int is_root, struct nameidata *nd)
+{
+        struct super_block *sb = dir->i_sb;
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct gfs2_holder d_gh;
+        struct gfs2_inum inum;
+        unsigned int type;
+        int error = 0;
+        struct inode *inode = NULL;
+        if (!name->len || name->len > GFS2_FNAMESIZE)
+                return ERR_PTR(-ENAMETOOLONG);
+        if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
+            (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
+             dir == sb->s_root->d_inode)) {
+                igrab(dir);
+                return dir;
+        }
+        error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+        if (error)
+                return ERR_PTR(error);
+        if (!is_root) {
+                error = permission(dir, MAY_EXEC, NULL);
+                if (error)
+                        goto out;
+        }
+        error = gfs2_dir_search(dir, name, &inum, &type);
+        if (error)
+                goto out;
+        inode = gfs2_inode_lookup(sb, &inum, type);
+out:
+        gfs2_glock_dq_uninit(&d_gh);
+        if (error == -ENOENT)
+                return NULL;
+        /*
+         * Any other failure (e.g. a permission error) must reach the caller
+         * as an error pointer; returning NULL would make it look as if the
+         * name simply did not exist.
+         */
+        return inode ? inode : ERR_PTR(error);
+}
+/*
+ * Try to take a formal inode number from the range stored behind the
+ * dinode header of sd_ir_inode.
+ *
+ * Returns: 0 with *formal_ino set, 1 if that range is empty, or a
+ *          negative errno
+ */
+static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+        struct gfs2_inode *ir_ip = GFS2_I(sdp->sd_ir_inode);
+        struct gfs2_inum_range range;
+        struct buffer_head *ir_bh;
+        int ret;
+        ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (ret)
+                return ret;
+        mutex_lock(&sdp->sd_inum_mutex);
+        ret = gfs2_meta_inode_buffer(ir_ip, &ir_bh);
+        if (ret)
+                goto out_unlock;
+        gfs2_inum_range_in(&range, ir_bh->b_data + sizeof(struct gfs2_dinode));
+        if (!range.ir_length) {
+                /* Nothing left here; the caller has to refill the range */
+                ret = 1;
+        } else {
+                *formal_ino = range.ir_start++;
+                range.ir_length--;
+                gfs2_trans_add_bh(ir_ip->i_gl, ir_bh, 1);
+                gfs2_inum_range_out(&range,
+                                    ir_bh->b_data + sizeof(struct gfs2_dinode));
+        }
+        brelse(ir_bh);
+out_unlock:
+        mutex_unlock(&sdp->sd_inum_mutex);
+        gfs2_trans_end(sdp);
+        return ret;
+}
+/*
+ * Take a formal inode number, refilling the range of sd_ir_inode from the
+ * counter stored in sd_inum_inode when that range is empty.
+ *
+ * The glock of sd_inum_inode is held exclusively for the whole operation.
+ * A refill hands out GFS2_INUM_QUANTUM numbers starting at the stored
+ * counter and advances the counter by the same amount.
+ *
+ * Returns: 0 with *formal_ino set, or a negative errno
+ */
+static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
+        struct gfs2_holder gh;
+        struct buffer_head *bh;
+        struct gfs2_inum_range ir;
+        int error;
+        error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (error)
+                return error;
+        error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+        if (error)
+                goto out;
+        mutex_lock(&sdp->sd_inum_mutex);
+        error = gfs2_meta_inode_buffer(ip, &bh);
+        if (error)
+                goto out_end_trans;
+        gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+        if (!ir.ir_length) {
+                struct buffer_head *m_bh;
+                u64 x, y;
+                error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+                if (error)
+                        goto out_brelse;
+                /* The counter is a big-endian u64 right after the dinode */
+                x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
+                x = y = be64_to_cpu(x);
+                ir.ir_start = x;
+                ir.ir_length = GFS2_INUM_QUANTUM;
+                x += GFS2_INUM_QUANTUM;
+                /* A smaller value after the addition means the counter wrapped */
+                if (x < y)
+                        gfs2_consist_inode(m_ip);
+                x = cpu_to_be64(x);
+                gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+                *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
+                brelse(m_bh);
+        }
+        *formal_ino = ir.ir_start++;
+        ir.ir_length--;
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+out_brelse:
+        brelse(bh);
+out_end_trans:
+        mutex_unlock(&sdp->sd_inum_mutex);
+        gfs2_trans_end(sdp);
+out:
+        gfs2_glock_dq_uninit(&gh);
+        return error;
+}
+/*
+ * Pick a formal inode number.  pick_formal_ino_1() is tried first; only a
+ * positive result from it makes us call pick_formal_ino_2().
+ *
+ * Returns: errno
+ */
+static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
+{
+        int ret = pick_formal_ino_1(sdp, inum);
+        if (ret > 0)
+                ret = pick_formal_ino_2(sdp, inum);
+        return ret;
+}
+/**
+ * create_ok - OK to create a new on-disk inode here?
+ * @dip:  Directory in which dinode is to be created
+ * @name:  Name of new dinode
+ * @mode: Mode of the new dinode; only tested for being a directory
+ *
+ * Returns: errno
+ */
+static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
+                     unsigned int mode)
+{
+        int ret = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+        if (ret)
+                return ret;
+        /*  Don't create entries in an unlinked directory  */
+        if (dip->i_di.di_nlink == 0)
+                return -EPERM;
+        /* The name must not exist yet: -ENOENT is the only good answer */
+        ret = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
+        if (ret == 0)
+                return -EEXIST;
+        if (ret != -ENOENT)
+                return ret;
+        /* Neither the entry count nor the link count may overflow */
+        if (dip->i_di.di_entries == (u32)-1)
+                return -EFBIG;
+        if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
+                return -EMLINK;
+        return 0;
+}
+/*
+ * Work out the owner, group and final mode of an inode about to be created
+ * in @dip.
+ *
+ * With the suiddir mount argument set and a setuid directory owned by a
+ * non-zero uid, the new inode takes the directory's owner: directories
+ * also get S_ISUID, other files lose the 07111 bits when the creator is
+ * not that owner.  A setgid directory passes on its group, and S_ISGID to
+ * new directories.  Otherwise the caller's fsuid/fsgid are used.
+ */
+static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
+                               unsigned int *uid, unsigned int *gid)
+{
+        const struct gfs2_dinode *pdi = &dip->i_di;
+        int suiddir = GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
+                      (pdi->di_mode & S_ISUID) && pdi->di_uid;
+        if (!suiddir) {
+                *uid = current->fsuid;
+        } else {
+                if (S_ISDIR(*mode))
+                        *mode |= S_ISUID;
+                else if (pdi->di_uid != current->fsuid)
+                        *mode &= ~07111;
+                *uid = pdi->di_uid;
+        }
+        if (pdi->di_mode & S_ISGID) {
+                if (S_ISDIR(*mode))
+                        *mode |= S_ISGID;
+                *gid = pdi->di_gid;
+        } else {
+                *gid = current->fsgid;
+        }
+}
+/*
+ * Allocate the disk block for a new dinode.  The reservation is made
+ * against the directory @dip; the block address is stored in
+ * @inum->no_addr and @generation is passed on to gfs2_alloc_di().
+ *
+ * Returns: errno
+ */
+static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
+                        u64 *generation)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        int ret;
+        gfs2_alloc_get(dip);
+        dip->i_alloc.al_requested = RES_DINODE;
+        ret = gfs2_inplace_reserve(dip);
+        if (!ret) {
+                ret = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
+                if (!ret) {
+                        inum->no_addr = gfs2_alloc_di(dip, generation);
+                        gfs2_trans_end(sdp);
+                }
+                gfs2_inplace_release(dip);
+        }
+        gfs2_alloc_put(dip);
+        return ret;
+}
+/**
+ * init_dinode - Fill in a new dinode structure
+ * @dip: the directory this inode is being created in
+ * @gl: The glock covering the new inode
+ * @inum: the inode number
+ * @mode: the file permissions
+ * @uid: owner to store in the new dinode
+ * @gid: group to store in the new dinode
+ * @generation: generation number to store in the new dinode
+ *
+ * Writes the on-disk (big-endian) dinode straight into a new metadata
+ * buffer at @inum->no_addr.  The buffer is added to the current
+ * transaction.
+ */
+static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+                        const struct gfs2_inum *inum, unsigned int mode,
+                        unsigned int uid, unsigned int gid,
+                        const u64 *generation)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_dinode *di;
+        struct buffer_head *dibh;
+        dibh = gfs2_meta_new(gl, inum->no_addr);
+        gfs2_trans_add_bh(gl, dibh, 1);
+        gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
+        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+        di = (struct gfs2_dinode *)dibh->b_data;
+        di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
+        di->di_num.no_addr = cpu_to_be64(inum->no_addr);
+        di->di_mode = cpu_to_be32(mode);
+        di->di_uid = cpu_to_be32(uid);
+        di->di_gid = cpu_to_be32(gid);
+        /* The link count starts at zero; one block (the dinode) is in use */
+        di->di_nlink = cpu_to_be32(0);
+        di->di_size = cpu_to_be64(0);
+        di->di_blocks = cpu_to_be64(1);
+        di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
+        di->di_major = di->di_minor = cpu_to_be32(0);
+        /* Both allocation goals start at the dinode's own block */
+        di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
+        di->di_generation = cpu_to_be64(*generation);
+        di->di_flags = cpu_to_be32(0);
+        /*
+         * Regular files get JDATA/DIRECTIO from the parent's inherit flags
+         * or the tunables; directories copy the inherit flags themselves.
+         */
+        if (S_ISREG(mode)) {
+                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+                    gfs2_tune_get(sdp, gt_new_files_jdata))
+                        di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
+                    gfs2_tune_get(sdp, gt_new_files_directio))
+                        di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
+        } else if (S_ISDIR(mode)) {
+                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+                                            GFS2_DIF_INHERIT_DIRECTIO);
+                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+                                            GFS2_DIF_INHERIT_JDATA);
+        }
+        di->__pad1 = 0;
+        di->di_payload_format = cpu_to_be32(0);
+        di->di_height = cpu_to_be32(0);
+        di->__pad2 = 0;
+        di->__pad3 = 0;
+        di->di_depth = cpu_to_be16(0);
+        di->di_entries = cpu_to_be32(0);
+        memset(&di->__pad4, 0, sizeof(di->__pad4));
+        di->di_eattr = cpu_to_be64(0);
+        memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+        brelse(dibh);
+}
+/*
+ * Write the initial dinode for a new inode and charge one block to the
+ * quota of its owner and group.  Owner, group and mode are first adjusted
+ * by munge_mode_uid_gid() according to the directory @dip.
+ *
+ * Returns: errno
+ */
+static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+                       unsigned int mode, const struct gfs2_inum *inum,
+                       const u64 *generation)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        unsigned int uid, gid;
+        int ret;
+        munge_mode_uid_gid(dip, &mode, &uid, &gid);
+        gfs2_alloc_get(dip);
+        ret = gfs2_quota_lock(dip, uid, gid);
+        if (ret)
+                goto out_put;
+        ret = gfs2_quota_check(dip, uid, gid);
+        if (!ret)
+                ret = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
+        if (!ret) {
+                init_dinode(dip, gl, inum, mode, uid, gid, generation);
+                gfs2_quota_change(dip, +1, uid, gid);
+                gfs2_trans_end(sdp);
+        }
+        gfs2_quota_unlock(dip);
+out_put:
+        gfs2_alloc_put(dip);
+        return ret;
+}
+/*
+ * link_dinode - add a directory entry for a freshly created inode
+ * @dip: the directory to add the entry to
+ * @name: the name of the new entry
+ * @ip: the new inode
+ *
+ * On success this returns with the transaction still open and with the
+ * quota lock and allocation context on @dip still held; none of them is
+ * released on the success path here.  On failure everything acquired in
+ * this function is released again.
+ *
+ * Returns: errno
+ */
+static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
+                       struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_alloc *al;
+        int alloc_required;
+        struct buffer_head *dibh;
+        int error;
+        al = gfs2_alloc_get(dip);
+        error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto fail;
+        error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
+        /* The quota lock is already held here, so it must be dropped too. */
+        if (alloc_required < 0)
+                goto fail_quota_locks;
+        if (alloc_required) {
+                /* Adding the entry needs new blocks: check quota, reserve. */
+                error = gfs2_quota_check(dip, dip->i_di.di_uid,
+                                         dip->i_di.di_gid);
+                if (error)
+                        goto fail_quota_locks;
+                al->al_requested = sdp->sd_max_dirres;
+                error = gfs2_inplace_reserve(dip);
+                if (error)
+                        goto fail_quota_locks;
+                error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+                                         al->al_rgd->rd_ri.ri_length +
+                                         2 * RES_DINODE +
+                                         RES_STATFS + RES_QUOTA, 0);
+                if (error)
+                        goto fail_ipreserv;
+        } else {
+                error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
+                if (error)
+                        goto fail_quota_locks;
+        }
+        error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
+        if (error)
+                goto fail_end_trans;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto fail_end_trans;
+        /* The new inode now has exactly one name; write that to disk. */
+        ip->i_di.di_nlink = 1;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        return 0;
+fail_end_trans:
+        gfs2_trans_end(sdp);
+fail_ipreserv:
+        /* al_rgd is only tested, not set, here; release if one is attached. */
+        if (dip->i_alloc.al_rgd)
+                gfs2_inplace_release(dip);
+fail_quota_locks:
+        gfs2_quota_unlock(dip);
+fail:
+        gfs2_alloc_put(dip);
+        return error;
+}
+/*
+ * gfs2_security_init - store the initial security xattr of a new inode
+ * @dip: the parent directory
+ * @ip: the new inode
+ *
+ * Asks the security layer for a name/value pair and writes it as a
+ * GFS2_EATYPE_SECURITY extended attribute.  -EOPNOTSUPP from the security
+ * layer is treated as success with nothing to store.
+ *
+ * Returns: errno
+ */
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+        struct gfs2_ea_request er;
+        char *name;
+        void *value;
+        size_t len;
+        int err;
+        err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
+                                           &name, &value, &len);
+        if (err == -EOPNOTSUPP)
+                return 0;
+        if (err)
+                return err;
+        memset(&er, 0, sizeof(er));
+        er.er_type = GFS2_EATYPE_SECURITY;
+        er.er_name = name;
+        er.er_name_len = strlen(name);
+        er.er_data = value;
+        er.er_data_len = len;
+        err = gfs2_ea_set_i(ip, &er);
+        /* Both buffers are freed here regardless of the result. */
+        kfree(value);
+        kfree(name);
+        return err;
+}
+/**
+ * gfs2_createi - Create a new inode
+ * @ghs: An array of two holders
+ * @name: The name of the new file
+ * @mode: the permissions on the new inode
+ *
+ * @ghs[0] is an initialized holder for the directory
+ * @ghs[1] is the holder for the inode lock
+ *
+ * If the return value is not an ERR_PTR, the glocks on both the directory
+ * and the new file are held.  A transaction has been started and an inplace
+ * reservation is held, as well.
+ *
+ * Returns: An inode, or an ERR_PTR-encoded errno on failure
+ */
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+                           unsigned int mode)
+{
+        struct inode *inode;
+        struct gfs2_inode *dip = ghs->gh_gl->gl_object;
+        struct inode *dir = &dip->i_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_inum inum;
+        int error;
+        u64 generation;
+        if (!name->len || name->len > GFS2_FNAMESIZE)
+                return ERR_PTR(-ENAMETOOLONG);
+        gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+        error = gfs2_glock_nq(ghs);
+        if (error)
+                goto fail;
+        error = create_ok(dip, name, mode);
+        if (error)
+                goto fail_gunlock;
+        error = pick_formal_ino(sdp, &inum.no_formal_ino);
+        if (error)
+                goto fail_gunlock;
+        error = alloc_dinode(dip, &inum, &generation);
+        if (error)
+                goto fail_gunlock;
+        if (inum.no_addr < dip->i_num.no_addr) {
+                /*
+                 * The new inode's glock sorts before the directory's, so
+                 * drop the directory lock, take the inode lock first and
+                 * then retake the directory lock.
+                 */
+                gfs2_glock_dq(ghs);
+                error = gfs2_glock_nq_num(sdp, inum.no_addr,
+                                          &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+                                          GL_SKIP, ghs + 1);
+                if (error) {
+                        /* Neither glock is held at this point. */
+                        return ERR_PTR(error);
+                }
+                gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+                error = gfs2_glock_nq(ghs);
+                if (error) {
+                        gfs2_glock_dq_uninit(ghs + 1);
+                        return ERR_PTR(error);
+                }
+                /* The directory lock was dropped above, so check again. */
+                error = create_ok(dip, name, mode);
+                if (error)
+                        goto fail_gunlock2;
+        } else {
+                error = gfs2_glock_nq_num(sdp, inum.no_addr,
+                                          &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+                                          GL_SKIP, ghs + 1);
+                if (error)
+                        goto fail_gunlock;
+        }
+        error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
+        if (error)
+                goto fail_gunlock2;
+        inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
+        if (IS_ERR(inode)) {
+                /* Without this, error is still 0 and we would return NULL. */
+                error = PTR_ERR(inode);
+                goto fail_gunlock2;
+        }
+        if (!inode) {
+                /* Reject a NULL inode before it is dereferenced below. */
+                error = -ENOMEM;
+                goto fail_gunlock2;
+        }
+        error = gfs2_inode_refresh(GFS2_I(inode));
+        if (error)
+                goto fail_iput;
+        error = gfs2_acl_create(dip, GFS2_I(inode));
+        if (error)
+                goto fail_iput;
+        error = gfs2_security_init(dip, GFS2_I(inode));
+        if (error)
+                goto fail_iput;
+        error = link_dinode(dip, name, GFS2_I(inode));
+        if (error)
+                goto fail_iput;
+        return inode;
+fail_iput:
+        iput(inode);
+fail_gunlock2:
+        gfs2_glock_dq_uninit(ghs + 1);
+fail_gunlock:
+        gfs2_glock_dq(ghs);
+fail:
+        return ERR_PTR(error);
+}
+/**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory of the directory to be removed
+ * @name: The name of the directory to be removed
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * Assumes Glocks on dip and ip are held.  The directory must contain
+ * exactly two entries; anything else is reported as an inconsistency
+ * and fails with -EIO.
+ *
+ * Returns: errno
+ */
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+                struct gfs2_inode *ip)
+{
+        struct qstr dotname;
+        int error;
+        if (ip->i_di.di_entries != 2) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(&ip->i_di);
+                return -EIO;
+        }
+        /* Unhook the directory from its parent. */
+        error = gfs2_dir_del(dip, name);
+        if (error)
+                goto out;
+        error = gfs2_change_nlink(dip, -1);
+        if (error)
+                goto out;
+        /* Remove the two remaining entries from the directory itself. */
+        gfs2_str2qstr(&dotname, ".");
+        error = gfs2_dir_del(ip, &dotname);
+        if (error)
+                goto out;
+        gfs2_str2qstr(&dotname, "..");
+        error = gfs2_dir_del(ip, &dotname);
+        if (error)
+                goto out;
+        error = gfs2_change_nlink(ip, -2);
+out:
+        return error;
+}
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Besides the parent/child check this also refuses (-EPERM) immutable or
+ * append-only inodes, append-only directories, and sticky directories
+ * where the caller owns neither directory nor inode and lacks CAP_FOWNER.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+                   struct gfs2_inode *ip)
+{
+        struct inode *dir = &dip->i_inode;
+        struct inode *victim = &ip->i_inode;
+        struct gfs2_inum inum;
+        unsigned int type;
+        int error;
+        if (IS_IMMUTABLE(victim) || IS_APPEND(victim))
+                return -EPERM;
+        if (dip->i_di.di_mode & S_ISVTX) {
+                if (dip->i_di.di_uid != current->fsuid &&
+                    ip->i_di.di_uid != current->fsuid &&
+                    !capable(CAP_FOWNER))
+                        return -EPERM;
+        }
+        if (IS_APPEND(dir))
+                return -EPERM;
+        error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+        if (error)
+                return error;
+        error = gfs2_dir_search(dir, name, &inum, &type);
+        if (error)
+                return error;
+        /* The name must still refer to this very inode. */
+        if (!gfs2_inum_equal(&inum, &ip->i_num))
+                return -ENOENT;
+        if (IF2DT(ip->i_di.di_mode) == type)
+                return 0;
+        /* Entry type and inode mode disagree. */
+        gfs2_consist_inode(dip);
+        return -EIO;
+}
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root via ".." and make sure we don't encounter
+ * @this on the way.  Assumes we already hold the rename lock.
+ *
+ * Returns: 0 if the root is reached, -EINVAL if @this is found, or the
+ *          errno of a failed ".." lookup
+ */
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+        struct inode *dir = &to->i_inode;
+        struct super_block *sb = dir->i_sb;
+        struct inode *parent;
+        struct qstr dotdot;
+        int error = -EINVAL; /* result if the walk runs into @this */
+        gfs2_str2qstr(&dotdot, "..");
+        /* Hold a reference on whichever inode the walk currently stands on. */
+        igrab(dir);
+        while (dir != &this->i_inode) {
+                if (dir == sb->s_root->d_inode) {
+                        error = 0;
+                        break;
+                }
+                parent = gfs2_lookupi(dir, &dotdot, 1, NULL);
+                if (IS_ERR(parent)) {
+                        error = PTR_ERR(parent);
+                        break;
+                }
+                iput(dir);
+                dir = parent;
+        }
+        iput(dir);
+        return error;
+}
+/**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled
+ * @len: a pointer to the length of @buf; set to the number of bytes copied
+ *
+ * If @buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller.
+ *
+ * Returns: errno
+ */
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+        struct gfs2_holder i_gh;
+        struct buffer_head *dibh;
+        unsigned int size;
+        int error;
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+        error = gfs2_glock_nq_atime(&i_gh);
+        if (error) {
+                gfs2_holder_uninit(&i_gh);
+                return error;
+        }
+        /* A zero-length symlink is reported as an inconsistency. */
+        if (ip->i_di.di_size == 0) {
+                gfs2_consist_inode(ip);
+                error = -EIO;
+                goto out_unlock;
+        }
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out_unlock;
+        /* One byte more than di_size - presumably the trailing NUL. */
+        size = ip->i_di.di_size + 1;
+        if (size > *len) {
+                char *link = kmalloc(size, GFP_KERNEL);
+                *buf = link;
+                if (link == NULL) {
+                        error = -ENOMEM;
+                        goto out_release;
+                }
+        }
+        /* The link text is copied from directly behind the dinode header. */
+        memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+        *len = size;
+out_release:
+        brelse(dibh);
+out_unlock:
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+/**
+ * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
+ *       conditionally update the inode's atime
+ * @gh: the holder to acquire
+ *
+ * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
+ * Update if the difference between the current time and the inode's current
+ * atime is at least the gt_atime_quantum tunable.
+ *
+ * The holder must have GL_ATIME set, must not have GL_ASYNC set, and must
+ * refer to an inode glock; otherwise -EINVAL is returned.  No update is
+ * attempted when SDF_NOATIME is set or the superblock is read-only.
+ *
+ * Returns: errno
+ */
+int gfs2_glock_nq_atime(struct gfs2_holder *gh)
+{
+        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_inode *ip = gl->gl_object;
+        s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
+        unsigned int state;
+        int flags;
+        int error;
+        if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
+            gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
+            gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
+                return -EINVAL;
+        /* Remember the caller's request so it can be restored later. */
+        state = gh->gh_state;
+        flags = gh->gh_flags;
+        error = gfs2_glock_nq(gh);
+        if (error)
+                return error;
+        if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
+            (sdp->sd_vfs->s_flags & MS_RDONLY))
+                return 0;
+        curtime = get_seconds();
+        if (curtime - ip->i_di.di_atime >= quantum) {
+                /* Drop the lock and retake it in exclusive state. */
+                gfs2_glock_dq(gh);
+                gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
+                                   gh);
+                error = gfs2_glock_nq(gh);
+                if (error)
+                        return error;
+                /* Verify that atime hasn't been updated while we were
+                   trying to get exclusive lock. */
+                curtime = get_seconds();
+                if (curtime - ip->i_di.di_atime >= quantum) {
+                        struct buffer_head *dibh;
+                        struct gfs2_dinode *di;
+                        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+                        /* -EROFS is not an error here: the atime update is
+                           skipped and the (exclusive) lock stays held. */
+                        if (error == -EROFS)
+                                return 0;
+                        if (error)
+                                goto fail;
+                        error = gfs2_meta_inode_buffer(ip, &dibh);
+                        if (error)
+                                goto fail_end_trans;
+                        /* Update the in-core copy, then only the atime field
+                           of the on-disk dinode. */
+                        ip->i_di.di_atime = curtime;
+                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                        di = (struct gfs2_dinode *)dibh->b_data;
+                        di->di_atime = cpu_to_be64(ip->i_di.di_atime);
+                        brelse(dibh);
+                        gfs2_trans_end(sdp);
+                }
+                /* If someone else has asked for the glock,
+                   unlock and let them have it. Then reacquire
+                   in the original state. */
+                if (gfs2_glock_is_blocking(gl)) {
+                        gfs2_glock_dq(gh);
+                        gfs2_holder_reinit(state, flags, gh);
+                        return gfs2_glock_nq(gh);
+                }
+        }
+        return 0;
+fail_end_trans:
+        gfs2_trans_end(sdp);
+fail:
+        /* On failure the glock is released again before returning. */
+        gfs2_glock_dq(gh);
+        return error;
+}
+/**
+ * glock_compare_atime - Compare two struct gfs2_glock structures for sort
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ * Both arguments point at elements of an array of holder pointers.
+ * Holders are ordered by lock number first; for equal lock numbers a
+ * shared holder sorts after an exclusive one or one with GL_ATIME set.
+ *
+ * NOTE(review): for equal lock numbers only 1 or 0 is ever returned, so
+ * swapping the arguments of a "1" case yields 0 rather than -1 - confirm
+ * that the sort in use tolerates this.
+ *
+ * Returns: 1 if A > B
+ *         -1 if A < B
+ *          0 if A == B
+ */
+static int glock_compare_atime(const void *arg_a, const void *arg_b)
+{
+        const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+        const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+        const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+        const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+        /* Primary key: the lock number. */
+        if (a->ln_number > b->ln_number)
+                return 1;
+        if (a->ln_number < b->ln_number)
+                return -1;
+        /* Same lock number: a shared request goes after the stronger one. */
+        if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+                return 1;
+        if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
+                return 1;
+        return 0;
+}
+/*
+ * glock_nq_maybe_atime - acquire one holder, with atime handling if asked
+ * @gh: the holder to acquire
+ *
+ * Clears LM_FLAG_TRY and GL_ASYNC on the holder and then acquires it via
+ * gfs2_glock_nq_atime() when GL_ATIME is set, gfs2_glock_nq() otherwise.
+ *
+ * Returns: errno
+ */
+static int glock_nq_maybe_atime(struct gfs2_holder *gh)
+{
+        gh->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+        if (gh->gh_flags & GL_ATIME)
+                return gfs2_glock_nq_atime(gh);
+        return gfs2_glock_nq(gh);
+}
+/**
+ * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
+ *      atime update
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * With more than one holder, the holders are acquired in the order given
+ * by glock_compare_atime(); if one acquisition fails, the ones already
+ * acquired are released again.
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+        struct gfs2_holder **sorted;
+        unsigned int i;
+        int error = 0;
+        if (!num_gh)
+                return 0;
+        /* A single holder needs no ordering and no scratch array. */
+        if (num_gh == 1)
+                return glock_nq_maybe_atime(ghs);
+        sorted = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+        if (!sorted)
+                return -ENOMEM;
+        for (i = 0; i < num_gh; i++)
+                sorted[i] = &ghs[i];
+        sort(sorted, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);
+        for (i = 0; i < num_gh; i++) {
+                error = glock_nq_maybe_atime(sorted[i]);
+                if (!error)
+                        continue;
+                /* Back out everything acquired before the failure. */
+                while (i--)
+                        gfs2_glock_dq(sorted[i]);
+                break;
+        }
+        kfree(sorted);
+        return error;
+}
+/*
+ * __gfs2_setattr_simple - apply attributes and write the dinode back
+ * @ip: the inode
+ * @attr: the attributes to apply
+ *
+ * Does not start a transaction itself; gfs2_setattr_simple() calls it
+ * either inside its own transaction or with current->journal_info set.
+ *
+ * Returns: errno
+ */
+static int
+__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+        struct buffer_head *dibh;
+        int error;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        error = inode_setattr(&ip->i_inode, attr);
+        /* A failure is only warned about; the dinode is written anyway. */
+        gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+        gfs2_inode_attr_out(ip);
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        return error;
+}
+/**
+ * gfs2_setattr_simple - set inode attributes, starting a transaction if needed
+ * @ip: the inode
+ * @attr: the attributes to apply
+ *
+ * Called with a reference on the vnode.  If current->journal_info is
+ * already set, no new transaction is begun; otherwise the update is
+ * wrapped in its own RES_DINODE transaction.
+ *
+ * Returns: errno
+ */
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+        struct gfs2_sbd *sdp;
+        int error;
+        if (current->journal_info)
+                return __gfs2_setattr_simple(ip, attr);
+        sdp = GFS2_SB(&ip->i_inode);
+        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (!error) {
+                error = __gfs2_setattr_simple(ip, attr);
+                gfs2_trans_end(sdp);
+        }
+        return error;
+}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __INODE_DOT_H__
+#define __INODE_DOT_H__
+/* Returns 1 if the dinode has height 0, 0 otherwise. */
+static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
+{
+        return ip->i_di.di_height == 0;
+}
+/*
+ * Returns non-zero if GFS2_DIF_JDATA is set in the dinode flags.  The
+ * result is the masked flag value itself, not normalised to 0/1.
+ */
+static inline int gfs2_is_jdata(struct gfs2_inode *ip)
+{
+        return ip->i_di.di_flags & GFS2_DIF_JDATA;
+}
+/* Returns non-zero if the dinode's mode says it is a directory. */
+static inline int gfs2_is_dir(struct gfs2_inode *ip)
+{
+        return S_ISDIR(ip->i_di.di_mode);
+}
+void gfs2_inode_attr_in(struct gfs2_inode *ip);
+void gfs2_inode_attr_out(struct gfs2_inode *ip);
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+                           int is_root, struct nameidata *nd);
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+                           unsigned int mode);
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+                struct gfs2_inode *ip);
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+                   struct gfs2_inode *ip);
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+int gfs2_glock_nq_atime(struct gfs2_holder *gh);
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+#endif /* __INODE_DOT_H__ */
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "super.h"
+#include "util.h"
+/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *          (currently not used by this function)
+ *
+ * Mounts the lock protocol named in @sdp, sanity-checks the lockstruct it
+ * filled in and derives sd_fsname from the table name.  When the lock
+ * module reports LM_LSFLAG_LOCAL and ar_ignore_local_fs is not set, local
+ * flocks and local caching are switched on.
+ *
+ * Returns: errno
+ */
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+        char *proto = sdp->sd_proto_name;
+        char *table = sdp->sd_table_name;
+        int flags = 0;
+        int error;
+        if (sdp->sd_args.ar_spectator)
+                flags |= LM_MFLAG_SPECTATOR;
+        fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+        error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+                                     gfs2_glock_cb, sdp,
+                                     GFS2_MIN_LVB_SIZE, flags,
+                                     &sdp->sd_lockstruct, &sdp->sd_kobj);
+        if (error) {
+                fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+                        proto, table, sdp->sd_args.ar_hostdata);
+                goto out;
+        }
+        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+                                  GFS2_MIN_LVB_SIZE)) {
+                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+                /* The protocol was just unmounted again, so this must not
+                   be reported as a successful mount (error is 0 here). */
+                error = -EINVAL;
+                goto out;
+        }
+        if (sdp->sd_args.ar_spectator)
+                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+        else
+                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+                         sdp->sd_lockstruct.ls_jid);
+        fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+        if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+            !sdp->sd_args.ar_ignore_local_fs) {
+                sdp->sd_args.ar_localflocks = 1;
+                sdp->sd_args.ar_localcaching = 1;
+        }
+out:
+        return error;
+}
+/* Invoke the lock module's lm_others_may_mount hook unless SDF_SHUTDOWN is set. */
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+                                sdp->sd_lockstruct.ls_lockspace);
+}
+/* Unmount the lock protocol unless SDF_SHUTDOWN is set. */
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+/*
+ * gfs2_lm_withdraw - withdraw this filesystem from the cluster
+ * @sdp: the filesystem
+ * @fmt: printk-style format of the message to log first
+ *
+ * Sets SDF_SHUTDOWN; only the first caller to set it performs the
+ * withdraw.
+ *
+ * Returns: 0 if SDF_SHUTDOWN was already set, -1 after withdrawing
+ */
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+        va_list args;
+        /* Only the caller that flips the bit goes on to withdraw. */
+        if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return 0;
+        va_start(args, fmt);
+        vprintk(fmt, args);
+        va_end(args);
+        fs_err(sdp, "about to withdraw from the cluster\n");
+        /* With the debug mount argument set, BUG instead of withdrawing. */
+        BUG_ON(sdp->sd_args.ar_debug);
+        fs_err(sdp, "waiting for outstanding I/O\n");
+        /* FIXME: suspend dm device so outstanding bio's complete
+           and all further io requests fail */
+        fs_err(sdp, "telling LM to withdraw\n");
+        gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+        fs_err(sdp, "withdrawn\n");
+        dump_stack();
+        return -1;
+}
+/* Forward to the lock module's lm_get_lock; -EIO once SDF_SHUTDOWN is set. */
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                     void **lockp)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        return sdp->sd_lockstruct.ls_ops->lm_get_lock(
+                        sdp->sd_lockstruct.ls_lockspace, name, lockp);
+}
+/* Forward to the lock module's lm_put_lock unless SDF_SHUTDOWN is set. */
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
+}
+/* Forward to the lock module's lm_lock; returns 0 once SDF_SHUTDOWN is set. */
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+                          unsigned int cur_state, unsigned int req_state,
+                          unsigned int flags)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return 0;
+        return sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+                                                  req_state, flags);
+}
+/* Forward to the lock module's lm_unlock; returns 0 once SDF_SHUTDOWN is set. */
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+                            unsigned int cur_state)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return 0;
+        return sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+}
+/* Forward to the lock module's lm_cancel unless SDF_SHUTDOWN is set. */
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
+}
+/* Forward to the lock module's lm_hold_lvb; -EIO once SDF_SHUTDOWN is set. */
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        return sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+}
+/* Forward to the lock module's lm_unhold_lvb unless SDF_SHUTDOWN is set. */
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
+}
+/* Forward to the lock module's lm_plock_get; -EIO once SDF_SHUTDOWN is set. */
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                      struct file *file, struct file_lock *fl)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        return sdp->sd_lockstruct.ls_ops->lm_plock_get(
+                        sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+}
+/* Forward to the lock module's lm_plock; -EIO once SDF_SHUTDOWN is set. */
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                  struct file *file, int cmd, struct file_lock *fl)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        return sdp->sd_lockstruct.ls_ops->lm_plock(
+                        sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+}
+/* Forward to the lock module's lm_punlock; -EIO once SDF_SHUTDOWN is set. */
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                    struct file *file, struct file_lock *fl)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        return sdp->sd_lockstruct.ls_ops->lm_punlock(
+                        sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+}
+/* Forward to the lock module's lm_recovery_done unless SDF_SHUTDOWN is set. */
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                           unsigned int message)
+{
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return;
+        sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+                sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __LM_DOT_H__
+#define __LM_DOT_H__
+struct gfs2_sbd;
+#define GFS2_MIN_LVB_SIZE 32
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+                                __attribute__ ((format(printf, 2, 3)));
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                     void **lockp);
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+                         unsigned int cur_state, unsigned int req_state,
+                         unsigned int flags);
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+                           unsigned int cur_state);
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                      struct file *file, struct file_lock *fl);
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                  struct file *file, int cmd, struct file_lock *fl);
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                    struct file *file, struct file_lock *fl);
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                           unsigned int message);
+#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/lm_interface.h>
+/* One node of lmh_list: records the ops table of a registered lock
+   protocol.  Allocated in gfs2_register_lockproto() and freed in
+   gfs2_unregister_lockproto(). */
+struct lmh_wrapper {
+        struct list_head lw_list;       /* link in lmh_list */
+        const struct lm_lockops *lw_ops; /* caller-owned protocol definition */
+};
+/* List of registered low-level locking protocols.  A file system selects one
+   of them by name at mount time, e.g. lock_nolock, lock_dlm.
+   Every walk or modification of lmh_list in this file is done with lmh_lock
+   held. */
+static LIST_HEAD(lmh_list);
+static DEFINE_MUTEX(lmh_lock);
+/**
+ * gfs2_register_lockproto - Register a low-level locking protocol
+ * @proto: the protocol definition; only the pointer is stored, so it must
+ *         stay valid until gfs2_unregister_lockproto() is called
+ *
+ * A protocol whose lm_proto_name matches an already registered one is
+ * rejected.
+ *
+ * Returns: 0 on success, -EEXIST for a duplicate name, -ENOMEM if the list
+ *          entry cannot be allocated
+ */
+int gfs2_register_lockproto(const struct lm_lockops *proto)
+{
+        struct lmh_wrapper *entry;
+        int error = 0;
+        mutex_lock(&lmh_lock);
+        list_for_each_entry(entry, &lmh_list, lw_list) {
+                if (strcmp(entry->lw_ops->lm_proto_name,
+                           proto->lm_proto_name) == 0) {
+                        error = -EEXIST;
+                        goto out;
+                }
+        }
+        entry = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
+        if (entry == NULL) {
+                error = -ENOMEM;
+                goto out;
+        }
+        entry->lw_ops = proto;
+        list_add(&entry->lw_list, &lmh_list);
+out:
+        mutex_unlock(&lmh_lock);
+        /* report the duplicate only after the lock has been dropped */
+        if (error == -EEXIST)
+                printk(KERN_INFO "GFS2: protocol %s already exists\n",
+                       proto->lm_proto_name);
+        return error;
+}
+/**
+ * gfs2_unregister_lockproto - Unregister a low-level locking protocol
+ * @proto: the protocol definition
+ *
+ * The entry is matched by lm_proto_name, not by pointer.  A warning is
+ * logged if no registered protocol carries that name.
+ */
+void gfs2_unregister_lockproto(const struct lm_lockops *proto)
+{
+        struct lmh_wrapper *pos;
+        struct lmh_wrapper *victim = NULL;
+        mutex_lock(&lmh_lock);
+        list_for_each_entry(pos, &lmh_list, lw_list) {
+                if (strcmp(pos->lw_ops->lm_proto_name,
+                           proto->lm_proto_name) == 0) {
+                        victim = pos;
+                        list_del(&victim->lw_list);
+                        break;
+                }
+        }
+        mutex_unlock(&lmh_lock);
+        if (victim == NULL) {
+                printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
+                       proto->lm_proto_name);
+                return;
+        }
+        /* already unlinked, so it can be freed outside the lock */
+        kfree(victim);
+}
+/**
+ * gfs2_mount_lockproto - Mount a lock protocol
+ * @proto_name: the name of the protocol
+ * @table_name: the name of the lock space
+ * @host_data: data specific to this host
+ * @cb: the callback to the code using the lock module
+ * @cb_data: opaque pointer passed to the protocol's lm_mount() along with @cb
+ * @min_lvb_size: the minimum LVB size that the caller can deal with
+ * @flags: LM_MFLAG_*
+ * @lockstruct: a structure returned describing the mount
+ * @fskobj: kobject passed through to the protocol's lm_mount()
+ *
+ * If no protocol called @proto_name is registered, one attempt is made to
+ * load a module of that name (only when the caller has CAP_SYS_MODULE)
+ * before giving up with -ENOENT.  On success a reference on the protocol's
+ * owning module is kept; it is dropped again by gfs2_unmount_lockproto() or
+ * gfs2_withdraw_lockproto().
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
+                         lm_callback_t cb, void *cb_data,
+                         unsigned int min_lvb_size, int flags,
+                         struct lm_lockstruct *lockstruct,
+                         struct kobject *fskobj)
+{
+        struct lmh_wrapper *lw = NULL;
+        int try = 0;
+        int error, found;
+retry:
+        mutex_lock(&lmh_lock);
+        found = 0;
+        list_for_each_entry(lw, &lmh_list, lw_list) {
+                if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found) {
+                if (!try && capable(CAP_SYS_MODULE)) {
+                        try = 1;
+                        mutex_unlock(&lmh_lock);
+                        /* request_module() takes a printf-style format;
+                           never hand it the caller-supplied name directly */
+                        request_module("%s", proto_name);
+                        goto retry;
+                }
+                printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
+                error = -ENOENT;
+                goto out;
+        }
+        if (!try_module_get(lw->lw_ops->lm_owner)) {
+                /* The reference could not be taken: drop the lock, wait a
+                   second and start over, allowing another module load
+                   attempt. */
+                try = 0;
+                mutex_unlock(&lmh_lock);
+                msleep(1000);
+                goto retry;
+        }
+        error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
+                                     min_lvb_size, flags, lockstruct, fskobj);
+        if (error)
+                module_put(lw->lw_ops->lm_owner);
+out:
+        mutex_unlock(&lmh_lock);
+        return error;
+}
+/**
+ * gfs2_unmount_lockproto - unmount a lock module
+ * @lockstruct: the lockstruct passed into mount
+ *
+ * Calls the protocol's lm_unmount() and then drops the module reference
+ * taken by gfs2_mount_lockproto(), all under lmh_lock.
+ */
+void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
+{
+        struct module *owner;
+        mutex_lock(&lmh_lock);
+        lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+        owner = lockstruct->ls_ops->lm_owner;
+        if (owner)
+                module_put(owner);
+        mutex_unlock(&lmh_lock);
+}
+/**
+ * gfs2_withdraw_lockproto - abnormally unmount a lock module
+ * @lockstruct: the lockstruct passed into mount
+ *
+ * Same sequence as a normal unmount, except that the protocol's
+ * lm_withdraw() hook is invoked instead of lm_unmount().
+ */
+void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
+{
+        struct module *owner;
+        mutex_lock(&lmh_lock);
+        lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
+        owner = lockstruct->ls_ops->lm_owner;
+        if (owner)
+                module_put(owner);
+        mutex_unlock(&lmh_lock);
+}
+/* Only registration and unregistration are exported (GPL-only) from this
+   file; the mount/unmount/withdraw functions above are not. */
+EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
+EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
+lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include "lock_dlm.h"
+/* Shared, statically allocated value block attached to the NL locks that
+   hold_null_lock() creates; gdlm_release_all_locks() takes care never to
+   kfree() it. */
+static char junk_lvb[GDLM_LVB_SIZE];
+/* Mark a request as no longer active, append it to the lockspace's
+   "complete" list and wake whoever sleeps on thread_wait. */
+static void queue_complete(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *lockspace = lp->ls;
+        clear_bit(LFL_ACTIVE, &lp->flags);
+        spin_lock(&lockspace->async_lock);
+        list_add_tail(&lp->clist, &lockspace->complete);
+        spin_unlock(&lockspace->async_lock);
+        wake_up(&lockspace->thread_wait);
+}
+/* Completion callback handed to dlm_lock(); @astarg is the gdlm_lock that
+   was passed as the callback argument. */
+static inline void gdlm_ast(void *astarg)
+{
+        struct gdlm_lock *lp = astarg;
+        queue_complete(lp);
+}
+/* Blocking callback handed to dlm_lock().  Records the numerically highest
+   mode seen so far in lp->bast_mode and puts the lock on the lockspace's
+   "blocking" list the first time a non-zero mode arrives. */
+static inline void gdlm_bast(void *astarg, int mode)
+{
+        struct gdlm_lock *lp = astarg;
+        struct gdlm_ls *lockspace = lp->ls;
+        if (mode == 0) {
+                printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+                        lp->lockname.ln_type,
+                        (unsigned long long)lp->lockname.ln_number);
+                return;
+        }
+        spin_lock(&lockspace->async_lock);
+        if (lp->bast_mode == 0) {
+                /* bast_mode == 0 means "not queued on the blocking list" */
+                list_add_tail(&lp->blist, &lockspace->blocking);
+                lp->bast_mode = mode;
+        } else if (mode > lp->bast_mode) {
+                lp->bast_mode = mode;
+        }
+        spin_unlock(&lockspace->async_lock);
+        wake_up(&lockspace->thread_wait);
+}
+/* Park a request on the lockspace's "delayed" list; gdlm_submit_delayed()
+   later moves such entries on to the "submit" list. */
+void gdlm_queue_delayed(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *lockspace = lp->ls;
+        spin_lock(&lockspace->async_lock);
+        list_add_tail(&lp->delay_list, &lockspace->delayed);
+        spin_unlock(&lockspace->async_lock);
+}
+/* convert gfs lock-state to dlm lock-mode; any state other than the four
+   LM_ST_* values handled below trips a fatal gdlm_assert() */
+static s16 make_mode(s16 lmstate)
+{
+        s16 mode = -1;
+        switch (lmstate) {
+        case LM_ST_UNLOCKED:
+                mode = DLM_LOCK_NL;
+                break;
+        case LM_ST_EXCLUSIVE:
+                mode = DLM_LOCK_EX;
+                break;
+        case LM_ST_DEFERRED:
+                mode = DLM_LOCK_CW;
+                break;
+        case LM_ST_SHARED:
+                mode = DLM_LOCK_PR;
+                break;
+        default:
+                gdlm_assert(0, "unknown LM state %d", lmstate);
+                break;
+        }
+        return mode;
+}
+/* convert dlm lock-mode to gfs lock-state; DLM_LOCK_IV and DLM_LOCK_NL both
+   map to LM_ST_UNLOCKED, an unhandled mode trips a fatal gdlm_assert() */
+s16 gdlm_make_lmstate(s16 dlmmode)
+{
+        s16 state = -1;
+        switch (dlmmode) {
+        case DLM_LOCK_IV:
+        case DLM_LOCK_NL:
+                state = LM_ST_UNLOCKED;
+                break;
+        case DLM_LOCK_EX:
+                state = LM_ST_EXCLUSIVE;
+                break;
+        case DLM_LOCK_CW:
+                state = LM_ST_DEFERRED;
+                break;
+        case DLM_LOCK_PR:
+                state = LM_ST_SHARED;
+                break;
+        default:
+                gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+                break;
+        }
+        return state;
+}
+/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
+   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS.  The conversion is
+   done unconditionally so that an unknown cur_state is still caught by
+   make_mode(), even when the lock has no dlm mode yet. */
+static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
+{
+        s16 cur = make_mode(cur_state);
+        if (lp->cur == DLM_LOCK_IV)
+                return;
+        gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
+}
+/* Translate GFS LM_FLAG_* request flags, together with the state of @lp,
+   into the DLM_LKF_* flag word for dlm_lock().  @cur and @req are dlm
+   modes. */
+static inline unsigned int make_flags(struct gdlm_lock *lp,
+                                      unsigned int gfs_flags,
+                                      s16 cur, s16 req)
+{
+        unsigned int lkf = 0;
+        /* both "try" variants are non-queueing requests */
+        if (gfs_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+                lkf |= DLM_LKF_NOQUEUE;
+        if (gfs_flags & LM_FLAG_TRY_1CB)
+                lkf |= DLM_LKF_NOQUEUEBAST;
+        if (gfs_flags & LM_FLAG_PRIORITY)
+                lkf |= DLM_LKF_NOORDER | DLM_LKF_HEADQUE;
+        if (gfs_flags & LM_FLAG_ANY) {
+                /* accept the "other" shared mode as an alternative */
+                switch (req) {
+                case DLM_LOCK_PR:
+                        lkf |= DLM_LKF_ALTCW;
+                        break;
+                case DLM_LOCK_CW:
+                        lkf |= DLM_LKF_ALTPR;
+                        break;
+                }
+        }
+        /* a non-zero lock id means the lock already exists: convert it */
+        if (lp->lksb.sb_lkid != 0) {
+                lkf |= DLM_LKF_CONVERT;
+                /* Conversion deadlock avoidance by DLM */
+                if (!(lkf & DLM_LKF_NOQUEUE) &&
+                    !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+                    cur != req && cur > DLM_LOCK_NL && req > DLM_LOCK_NL)
+                        lkf |= DLM_LKF_CONVDEADLK;
+        }
+        if (lp->lvb)
+                lkf |= DLM_LKF_VALBLK;
+        return lkf;
+}
+/* make_strname - convert GFS lock numbers to a string
+ *
+ * The name is exactly GDLM_STRNAME_BYTES (8 + 16) hex characters and is
+ * stored WITHOUT a terminating NUL, since str->name has no room for one.
+ * Formatting therefore goes through a local buffer that is one byte larger;
+ * printing straight into str->name would write the NUL past the end of the
+ * array.
+ */
+static inline void make_strname(struct lm_lockname *lockname,
+                                struct gdlm_strname *str)
+{
+        char buf[GDLM_STRNAME_BYTES + 1];
+        snprintf(buf, sizeof(buf), "%8x%16llx", lockname->ln_type,
+                 (unsigned long long)lockname->ln_number);
+        memcpy(str->name, buf, GDLM_STRNAME_BYTES);
+        str->namelen = GDLM_STRNAME_BYTES;
+}
+/* Allocate and initialise a gdlm_lock for @name and link it into the
+   lockspace's all_locks list.  On success the new lock is stored in *lpp
+   and 0 is returned; on allocation failure -ENOMEM is returned and *lpp is
+   left untouched. */
+static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
+                          struct gdlm_lock **lpp)
+{
+        struct gdlm_lock *new_lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+        if (new_lp == NULL)
+                return -ENOMEM;
+        new_lp->ls = ls;
+        new_lp->lockname = *name;
+        new_lp->cur = DLM_LOCK_IV;      /* no dlm mode held yet */
+        new_lp->lvb = NULL;
+        new_lp->hold_null = NULL;
+        init_completion(&new_lp->ast_wait);
+        INIT_LIST_HEAD(&new_lp->clist);
+        INIT_LIST_HEAD(&new_lp->blist);
+        INIT_LIST_HEAD(&new_lp->delay_list);
+        spin_lock(&ls->async_lock);
+        list_add(&new_lp->all_list, &ls->all_locks);
+        ls->all_locks_count++;
+        spin_unlock(&ls->async_lock);
+        *lpp = new_lp;
+        return 0;
+}
+/* Unlink @lp from every per-lockspace list it may be on and free it.  The
+   lock must still be on all_locks (asserted).  lp->lvb is not freed here. */
+void gdlm_delete_lp(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *lockspace = lp->ls;
+        spin_lock(&lockspace->async_lock);
+        if (!list_empty(&lp->delay_list))
+                list_del_init(&lp->delay_list);
+        if (!list_empty(&lp->blist))
+                list_del_init(&lp->blist);
+        if (!list_empty(&lp->clist))
+                list_del_init(&lp->clist);
+        gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
+                    (unsigned long long)lp->lockname.ln_number);
+        list_del_init(&lp->all_list);
+        lockspace->all_locks_count--;
+        spin_unlock(&lockspace->async_lock);
+        kfree(lp);
+}
+/* Create the lock_dlm object for @name in @lockspace (a struct gdlm_ls).
+   Returns 0 and stores the new lock in *lockp, or returns -ENOMEM and
+   stores NULL. */
+int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
+                  void **lockp)
+{
+        /* gdlm_create_lp() leaves lp untouched on failure, so start from
+           NULL rather than handing an indeterminate pointer to the caller */
+        struct gdlm_lock *lp = NULL;
+        int error;
+        error = gdlm_create_lp(lockspace, name, &lp);
+        *lockp = lp;
+        return error;
+}
+/* Counterpart of gdlm_get_lock(): @lock is the struct gdlm_lock to unlink
+   and free. */
+void gdlm_put_lock(void *lock)
+{
+        struct gdlm_lock *lp = lock;
+        gdlm_delete_lp(lp);
+}
+/* Submit the request described by lp->req / lp->lkf to the dlm.
+ *
+ * Returns LM_OUT_ASYNC when the request was either handed to the dlm,
+ * parked on the delayed list, or failed immediately with -EAGAIN under
+ * DLM_LKF_NOQUEUE (in which case the failure is reported through the
+ * normal completion queue).  Returns LM_OUT_ERROR for any other dlm_lock()
+ * error; in that case nothing has been queued for @lp.
+ */
+unsigned int gdlm_do_lock(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        struct gdlm_strname str;
+        int error, bast = 1;
+        /*
+         * When recovery is in progress, delay lock requests for submission
+         * once recovery is done.  Requests for recovery (NOEXP) and unlocks
+         * can pass.
+         */
+        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+            !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
+                gdlm_queue_delayed(lp);
+                return LM_OUT_ASYNC;
+        }
+        /*
+         * Submit the actual lock request.
+         */
+        if (test_bit(LFL_NOBAST, &lp->flags))
+                bast = 0;
+        make_strname(&lp->lockname, &str);
+        /* cleared again by queue_complete() */
+        set_bit(LFL_ACTIVE, &lp->flags);
+        log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
+                  (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
+                  lp->cur, lp->req, lp->lkf);
+        error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
+                         str.name, str.namelen, 0, gdlm_ast, lp,
+                         bast ? gdlm_bast : NULL);
+        /* A "try" request that cannot be granted is not an error: record the
+           status and complete it ourselves, as gdlm_ast() would have. */
+        if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
+                lp->lksb.sb_status = -EAGAIN;
+                queue_complete(lp);
+                error = 0;
+        }
+        if (error) {
+                log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+                          "flags=%lx", ls->fsname, lp->lockname.ln_type,
+                          (unsigned long long)lp->lockname.ln_number, error,
+                          lp->cur, lp->req, lp->lkf, lp->flags);
+                return LM_OUT_ERROR;
+        }
+        return LM_OUT_ASYNC;
+}
+/* Ask the dlm to release @lp.  Returns LM_OUT_ASYNC once the request has
+   been accepted by dlm_unlock(), LM_OUT_ERROR if dlm_unlock() fails. */
+static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        unsigned int lkf;
+        int error;
+        set_bit(LFL_DLM_UNLOCK, &lp->flags);
+        set_bit(LFL_ACTIVE, &lp->flags);
+        /* pass the value block along only when one is attached */
+        lkf = lp->lvb ? DLM_LKF_VALBLK : 0;
+        log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
+                  (unsigned long long)lp->lockname.ln_number,
+                  lp->lksb.sb_lkid, lp->cur, lkf);
+        error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
+        if (!error)
+                return LM_OUT_ASYNC;
+        log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+                  "flags=%lx", ls->fsname, lp->lockname.ln_type,
+                  (unsigned long long)lp->lockname.ln_number, error,
+                  lp->cur, lp->req, lp->lkf, lp->flags);
+        return LM_OUT_ERROR;
+}
+/* Request @req_state on @lock (a struct gdlm_lock).  @cur_state is what GFS
+   believes it currently holds and is cross-checked against lock_dlm's own
+   record.  The return value is that of gdlm_do_lock(). */
+unsigned int gdlm_lock(void *lock, unsigned int cur_state,
+                       unsigned int req_state, unsigned int flags)
+{
+        struct gdlm_lock *lp = lock;
+        s16 wanted;
+        clear_bit(LFL_DLM_CANCEL, &lp->flags);
+        /* NOEXP requests must not be delayed while locks are blocked */
+        if (flags & LM_FLAG_NOEXP)
+                set_bit(LFL_NOBLOCK, &lp->flags);
+        check_cur_state(lp, cur_state);
+        wanted = make_mode(req_state);
+        lp->req = wanted;
+        lp->lkf = make_flags(lp, flags, lp->cur, wanted);
+        return gdlm_do_lock(lp);
+}
+/* Release @lock (a struct gdlm_lock).  A lock that never obtained a dlm
+   mode (cur == DLM_LOCK_IV) has nothing to release and 0 is returned
+   without contacting the dlm.  @cur_state is not used. */
+unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
+{
+        struct gdlm_lock *lp = lock;
+        clear_bit(LFL_DLM_CANCEL, &lp->flags);
+        if (lp->cur != DLM_LOCK_IV)
+                return gdlm_do_unlock(lp);
+        return 0;
+}
+/* Try to cancel an outstanding request on @lock (a struct gdlm_lock).
+ *
+ * Three cases are distinguished:
+ *  - the request is still parked on the delayed list: it is removed and
+ *    completed locally with LFL_CANCEL set;
+ *  - the request is not active, or an unlock is in flight: nothing to do;
+ *  - otherwise the request is assumed to be blocked inside the dlm and a
+ *    DLM_LKF_CANCEL unlock is issued.
+ */
+void gdlm_cancel(void *lock)
+{
+        struct gdlm_lock *lp = lock;
+        struct gdlm_ls *ls = lp->ls;
+        int error, delay_list = 0;
+        /* a cancel is already outstanding for this request */
+        if (test_bit(LFL_DLM_CANCEL, &lp->flags))
+                return;
+        log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
+                 (unsigned long long)lp->lockname.ln_number, lp->flags);
+        spin_lock(&ls->async_lock);
+        if (!list_empty(&lp->delay_list)) {
+                list_del_init(&lp->delay_list);
+                delay_list = 1;
+        }
+        spin_unlock(&ls->async_lock);
+        if (delay_list) {
+                /* never reached the dlm: complete it ourselves, flagged as
+                   cancelled (queue_complete() clears LFL_ACTIVE again) */
+                set_bit(LFL_CANCEL, &lp->flags);
+                set_bit(LFL_ACTIVE, &lp->flags);
+                queue_complete(lp);
+                return;
+        }
+        if (!test_bit(LFL_ACTIVE, &lp->flags) ||
+            test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+                log_info("gdlm_cancel skip %x,%llx flags %lx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number, lp->flags);
+                return;
+        }
+        /* the lock is blocked in the dlm */
+        set_bit(LFL_DLM_CANCEL, &lp->flags);
+        set_bit(LFL_ACTIVE, &lp->flags);
+        error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
+                           NULL, lp);
+        log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
+                 lp->lockname.ln_type,
+                 (unsigned long long)lp->lockname.ln_number, lp->flags);
+        /* -EBUSY: the cancel was not accepted, so allow a later retry */
+        if (error == -EBUSY)
+                clear_bit(LFL_DLM_CANCEL, &lp->flags);
+}
+/* Allocate a zeroed GDLM_LVB_SIZE value block and attach it to @lp, both as
+   lp->lvb and as the lksb's lvb pointer.  Returns 0 or -ENOMEM. */
+static int gdlm_add_lvb(struct gdlm_lock *lp)
+{
+        char *buf = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+        if (buf == NULL)
+                return -ENOMEM;
+        lp->lvb = buf;
+        lp->lksb.sb_lvbptr = buf;
+        return 0;
+}
+/* Free the value block attached to @lp and clear both pointers to it. */
+static void gdlm_del_lvb(struct gdlm_lock *lp)
+{
+        char *buf = lp->lvb;
+        lp->lksb.sb_lvbptr = NULL;
+        lp->lvb = NULL;
+        kfree(buf);
+}
+/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
+   the completion) because gfs won't call hold_lvb() during a callback (from
+   the context of a lock_dlm thread).
+
+   Creates a second gdlm_lock on the same resource, takes it in NL mode with
+   the shared junk_lvb attached, and records it in lp->hold_null.  Returns 0
+   on success (or if an NL lock is already held), otherwise a negative error
+   with lp->hold_null left NULL. */
+static int hold_null_lock(struct gdlm_lock *lp)
+{
+        struct gdlm_lock *lpn = NULL;
+        int error;
+        if (lp->hold_null) {
+                printk(KERN_INFO "lock_dlm: lvb already held\n");
+                return 0;
+        }
+        error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
+        if (error)
+                goto out;
+        lpn->lksb.sb_lvbptr = junk_lvb;
+        lpn->lvb = junk_lvb;
+        lpn->req = DLM_LOCK_NL;
+        lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
+        set_bit(LFL_NOBAST, &lpn->flags);
+        set_bit(LFL_INLOCK, &lpn->flags);
+        init_completion(&lpn->ast_wait);
+        if (gdlm_do_lock(lpn) == LM_OUT_ERROR) {
+                /* dlm_lock() rejected the request, so nothing was queued and
+                   no completion will ever be signalled: waiting here would
+                   block forever.  The dlm error code is not propagated by
+                   gdlm_do_lock(), hence the generic -EIO. */
+                error = -EIO;
+        } else {
+                wait_for_completion(&lpn->ast_wait);
+                error = lpn->lksb.sb_status;
+        }
+        if (error) {
+                printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
+                       error);
+                gdlm_delete_lp(lpn);
+                lpn = NULL;
+        }
+out:
+        lp->hold_null = lpn;
+        return error;
+}
+/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
+   the completion) because gfs may call unhold_lvb() during a callback (from
+   the context of a lock_dlm thread) which could cause a deadlock since the
+   other lock_dlm thread could be engaged in recovery.
+
+   Releases the NL lock recorded in lp->hold_null (which must exist) without
+   waiting for the unlock to finish. */
+static void unhold_null_lock(struct gdlm_lock *lp)
+{
+        struct gdlm_lock *lpn = lp->hold_null;
+        gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
+                    (unsigned long long)lp->lockname.ln_number);
+        /* detach junk_lvb first so that gdlm_do_unlock() does not pass
+           DLM_LKF_VALBLK for this lock */
+        lpn->lksb.sb_lvbptr = NULL;
+        lpn->lvb = NULL;
+        /* presumably tells the completion path to free lpn once the unlock
+           is done - handled outside this file, TODO confirm */
+        set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
+        gdlm_do_unlock(lpn);
+        lp->hold_null = NULL;
+}
+/* Acquire a NL lock because gfs requires the value block to remain
+   intact on the resource while the lvb is "held" even if it's holding no locks
+   on the resource.
+
+   On success 0 is returned and *lvbp points at the newly attached value
+   block.  On failure a negative error is returned and *lvbp is left
+   untouched, so the caller never sees a pointer to the buffer that has
+   already been freed again. */
+int gdlm_hold_lvb(void *lock, char **lvbp)
+{
+        struct gdlm_lock *lp = lock;
+        int error;
+        error = gdlm_add_lvb(lp);
+        if (error)
+                return error;
+        error = hold_null_lock(lp);
+        if (error) {
+                gdlm_del_lvb(lp);
+                return error;
+        }
+        *lvbp = lp->lvb;
+        return 0;
+}
+/* Undo gdlm_hold_lvb(): drop the NL lock, then free the value block attached
+   to @lock.  The @lvb argument is not used; lp->lvb is what gets freed. */
+void gdlm_unhold_lvb(void *lock, char *lvb)
+{
+        struct gdlm_lock *held = lock;
+        unhold_null_lock(held);
+        gdlm_del_lvb(held);
+}
+/* Move every request from the "delayed" list to the tail of the "submit"
+   list, preserving order, and wake whoever sleeps on thread_wait. */
+void gdlm_submit_delayed(struct gdlm_ls *ls)
+{
+        struct gdlm_lock *cur, *next;
+        spin_lock(&ls->async_lock);
+        list_for_each_entry_safe(cur, next, &ls->delayed, delay_list)
+                list_move_tail(&cur->delay_list, &ls->submit);
+        spin_unlock(&ls->async_lock);
+        wake_up(&ls->thread_wait);
+}
+/* Free every gdlm_lock still on the lockspace's all_locks list, together
+   with its value block unless that is the shared static junk_lvb.  Returns
+   the number of locks freed.  all_locks_count is not adjusted. */
+int gdlm_release_all_locks(struct gdlm_ls *ls)
+{
+        struct gdlm_lock *lp;
+        int freed = 0;
+        spin_lock(&ls->async_lock);
+        while (!list_empty(&ls->all_locks)) {
+                lp = list_entry(ls->all_locks.next, struct gdlm_lock, all_list);
+                list_del_init(&lp->all_list);
+                /* kfree(NULL) is a no-op, so only junk_lvb needs excluding */
+                if (lp->lvb != junk_lvb)
+                        kfree(lp->lvb);
+                kfree(lp);
+                freed++;
+        }
+        spin_unlock(&ls->async_lock);
+        return freed;
+}
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef LOCK_DLM_DOT_H
+#define LOCK_DLM_DOT_H
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/fcntl.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+#include <linux/dlm.h>
+#include <linux/lm_interface.h>
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward.  Externally, GFS refers to this module
+ * as "lock_dlm".
+ */
+/* length of a dlm resource name: 8 + 16 hex digits, no terminating NUL */
+#define GDLM_STRNAME_BYTES      24
+/* size in bytes of the value block allocated per lock */
+#define GDLM_LVB_SIZE           32
+/* module-load defaults for gdlm_drop_count / gdlm_drop_period
+   (units of the period are not visible here - presumably seconds) */
+#define GDLM_DROP_COUNT         50000
+#define GDLM_DROP_PERIOD        60
+/* buffer size of the clustername and fsname fields of struct gdlm_ls */
+#define GDLM_NAME_LEN           128
+/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
+   We sprintf these numbers into a 24 byte string of hex values to make them
+   human-readable (to make debugging simpler.)
+   NOTE(review): name[] holds exactly the 24 characters and has no room for a
+   terminating NUL; anything that formats into it must account for that. */
+struct gdlm_strname {
+        unsigned char           name[GDLM_STRNAME_BYTES];
+        unsigned short          namelen;        /* bytes of name[] in use */
+};
+/* Bit numbers for gdlm_ls.flags (used with test_bit() and friends). */
+enum {
+        DFL_BLOCK_LOCKS         = 0,    /* new lock requests are delayed */
+        DFL_SPECTATOR           = 1,
+        DFL_WITHDRAW            = 2,
+};
+/* Per-mount lockspace state of lock_dlm; allocated by init_gdlm(). */
+struct gdlm_ls {
+        u32             id;
+        int                     jid;            /* starts out as -1 */
+        int                     first;
+        int                     first_done;
+        unsigned long           flags;          /* DFL_ bits */
+        struct kobject          kobj;
+        char                    clustername[GDLM_NAME_LEN]; /* table name before ':' */
+        char                    fsname[GDLM_NAME_LEN];      /* table name after ':' */
+        int                     fsflags;        /* mount flags given to init_gdlm() */
+        dlm_lockspace_t         *dlm_lockspace; /* handle for dlm_lock/dlm_unlock */
+        lm_callback_t           fscb;           /* callback into the file system */
+        struct gfs2_sbd         *sdp;
+        int                     recover_jid;
+        int                     recover_jid_done;
+        int                     recover_jid_status;
+        spinlock_t              async_lock;     /* protects the five lists below */
+        struct list_head        complete;       /* filled by queue_complete() */
+        struct list_head        blocking;       /* locks with a pending bast */
+        struct list_head        delayed;        /* held back while DFL_BLOCK_LOCKS */
+        struct list_head        submit;         /* filled by gdlm_submit_delayed() */
+        struct list_head        all_locks;      /* every gdlm_lock of this fs */
+        u32             all_locks_count;
+        wait_queue_head_t       wait_control;
+        struct task_struct      *thread1;
+        struct task_struct      *thread2;
+        wait_queue_head_t       thread_wait;    /* woken when work is queued */
+        unsigned long           drop_time;      /* initialised to jiffies */
+        int                     drop_locks_count;  /* copy of gdlm_drop_count */
+        int                     drop_locks_period; /* copy of gdlm_drop_period */
+};
+/* Bit numbers for gdlm_lock.flags (used with test_bit() and friends).
+   Only the bits whose use is visible in lock.c are annotated. */
+enum {
+        LFL_NOBLOCK             = 0,    /* LM_FLAG_NOEXP: not delayed by DFL_BLOCK_LOCKS */
+        LFL_NOCACHE             = 1,
+        LFL_DLM_UNLOCK          = 2,    /* set when dlm_unlock() is issued */
+        LFL_DLM_CANCEL          = 3,    /* a dlm cancel has been issued */
+        LFL_SYNC_LVB            = 4,
+        LFL_FORCE_PROMOTE       = 5,    /* suppresses DLM_LKF_CONVDEADLK */
+        LFL_REREQUEST           = 6,
+        LFL_ACTIVE              = 7,    /* request in flight; cleared on completion */
+        LFL_INLOCK              = 8,    /* internal NL lock made by hold_null_lock() */
+        LFL_CANCEL              = 9,    /* cancelled while on the delayed list */
+        LFL_NOBAST              = 10,   /* request is made without a bast callback */
+        LFL_HEADQUE             = 11,
+        LFL_UNLOCK_DELETE       = 12,   /* set before unlocking a hold_null lock */
+};
+/* lock_dlm's per-lock object; created by gdlm_create_lp() and handed to GFS
+   as an opaque pointer. */
+struct gdlm_lock {
+        struct gdlm_ls          *ls;            /* owning lockspace */
+        struct lm_lockname      lockname;       /* GFS type + number */
+        char                    *lvb;           /* value block, NULL if none */
+        struct dlm_lksb         lksb;           /* dlm status block */
+        s16                     cur;            /* dlm mode held, DLM_LOCK_IV if none */
+        s16                     req;            /* dlm mode being requested */
+        s16                     prev_req;
+        u32                     lkf;            /* dlm flags DLM_LKF_ */
+        unsigned long           flags;          /* lock_dlm flags LFL_ */
+        int                     bast_mode;      /* protected by async_lock */
+        struct completion       ast_wait;       /* waited on by hold_null_lock() */
+        struct list_head        clist;          /* complete */
+        struct list_head        blist;          /* blocking */
+        struct list_head        delay_list;     /* delayed */
+        struct list_head        all_list;       /* all locks for the fs */
+        struct gdlm_lock        *hold_null;     /* NL lock for hold_lvb */
+};
+/* Fatal assertion: if @assertion is false, log the stringified expression
+   and a printf-style message (@fmt, @args) at KERN_EMERG, then BUG().
+   Because the expression text is printed, renaming variables used in an
+   assertion changes the log output. */
+#define gdlm_assert(assertion, fmt, args...)                                  \
+do {                                                                          \
+        if (unlikely(!(assertion))) {                                         \
+                printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
+                                  "lock_dlm:  " fmt "\n",                     \
+                                  #assertion, ##args);                        \
+                BUG();                                                        \
+        }                                                                     \
+} while (0)
+/* Logging helpers: each prefixes the message with "lock_dlm: " and appends
+   a newline.  log_debug() expands to nothing (its arguments are not
+   evaluated) unless LOCK_DLM_LOG_DEBUG is defined. */
+#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
+#define log_info(fmt, arg...)  log_print(KERN_INFO , fmt , ## arg)
+#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
+#ifdef LOCK_DLM_LOG_DEBUG
+#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
+#else
+#define log_debug(fmt, arg...)
+#endif
+/* Functions shared between the source files of the lock_dlm module,
+   grouped by the file that defines them. */
+/* sysfs.c */
+int gdlm_sysfs_init(void);
+void gdlm_sysfs_exit(void);
+int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
+void gdlm_kobject_release(struct gdlm_ls *);
+/* thread.c */
+int gdlm_init_threads(struct gdlm_ls *);
+void gdlm_release_threads(struct gdlm_ls *);
+/* lock.c */
+s16 gdlm_make_lmstate(s16);
+void gdlm_queue_delayed(struct gdlm_lock *);
+void gdlm_submit_delayed(struct gdlm_ls *);
+int gdlm_release_all_locks(struct gdlm_ls *);
+void gdlm_delete_lp(struct gdlm_lock *);
+unsigned int gdlm_do_lock(struct gdlm_lock *);
+/* the void * "lock" arguments below are struct gdlm_lock pointers */
+int gdlm_get_lock(void *, struct lm_lockname *, void **);
+void gdlm_put_lock(void *);
+unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
+unsigned int gdlm_unlock(void *, unsigned int);
+void gdlm_cancel(void *);
+int gdlm_hold_lvb(void *, char **);
+void gdlm_unhold_lvb(void *, char *);
+/* plock.c */
+int gdlm_plock_init(void);
+void gdlm_plock_exit(void);
+int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
+                struct file_lock *);
+int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
+                struct file_lock *);
+int gdlm_punlock(void *, struct lm_lockname *, struct file *,
+                struct file_lock *);
+#endif
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/init.h>
+#include "lock_dlm.h"
+/* Defined in mount.c.  gdlm_ops is const there, so it must be declared const
+   here as well: declarations of one object with differing qualifiers in
+   different translation units are incompatible. */
+extern int gdlm_drop_count;
+extern int gdlm_drop_period;
+extern const struct lm_lockops gdlm_ops;
+/* Module init: register the lock protocol, then bring up sysfs and plock
+   support, undoing the earlier steps in reverse order if a later one fails.
+   Finally seed the drop count/period defaults.  Returns 0 or the error of
+   the step that failed. */
+static int __init init_lock_dlm(void)
+{
+        int error;
+        error = gfs2_register_lockproto(&gdlm_ops);
+        if (error) {
+                printk(KERN_WARNING "lock_dlm:  can't register protocol: %d\n",
+                       error);
+                goto out;
+        }
+        error = gdlm_sysfs_init();
+        if (error)
+                goto out_unregister;
+        error = gdlm_plock_init();
+        if (error)
+                goto out_sysfs;
+        gdlm_drop_count = GDLM_DROP_COUNT;
+        gdlm_drop_period = GDLM_DROP_PERIOD;
+        printk(KERN_INFO
+               "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
+        return 0;
+out_sysfs:
+        gdlm_sysfs_exit();
+out_unregister:
+        gfs2_unregister_lockproto(&gdlm_ops);
+out:
+        return error;
+}
+/* Module exit: tear down in the reverse order of init_lock_dlm() - plock
+   support, sysfs, and finally the protocol registration. */
+static void __exit exit_lock_dlm(void)
+{
+        gdlm_plock_exit();
+        gdlm_sysfs_exit();
+        gfs2_unregister_lockproto(&gdlm_ops);
+}
+module_init(init_lock_dlm);
+module_exit(exit_lock_dlm);
+MODULE_DESCRIPTION("GFS DLM Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..1f94dd35a943
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include "lock_dlm.h"
+int gdlm_drop_count;
+int gdlm_drop_period;
+const struct lm_lockops gdlm_ops;
+/*
+ * init_gdlm - allocate and initialise the per-mount lockspace structure
+ * @cb: callback stored in ls->fscb
+ * @sdp: opaque filesystem pointer stored in ls->sdp
+ * @flags: stored in ls->fsflags
+ * @table_name: "clustername:fsname"; only the first 255 bytes are examined
+ *
+ * Returns the new lockspace, or NULL if the allocation fails or
+ * @table_name contains no ':' separator.  The caller owns the result and
+ * releases it with kfree().
+ */
+static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
+                                 int flags, char *table_name)
+{
+        struct gdlm_ls *ls;
+        char buf[256], *p;
+        ls = kzalloc(sizeof(*ls), GFP_KERNEL);
+        if (!ls)
+                return NULL;
+        ls->drop_locks_count = gdlm_drop_count;
+        ls->drop_locks_period = gdlm_drop_period;
+        ls->fscb = cb;
+        ls->sdp = sdp;
+        ls->fsflags = flags;
+        spin_lock_init(&ls->async_lock);
+        INIT_LIST_HEAD(&ls->complete);
+        INIT_LIST_HEAD(&ls->blocking);
+        INIT_LIST_HEAD(&ls->delayed);
+        INIT_LIST_HEAD(&ls->submit);
+        INIT_LIST_HEAD(&ls->all_locks);
+        init_waitqueue_head(&ls->thread_wait);
+        init_waitqueue_head(&ls->wait_control);
+        ls->thread1 = NULL;
+        ls->thread2 = NULL;
+        ls->drop_time = jiffies;
+        ls->jid = -1;
+        /* work on a bounded, always-terminated private copy */
+        strncpy(buf, table_name, sizeof(buf));
+        buf[sizeof(buf) - 1] = '\0';
+        p = strchr(buf, ':');
+        if (!p) {
+                log_info("invalid table_name \"%s\"", table_name);
+                kfree(ls);
+                return NULL;
+        }
+        *p = '\0';
+        p++;
+        /*
+         * strncpy() does not terminate on truncation.  ls was zero-filled
+         * by kzalloc(), so copying at most GDLM_NAME_LEN - 1 bytes keeps a
+         * trailing NUL; gdlm_mount() later calls strlen() on ls->fsname.
+         * NOTE(review): assumes both fields hold at least GDLM_NAME_LEN
+         * bytes - confirm against lock_dlm.h.
+         */
+        strncpy(ls->clustername, buf, GDLM_NAME_LEN - 1);
+        strncpy(ls->fsname, p, GDLM_NAME_LEN - 1);
+        return ls;
+}
+/*
+ * make_args - parse the colon-separated mount options in @data_arg
+ * @ls: lockspace whose jid, first and id fields receive the parsed values
+ * @data_arg: options of the form "name=value", separated by ':'
+ * @nodir: receives the value of the "nodir" option
+ *
+ * Recognised options are "jid", "first", "id" and "nodir"; each needs an
+ * unsigned decimal argument.  Only the first 255 bytes of @data_arg are
+ * examined and empty tokens are skipped.
+ *
+ * Returns 0 on success, or -EINVAL for an unknown option, a missing
+ * argument, or an argument that cannot be converted.
+ */
+static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
+{
+        char data[256];
+        char *options, *x, *y;
+        void *dst;
+        int error = 0;
+        memset(data, 0, sizeof(data));
+        strncpy(data, data_arg, sizeof(data) - 1);
+        for (options = data; (x = strsep(&options, ":")); ) {
+                if (!*x)
+                        continue;
+                y = strchr(x, '=');
+                if (y)
+                        *y++ = 0;
+                /* select the field this option is stored in */
+                if (!strcmp(x, "jid"))
+                        dst = &ls->jid;
+                else if (!strcmp(x, "first"))
+                        dst = &ls->first;
+                else if (!strcmp(x, "id"))
+                        dst = &ls->id;
+                else if (!strcmp(x, "nodir"))
+                        dst = nodir;
+                else {
+                        log_error("unknown option: %s", x);
+                        error = -EINVAL;
+                        break;
+                }
+                if (!y) {
+                        log_error("need argument to %s", x);
+                        error = -EINVAL;
+                        break;
+                }
+                /*
+                 * Every target is written through "%u", exactly as before;
+                 * the difference is that a value sscanf() cannot convert
+                 * is now rejected instead of being silently ignored.
+                 */
+                if (sscanf(y, "%u", (unsigned int *)dst) != 1) {
+                        log_error("invalid argument to %s: %s", x, y);
+                        error = -EINVAL;
+                        break;
+                }
+        }
+        return error;
+}
+/*
+ * gdlm_mount - lm_mount hook: build a lock_dlm lockspace for one mount
+ * @table_name: "clustername:fsname" string, split by init_gdlm()
+ * @host_data: option string parsed by make_args()
+ * @cb: callback stored in the lockspace (ls->fscb)
+ * @cb_data: stored as ls->sdp and passed back on every callback
+ * @min_lvb_size: LVB size the caller needs; must not exceed GDLM_LVB_SIZE
+ * @flags: stored in ls->fsflags
+ * @lockstruct: filled in on success
+ * @fskobj: parent kobject for the per-mount sysfs object
+ *
+ * Returns 0 on success.  On failure everything set up so far is torn down
+ * and an error is returned; -ENOMEM is also what an oversized
+ * @min_lvb_size or an init_gdlm() failure reports.
+ */
+static int gdlm_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj)
+{
+        struct gdlm_ls *ls;
+        int error = -ENOMEM, nodir = 0;
+        if (min_lvb_size > GDLM_LVB_SIZE)
+                goto out;
+        ls = init_gdlm(cb, cb_data, flags, table_name);
+        if (!ls)
+                goto out;
+        error = make_args(ls, host_data, &nodir);
+        if (error)
+                goto out_free;  /* ls exists by now; jumping to "out" leaked it */
+        error = gdlm_init_threads(ls);
+        if (error)
+                goto out_free;
+        error = gdlm_kobject_setup(ls, fskobj);
+        if (error)
+                goto out_thread;
+        error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
+                                  &ls->dlm_lockspace,
+                                  nodir ? DLM_LSFL_NODIR : 0,
+                                  GDLM_LVB_SIZE);
+        if (error) {
+                log_error("dlm_new_lockspace error %d", error);
+                goto out_kobj;
+        }
+        lockstruct->ls_jid = ls->jid;
+        lockstruct->ls_first = ls->first;
+        lockstruct->ls_lockspace = ls;
+        lockstruct->ls_ops = &gdlm_ops;
+        lockstruct->ls_flags = 0;
+        lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
+        return 0;
+out_kobj:
+        gdlm_kobject_release(ls);
+out_thread:
+        gdlm_release_threads(ls);
+out_free:
+        kfree(ls);
+out:
+        return error;
+}
+/*
+ * gdlm_unmount - lm_unmount hook: tear down and free a lockspace
+ * @lockspace: the struct gdlm_ls created by gdlm_mount()
+ *
+ * If DFL_WITHDRAW is set the teardown steps are skipped and only the
+ * structure is freed; gdlm_withdraw() performs the same release calls
+ * once that flag has been set.
+ */
+static void gdlm_unmount(void *lockspace)
+{
+        struct gdlm_ls *ls = lockspace;
+        int rv;
+        log_debug("unmount flags %lx", ls->flags);
+        /* FIXME: serialize unmount and withdraw in case they
+           happen at once.  Also, if unmount follows withdraw,
+           wait for withdraw to finish. */
+        if (test_bit(DFL_WITHDRAW, &ls->flags))
+                goto out;
+        gdlm_kobject_release(ls);
+        dlm_release_lockspace(ls->dlm_lockspace, 2);
+        gdlm_release_threads(ls);
+        /* a non-zero return is reported as the number of stray locks freed */
+        rv = gdlm_release_all_locks(ls);
+        if (rv)
+                log_info("gdlm_unmount: %d stray locks freed", rv);
+out:
+        kfree(ls);
+}
+/*
+ * lm_recovery_done hook: record which journal id finished recovery and
+ * the result message, then emit a KOBJ_CHANGE uevent on the lockspace
+ * kobject.
+ */
+static void gdlm_recovery_done(void *lockspace, unsigned int jid,
+                               unsigned int message)
+{
+        struct gdlm_ls *lockspc = (struct gdlm_ls *)lockspace;
+        struct kobject *kobj = &lockspc->kobj;
+        lockspc->recover_jid_done = jid;
+        lockspc->recover_jid_status = message;
+        kobject_uevent(kobj, KOBJ_CHANGE);
+}
+/*
+ * lm_others_may_mount hook: set first_done and emit a KOBJ_CHANGE uevent
+ * on the lockspace kobject.
+ */
+static void gdlm_others_may_mount(void *lockspace)
+{
+        struct gdlm_ls *lockspc = (struct gdlm_ls *)lockspace;
+        struct kobject *kobj = &lockspc->kobj;
+        lockspc->first_done = 1;
+        kobject_uevent(kobj, KOBJ_CHANGE);
+}
+/* Userspace gets the offline uevent, blocks new gfs locks on
+   other mounters, and lets us know (sets WITHDRAW flag).  Then,
+   userspace leaves the mount group while we leave the lockspace.
+
+   The flag is set through the "withdraw" sysfs attribute, whose store
+   handler also wakes ls->wait_control.  The lockspace structure itself
+   is not freed here; gdlm_unmount() does that. */
+static void gdlm_withdraw(void *lockspace)
+{
+        struct gdlm_ls *ls = lockspace;
+        kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
+        /* NOTE(review): the return value is ignored, so an interrupted
+           wait falls through to the teardown without the flag being set */
+        wait_event_interruptible(ls->wait_control,
+                                 test_bit(DFL_WITHDRAW, &ls->flags));
+        dlm_release_lockspace(ls->dlm_lockspace, 2);
+        gdlm_release_threads(ls);
+        gdlm_release_all_locks(ls);
+        gdlm_kobject_release(ls);
+}
+/*
+ * Operations table for the "lock_dlm" protocol.  main.c passes it to
+ * gfs2_register_lockproto() and gdlm_mount() hands it back to the caller
+ * through lockstruct->ls_ops.
+ */
+const struct lm_lockops gdlm_ops = {
+        .lm_proto_name = "lock_dlm",
+        .lm_mount = gdlm_mount,
+        .lm_others_may_mount = gdlm_others_may_mount,
+        .lm_unmount = gdlm_unmount,
+        .lm_withdraw = gdlm_withdraw,
+        .lm_get_lock = gdlm_get_lock,
+        .lm_put_lock = gdlm_put_lock,
+        .lm_lock = gdlm_lock,
+        .lm_unlock = gdlm_unlock,
+        .lm_plock = gdlm_plock,
+        .lm_punlock = gdlm_punlock,
+        .lm_plock_get = gdlm_plock_get,
+        .lm_cancel = gdlm_cancel,
+        .lm_hold_lvb = gdlm_hold_lvb,
+        .lm_unhold_lvb = gdlm_unhold_lvb,
+        .lm_recovery_done = gdlm_recovery_done,
+        .lm_owner = THIS_MODULE,
+};
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/miscdevice.h>
+#include <linux/lock_dlm_plock.h>
+#include "lock_dlm.h"
+/* ops_lock protects send_list and recv_list */
+static spinlock_t ops_lock;
+/* requests queued by send_op(), not yet read through the misc device */
+static struct list_head send_list;
+/* requests already read by dev_read(), waiting for a result in dev_write() */
+static struct list_head recv_list;
+/* woken by send_op(); dev_poll() waits on it */
+static wait_queue_head_t send_wq;
+/* woken by dev_write(); the gdlm_plock*() callers sleep on it */
+static wait_queue_head_t recv_wq;
+/* One posix-lock request in flight between the kernel and userspace. */
+struct plock_op {
+        struct list_head list;          /* link on send_list or recv_list */
+        int done;                       /* set to 1 by dev_write() on a match */
+        struct gdlm_plock_info info;    /* request out, result (info.rv) back */
+};
+/* Stamp @info with the kernel's plock version (major, minor, patch). */
+static inline void set_version(struct gdlm_plock_info *info)
+{
+        info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
+        info->version[1] = GDLM_PLOCK_VERSION_MINOR;
+        info->version[2] = GDLM_PLOCK_VERSION_PATCH;
+}
+/*
+ * Validate the version carried by a result written from userspace.  The
+ * major number must equal the kernel's and the minor number must not be
+ * newer than the kernel's; the patch level is only reported.
+ *
+ * Returns 0 if compatible, -EINVAL (after logging both versions) if not.
+ */
+static int check_version(struct gdlm_plock_info *info)
+{
+        int major_ok = (info->version[0] == GDLM_PLOCK_VERSION_MAJOR);
+        int minor_ok = (info->version[1] <= GDLM_PLOCK_VERSION_MINOR);
+        if (major_ok && minor_ok)
+                return 0;
+        log_error("plock device version mismatch: "
+                  "kernel (%u.%u.%u), user (%u.%u.%u)",
+                  GDLM_PLOCK_VERSION_MAJOR,
+                  GDLM_PLOCK_VERSION_MINOR,
+                  GDLM_PLOCK_VERSION_PATCH,
+                  info->version[0],
+                  info->version[1],
+                  info->version[2]);
+        return -EINVAL;
+}
+/*
+ * Stamp the request with the kernel version, append it to send_list and
+ * wake anything waiting on send_wq.
+ */
+static void send_op(struct plock_op *op)
+{
+        set_version(&op->info);
+        INIT_LIST_HEAD(&op->list);
+        spin_lock(&ops_lock);
+        list_add_tail(&op->list, &send_list);
+        spin_unlock(&ops_lock);
+        wake_up(&send_wq);
+}
+/*
+ * gdlm_plock - lm_plock hook: request a posix lock through userspace
+ * @lockspace: the struct gdlm_ls of this mount
+ * @name: lock name; ln_number identifies the resource in the request
+ * @file: file the lock applies to
+ * @cmd: fcntl command; IS_SETLKW(cmd) sets the request's wait flag
+ * @fl: lock description (pid, type, range, owner)
+ *
+ * Queues a GDLM_PLOCK_OP_LOCK request and sleeps until dev_write() marks
+ * it done.  If the result is 0 the lock is also recorded locally with
+ * posix_lock_file_wait(); a failure there is only logged.
+ *
+ * Returns -ENOMEM if the request cannot be allocated, otherwise the rv
+ * value supplied in the result.
+ */
+int gdlm_plock(void *lockspace, struct lm_lockname *name,
+               struct file *file, int cmd, struct file_lock *fl)
+{
+        struct gdlm_ls *ls = lockspace;
+        struct plock_op *op;
+        int rv;
+        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        if (!op)
+                return -ENOMEM;
+        op->info.optype         = GDLM_PLOCK_OP_LOCK;
+        op->info.pid            = fl->fl_pid;
+        op->info.ex             = (fl->fl_type == F_WRLCK);
+        op->info.wait           = IS_SETLKW(cmd);
+        op->info.fsid           = ls->id;
+        op->info.number         = name->ln_number;
+        op->info.start          = fl->fl_start;
+        op->info.end            = fl->fl_end;
+        op->info.owner          = (__u64)(long) fl->fl_owner;
+        send_op(op);
+        /* uninterruptible: sleeps until dev_write() sets op->done */
+        wait_event(recv_wq, (op->done != 0));
+        spin_lock(&ops_lock);
+        /* dev_write() unlinks the op with list_del_init(), so it should
+           not be on a list any more; unlink it anyway before freeing */
+        if (!list_empty(&op->list)) {
+                printk(KERN_INFO "plock op on list\n");
+                list_del(&op->list);
+        }
+        spin_unlock(&ops_lock);
+        rv = op->info.rv;
+        if (!rv) {
+                if (posix_lock_file_wait(file, fl) < 0)
+                        log_error("gdlm_plock: vfs lock error %x,%llx",
+                                  name->ln_type,
+                                  (unsigned long long)name->ln_number);
+        }
+        kfree(op);
+        return rv;
+}
+/*
+ * gdlm_punlock - lm_punlock hook: release a posix lock through userspace
+ * @lockspace: the struct gdlm_ls of this mount
+ * @name: lock name; ln_number identifies the resource in the request
+ * @file: file the lock applies to
+ * @fl: lock description (pid, range, owner)
+ *
+ * The local VFS state is updated first with posix_lock_file_wait() (a
+ * failure is only logged), then a GDLM_PLOCK_OP_UNLOCK request is queued
+ * and the caller sleeps until dev_write() marks it done.
+ *
+ * Returns -ENOMEM if the request cannot be allocated, otherwise the rv
+ * value supplied in the result.
+ */
+int gdlm_punlock(void *lockspace, struct lm_lockname *name,
+                 struct file *file, struct file_lock *fl)
+{
+        struct gdlm_ls *ls = lockspace;
+        struct plock_op *op;
+        int rv;
+        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        if (!op)
+                return -ENOMEM;
+        if (posix_lock_file_wait(file, fl) < 0)
+                log_error("gdlm_punlock: vfs unlock error %x,%llx",
+                          name->ln_type, (unsigned long long)name->ln_number);
+        op->info.optype         = GDLM_PLOCK_OP_UNLOCK;
+        op->info.pid            = fl->fl_pid;
+        op->info.fsid           = ls->id;
+        op->info.number         = name->ln_number;
+        op->info.start          = fl->fl_start;
+        op->info.end            = fl->fl_end;
+        op->info.owner          = (__u64)(long) fl->fl_owner;
+        send_op(op);
+        /* uninterruptible: sleeps until dev_write() sets op->done */
+        wait_event(recv_wq, (op->done != 0));
+        spin_lock(&ops_lock);
+        /* defensive: the op should already be off both lists */
+        if (!list_empty(&op->list)) {
+                printk(KERN_INFO "punlock op on list\n");
+                list_del(&op->list);
+        }
+        spin_unlock(&ops_lock);
+        rv = op->info.rv;
+        kfree(op);
+        return rv;
+}
+/*
+ * gdlm_plock_get - lm_plock_get hook: query for a conflicting posix lock
+ * @lockspace: the struct gdlm_ls of this mount
+ * @name: lock name; ln_number identifies the resource in the request
+ * @file: unused here
+ * @fl: in: the lock to test; out: see below
+ *
+ * Queues a GDLM_PLOCK_OP_GET request and sleeps until dev_write() marks
+ * it done.  A result of 0 sets fl->fl_type to F_UNLCK; a positive result
+ * copies type, pid and range from the result into @fl.  The request's
+ * owner field is left zero by this function.
+ *
+ * Returns -ENOMEM if the request cannot be allocated, otherwise the rv
+ * value supplied in the result (returned unchanged, including positive
+ * values).
+ */
+int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
+                   struct file *file, struct file_lock *fl)
+{
+        struct gdlm_ls *ls = lockspace;
+        struct plock_op *op;
+        int rv;
+        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        if (!op)
+                return -ENOMEM;
+        op->info.optype         = GDLM_PLOCK_OP_GET;
+        op->info.pid            = fl->fl_pid;
+        op->info.ex             = (fl->fl_type == F_WRLCK);
+        op->info.fsid           = ls->id;
+        op->info.number         = name->ln_number;
+        op->info.start          = fl->fl_start;
+        op->info.end            = fl->fl_end;
+        send_op(op);
+        /* uninterruptible: sleeps until dev_write() sets op->done */
+        wait_event(recv_wq, (op->done != 0));
+        spin_lock(&ops_lock);
+        /* defensive: the op should already be off both lists */
+        if (!list_empty(&op->list)) {
+                printk(KERN_INFO "plock_get op on list\n");
+                list_del(&op->list);
+        }
+        spin_unlock(&ops_lock);
+        rv = op->info.rv;
+        if (rv == 0)
+                fl->fl_type = F_UNLCK;
+        else if (rv > 0) {
+                fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+                fl->fl_pid = op->info.pid;
+                fl->fl_start = op->info.start;
+                fl->fl_end = op->info.end;
+        }
+        kfree(op);
+        return rv;
+}
+/* a read copies out one plock request from the send list
+
+   Returns sizeof(struct gdlm_plock_info) on success, -EINVAL if the user
+   buffer is too small, -EAGAIN if no request is queued, or -EFAULT if
+   the copy to userspace fails. */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+                        loff_t *ppos)
+{
+        struct gdlm_plock_info info;
+        struct plock_op *op = NULL;
+        if (count < sizeof(info))
+                return -EINVAL;
+        spin_lock(&ops_lock);
+        if (!list_empty(&send_list)) {
+                op = list_entry(send_list.next, struct plock_op, list);
+                /* the op now waits on recv_list for its result */
+                list_move(&op->list, &recv_list);
+                /* snapshot under the lock; the user copy happens unlocked */
+                memcpy(&info, &op->info, sizeof(info));
+        }
+        spin_unlock(&ops_lock);
+        if (!op)
+                return -EAGAIN;
+        /* NOTE(review): on -EFAULT the op stays on recv_list although
+           userspace never received the request */
+        if (copy_to_user(u, &info, sizeof(info)))
+                return -EFAULT;
+        return sizeof(info);
+}
+/* a write copies in one plock result that should match a plock_op
+   on the recv list
+
+   The match is made on fsid, number and owner.  Returns @count when the
+   write is accepted (even if no matching op is found, which is only
+   logged), -EINVAL for a wrong size or incompatible version, or -EFAULT
+   if the copy from userspace fails. */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+                         loff_t *ppos)
+{
+        struct gdlm_plock_info info;
+        struct plock_op *op;
+        int found = 0;
+        if (count != sizeof(info))
+                return -EINVAL;
+        if (copy_from_user(&info, u, sizeof(info)))
+                return -EFAULT;
+        if (check_version(&info))
+                return -EINVAL;
+        spin_lock(&ops_lock);
+        list_for_each_entry(op, &recv_list, list) {
+                if (op->info.fsid == info.fsid && op->info.number == info.number &&
+                    op->info.owner == info.owner) {
+                        /* list_del_init() leaves op->list empty, which the
+                           sleeping caller checks after it wakes up */
+                        list_del_init(&op->list);
+                        found = 1;
+                        op->done = 1;
+                        /* the whole result, including rv, replaces the request */
+                        memcpy(&op->info, &info, sizeof(info));
+                        break;
+                }
+        }
+        spin_unlock(&ops_lock);
+        if (found)
+                wake_up(&recv_wq);
+        else
+                printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
+                        (unsigned long long)info.number);
+        return count;
+}
+/*
+ * Poll handler for the plock device: readable (POLLIN | POLLRDNORM)
+ * whenever at least one request is queued on send_list, otherwise 0.
+ */
+static unsigned int dev_poll(struct file *file, poll_table *wait)
+{
+        unsigned int mask = 0;
+        poll_wait(file, &send_wq, wait);
+        spin_lock(&ops_lock);
+        if (!list_empty(&send_list))
+                mask = POLLIN | POLLRDNORM;
+        spin_unlock(&ops_lock);
+        return mask;
+}
+/* File operations of the plock misc device: read, write and poll only. */
+static struct file_operations dev_fops = {
+        .read    = dev_read,
+        .write   = dev_write,
+        .poll    = dev_poll,
+        .owner   = THIS_MODULE
+};
+/* Misc device named GDLM_PLOCK_MISC_NAME with a dynamically assigned minor. */
+static struct miscdevice plock_dev_misc = {
+        .minor = MISC_DYNAMIC_MINOR,
+        .name = GDLM_PLOCK_MISC_NAME,
+        .fops = &dev_fops
+};
+/*
+ * Initialise the plock lists, lock and wait queues, then register the
+ * misc device userspace uses to service requests.
+ *
+ * Returns 0 on success or the misc_register() error.
+ */
+int gdlm_plock_init(void)
+{
+        int rv;
+        spin_lock_init(&ops_lock);
+        INIT_LIST_HEAD(&send_list);
+        INIT_LIST_HEAD(&recv_list);
+        init_waitqueue_head(&send_wq);
+        init_waitqueue_head(&recv_wq);
+        rv = misc_register(&plock_dev_misc);
+        if (rv)
+                /* terminate the line so it is not merged with the next message */
+                printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
+                       rv);
+        return rv;
+}
+/* Unregister the plock misc device; a failure is only logged. */
+void gdlm_plock_exit(void)
+{
+        if (misc_deregister(&plock_dev_misc) < 0)
+                /* terminate the line so it is not merged with the next message */
+                printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
+}
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/ctype.h>
+#include <linux/stat.h>
+#include "lock_dlm.h"
+extern struct lm_lockops gdlm_ops;
+/* sysfs "proto_name": prints the protocol name from gdlm_ops. */
+static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
+}
+/* sysfs "block" (read): prints 1 if DFL_BLOCK_LOCKS is set, else 0. */
+static ssize_t block_show(struct gdlm_ls *ls, char *buf)
+{
+        int blocked = test_bit(DFL_BLOCK_LOCKS, &ls->flags) ? 1 : 0;
+        return sprintf(buf, "%d\n", blocked);
+}
+/*
+ * sysfs "block" (write): 1 sets DFL_BLOCK_LOCKS; 0 clears it and calls
+ * gdlm_submit_delayed().  Any other value is rejected with -EINVAL.
+ */
+static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+        int request = simple_strtol(buf, NULL, 0);
+        switch (request) {
+        case 1:
+                set_bit(DFL_BLOCK_LOCKS, &ls->flags);
+                break;
+        case 0:
+                clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
+                gdlm_submit_delayed(ls);
+                break;
+        default:
+                return -EINVAL;
+        }
+        return len;
+}
+/* sysfs "withdraw" (read): prints 1 if DFL_WITHDRAW is set, else 0. */
+static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
+{
+        int withdrawn = test_bit(DFL_WITHDRAW, &ls->flags) ? 1 : 0;
+        return sprintf(buf, "%d\n", withdrawn);
+}
+/*
+ * sysfs "withdraw" (write): only the value 1 is accepted and sets
+ * DFL_WITHDRAW; anything else yields -EINVAL.  Waiters on wait_control
+ * are woken in both cases.
+ */
+static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+        int request = simple_strtol(buf, NULL, 0);
+        ssize_t result = -EINVAL;
+        if (request == 1) {
+                set_bit(DFL_WITHDRAW, &ls->flags);
+                result = len;
+        }
+        wake_up(&ls->wait_control);
+        return result;
+}
+/* sysfs "id": prints ls->id (set by the "id" mount option) as unsigned. */
+static ssize_t id_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%u\n", ls->id);
+}
+/* sysfs "jid": prints ls->jid, which init_gdlm() starts out at -1. */
+static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->jid);
+}
+/* sysfs "first": prints ls->first (set by the "first" mount option). */
+static ssize_t first_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->first);
+}
+/* sysfs "first_done": prints ls->first_done, set by gdlm_others_may_mount(). */
+static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->first_done);
+}
+/* sysfs "recover" (read): prints the journal id last written to "recover". */
+static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->recover_jid);
+}
+/*
+ * sysfs "recover" (write): store the journal id and issue the
+ * LM_CB_NEED_RECOVERY callback with a pointer to it.  The input is not
+ * validated; the write always reports @len bytes consumed.
+ */
+static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+        ls->recover_jid = simple_strtol(buf, NULL, 0);
+        ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
+        return len;
+}
+/* sysfs "recover_done": prints the jid recorded by gdlm_recovery_done(). */
+static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->recover_jid_done);
+}
+/* sysfs "recover_status": prints the message recorded by gdlm_recovery_done(). */
+static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
+{
+        return sprintf(buf, "%d\n", ls->recover_jid_status);
+}
+/*
+ * A sysfs attribute plus handlers that take the lockspace directly;
+ * gdlm_attr_show()/gdlm_attr_store() recover it with container_of().
+ */
+struct gdlm_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gdlm_ls *, char *);                  /* may be NULL */
+        ssize_t (*store)(struct gdlm_ls *, const char *, size_t);   /* may be NULL */
+};
+/*
+ * Define a static struct gdlm_attr called gdlm_attr_<name>.  Attributes
+ * with mode 0444 are read-only and pass NULL as their store handler.
+ */
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
+GDLM_ATTR(block,          0644, block_show,          block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
+GDLM_ATTR(id,             0444, id_show,             NULL);
+GDLM_ATTR(jid,            0444, jid_show,            NULL);
+GDLM_ATTR(first,          0444, first_show,          NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
+GDLM_ATTR(recover,        0644, recover_show,        recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+/* NULL-terminated default attribute list used by gdlm_ktype. */
+static struct attribute *gdlm_attrs[] = {
+        &gdlm_attr_proto_name.attr,
+        &gdlm_attr_block.attr,
+        &gdlm_attr_withdraw.attr,
+        &gdlm_attr_id.attr,
+        &gdlm_attr_jid.attr,
+        &gdlm_attr_first.attr,
+        &gdlm_attr_first_done.attr,
+        &gdlm_attr_recover.attr,
+        &gdlm_attr_recover_done.attr,
+        &gdlm_attr_recover_status.attr,
+        NULL,
+};
+/*
+ * sysfs show dispatcher: map the kobject and attribute back to their
+ * containing gdlm_ls and gdlm_attr and call the attribute's show handler.
+ * Returns 0 when the attribute has no show handler.
+ */
+static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+        struct gdlm_attr *gattr = container_of(attr, struct gdlm_attr, attr);
+        struct gdlm_ls *lockspc = container_of(kobj, struct gdlm_ls, kobj);
+        if (!gattr->show)
+                return 0;
+        return gattr->show(lockspc, buf);
+}
+/*
+ * sysfs store dispatcher: map the kobject and attribute back to their
+ * containing gdlm_ls and gdlm_attr and call the attribute's store
+ * handler.  Returns @len when the attribute has no store handler.
+ */
+static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buf, size_t len)
+{
+        struct gdlm_attr *gattr = container_of(attr, struct gdlm_attr, attr);
+        struct gdlm_ls *lockspc = container_of(kobj, struct gdlm_ls, kobj);
+        if (!gattr->store)
+                return len;
+        return gattr->store(lockspc, buf, len);
+}
+/* Routes all sysfs reads and writes through the two dispatchers above. */
+static struct sysfs_ops gdlm_attr_ops = {
+        .show  = gdlm_attr_show,
+        .store = gdlm_attr_store,
+};
+/* kobject type of every lockspace kobject: default attributes plus ops. */
+static struct kobj_type gdlm_ktype = {
+        .default_attrs = gdlm_attrs,
+        .sysfs_ops     = &gdlm_attr_ops,
+};
+/* kset named "lock_dlm" under kernel_subsys; lockspace kobjects join it. */
+static struct kset gdlm_kset = {
+        .subsys = &kernel_subsys,
+        .kobj   = {.name = "lock_dlm",},
+        .ktype  = &gdlm_ktype,
+};
+/*
+ * Name the lockspace kobject "lock_module", attach it to gdlm_kset with
+ * @fskobj as parent, and register it.
+ *
+ * Returns 0 on success, or the error from kobject_set_name() or
+ * kobject_register() after logging it.
+ */
+int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
+{
+        struct kobject *kobj = &ls->kobj;
+        int rc = kobject_set_name(kobj, "%s", "lock_module");
+        if (rc) {
+                log_error("can't set kobj name %d", rc);
+                return rc;
+        }
+        kobj->kset = &gdlm_kset;
+        kobj->ktype = &gdlm_ktype;
+        kobj->parent = fskobj;
+        rc = kobject_register(kobj);
+        if (!rc)
+                return 0;
+        log_error("can't register kobj %d", rc);
+        return rc;
+}
+/* Unregister the lockspace kobject set up by gdlm_kobject_setup(). */
+void gdlm_kobject_release(struct gdlm_ls *ls)
+{
+        kobject_unregister(&ls->kobj);
+}
+/*
+ * Register the "lock_dlm" kset.  Returns 0 on success or the
+ * kset_register() error, which is also logged.
+ */
+int gdlm_sysfs_init(void)
+{
+        int error;
+        error = kset_register(&gdlm_kset);
+        if (error)
+                /* NOTE(review): this printk carries no KERN_ level prefix */
+                printk("lock_dlm: cannot register kset %d\n", error);
+        return error;
+}
+/* Unregister the "lock_dlm" kset registered by gdlm_sysfs_init(). */
+void gdlm_sysfs_exit(void)
+{
+        kset_unregister(&gdlm_kset);
+}
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include "lock_dlm.h"
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+   thread gets to it.  The lock is linked through lp->delay_list under
+   async_lock, and the threads sleeping on thread_wait are woken. */
+static void queue_submit(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        spin_lock(&ls->async_lock);
+        list_add_tail(&lp->delay_list, &ls->submit);
+        spin_unlock(&ls->async_lock);
+        wake_up(&ls->thread_wait);
+}
+/*
+ * process_blocking - deliver a blocking callback for @lp to GFS
+ * @lp: lock taken off the lockspace's blocking list
+ * @bast_mode: mode value read from lp->bast_mode by gdlm_thread(), which
+ *             resets that field to 0 before calling here
+ *
+ * The mode is translated with gdlm_make_lmstate() and mapped to the
+ * matching LM_CB_NEED_* callback, which is issued with the lock's name.
+ */
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+        struct gdlm_ls *ls = lp->ls;
+        unsigned int cb = 0;
+        switch (gdlm_make_lmstate(bast_mode)) {
+        case LM_ST_EXCLUSIVE:
+                cb = LM_CB_NEED_E;
+                break;
+        case LM_ST_DEFERRED:
+                cb = LM_CB_NEED_D;
+                break;
+        case LM_ST_SHARED:
+                cb = LM_CB_NEED_S;
+                break;
+        default:
+                /* report the mode being handled: lp->bast_mode has already
+                   been cleared by the caller and would always print 0 */
+                gdlm_assert(0, "unknown bast mode %u", bast_mode);
+        }
+        ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+/*
+ * process_complete - handle a completion for @lp taken off the complete list
+ * @lp: the lock whose request finished
+ *
+ * Inspects the status block and the lock's flags to decide between:
+ * reporting a cancel, finishing an unlock (possibly deleting the lock),
+ * silently dropping an unexpected error, completing a waiter on
+ * lp->ast_wait, re-queueing the lock for another DLM request, or
+ * reporting the new state to GFS through the LM_CB_ASYNC callback.
+ */
+static void process_complete(struct gdlm_lock *lp)
+{
+        struct gdlm_ls *ls = lp->ls;
+        struct lm_async_cb acb;
+        /* mode held before this completion; used for the cacheable test */
+        s16 prev_mode = lp->cur;
+        memset(&acb, 0, sizeof(acb));
+        if (lp->lksb.sb_status == -DLM_ECANCEL) {
+                log_info("complete dlm cancel %x,%llx flags %lx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                if (lp->cur == DLM_LOCK_IV)
+                        lp->lksb.sb_lkid = 0;
+                goto out;
+        }
+        if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+                if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+                        log_info("unlock sb_status %d %x,%llx flags %lx",
+                                 lp->lksb.sb_status, lp->lockname.ln_type,
+                                 (unsigned long long)lp->lockname.ln_number,
+                                 lp->flags);
+                        return;
+                }
+                lp->cur = DLM_LOCK_IV;
+                lp->req = DLM_LOCK_IV;
+                lp->lksb.sb_lkid = 0;
+                if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+                        gdlm_delete_lp(lp);
+                        return;
+                }
+                goto out;
+        }
+        /* the LVB contents are flagged invalid: hand out zeroes instead */
+        if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+                memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+        /* the alternate mode was granted: swap PR and CW in the request */
+        if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+                if (lp->req == DLM_LOCK_PR)
+                        lp->req = DLM_LOCK_CW;
+                else if (lp->req == DLM_LOCK_CW)
+                        lp->req = DLM_LOCK_PR;
+        }
+        /*
+         * A canceled lock request.  The lock was just taken off the delayed
+         * list and was never even submitted to dlm.
+         */
+        if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+                log_info("complete internal cancel %x,%llx",
+                         lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number);
+                lp->req = lp->cur;
+                acb.lc_ret |= LM_OUT_CANCELED;
+                goto out;
+        }
+        /*
+         * An error occurred.
+         */
+        if (lp->lksb.sb_status) {
+                /* a "normal" error */
+                if ((lp->lksb.sb_status == -EAGAIN) &&
+                    (lp->lkf & DLM_LKF_NOQUEUE)) {
+                        lp->req = lp->cur;
+                        if (lp->cur == DLM_LOCK_IV)
+                                lp->lksb.sb_lkid = 0;
+                        goto out;
+                }
+                /* this could only happen with cancels I think */
+                log_info("ast sb_status %d %x,%llx flags %lx",
+                         lp->lksb.sb_status, lp->lockname.ln_type,
+                         (unsigned long long)lp->lockname.ln_number,
+                         lp->flags);
+                return;
+        }
+        /*
+         * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+         */
+        if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+                complete(&lp->ast_wait);
+                return;
+        }
+        /*
+         * A lock has been demoted to NL because it initially completed during
+         * BLOCK_LOCKS.  Now it must be requested in the originally requested
+         * mode.
+         */
+        if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+                gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+                            lp->lockname.ln_type,
+                            (unsigned long long)lp->lockname.ln_number);
+                lp->cur = DLM_LOCK_NL;
+                lp->req = lp->prev_req;
+                lp->prev_req = DLM_LOCK_IV;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                set_bit(LFL_NOCACHE, &lp->flags);
+                /* still blocked: park the lock, otherwise resubmit it now */
+                if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+                    !test_bit(LFL_NOBLOCK, &lp->flags))
+                        gdlm_queue_delayed(lp);
+                else
+                        queue_submit(lp);
+                return;
+        }
+        /*
+         * A request is granted during dlm recovery.  It may be granted
+         * because the locks of a failed node were cleared.  In that case,
+         * there may be inconsistent data beneath this lock and we must wait
+         * for recovery to complete to use it.  When gfs recovery is done this
+         * granted lock will be converted to NL and then reacquired in this
+         * granted state.
+         */
+        if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+            !test_bit(LFL_NOBLOCK, &lp->flags) &&
+            lp->req != DLM_LOCK_NL) {
+                lp->cur = lp->req;
+                lp->prev_req = lp->req;
+                lp->req = DLM_LOCK_NL;
+                lp->lkf |= DLM_LKF_CONVERT;
+                lp->lkf &= ~DLM_LKF_CONVDEADLK;
+                log_debug("rereq %x,%llx id %x %d,%d",
+                          lp->lockname.ln_type,
+                          (unsigned long long)lp->lockname.ln_number,
+                          lp->lksb.sb_lkid, lp->cur, lp->req);
+                set_bit(LFL_REREQUEST, &lp->flags);
+                queue_submit(lp);
+                return;
+        }
+        /*
+         * DLM demoted the lock to NL before it was granted so GFS must be
+         * told it cannot cache data for this lock.
+         */
+        if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+                set_bit(LFL_NOCACHE, &lp->flags);
+out:
+        /*
+         * This is an internal lock_dlm lock
+         */
+        if (test_bit(LFL_INLOCK, &lp->flags)) {
+                clear_bit(LFL_NOBLOCK, &lp->flags);
+                lp->cur = lp->req;
+                complete(&lp->ast_wait);
+                return;
+        }
+        /*
+         * Normal completion of a lock request.  Tell GFS it now has the lock.
+         */
+        clear_bit(LFL_NOBLOCK, &lp->flags);
+        lp->cur = lp->req;
+        acb.lc_name = lp->lockname;
+        acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+        /* cacheable only if NOCACHE was not set and both the previous and
+           the new mode are above NL */
+        if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
+            (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
+                acb.lc_ret |= LM_OUT_CACHEABLE;
+        ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+/*
+ * Return non-zero when the calling thread has nothing to do: the complete
+ * and submit lists are empty and, if @blocking is set, the blocking list
+ * is empty too.  All lists are examined under async_lock.
+ */
+static inline int no_work(struct gdlm_ls *ls, int blocking)
+{
+        int idle;
+        spin_lock(&ls->async_lock);
+        idle = list_empty(&ls->complete) &&
+               list_empty(&ls->submit) &&
+               (!blocking || list_empty(&ls->blocking));
+        spin_unlock(&ls->async_lock);
+        return idle;
+}
+/*
+ * Decide whether GFS should be asked to drop locks.  Disabled while
+ * drop_locks_count is 0.  Once drop_locks_period seconds have passed
+ * since drop_time the timer is restarted, and 1 is returned if the lock
+ * count has reached drop_locks_count; in every other case 0 is returned.
+ */
+static inline int check_drop(struct gdlm_ls *ls)
+{
+        if (!ls->drop_locks_count)
+                return 0;
+        if (!time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ))
+                return 0;
+        ls->drop_time = jiffies;
+        return (ls->all_locks_count >= ls->drop_locks_count) ? 1 : 0;
+}
+/*
+ * gdlm_thread - main loop shared by the lock_dlm1 and lock_dlm2 threads
+ * @data: the struct gdlm_ls the thread serves
+ *
+ * Each pass sleeps if no_work() reports nothing to do, then takes at most
+ * one lock off the blocking, complete or submit list (in that priority
+ * order) and processes it outside async_lock.  If check_drop() fires, a
+ * LM_CB_DROPLOCKS callback is issued as well.  Runs until
+ * kthread_should_stop() and returns 0.
+ */
+static int gdlm_thread(void *data)
+{
+        struct gdlm_ls *ls = (struct gdlm_ls *) data;
+        struct gdlm_lock *lp = NULL;
+        int blist = 0;
+        uint8_t complete, blocking, submit, drop;
+        DECLARE_WAITQUEUE(wait, current);
+        /* Only thread1 is allowed to do blocking callbacks since gfs
+           may wait for a completion callback within a blocking cb. */
+        if (current == ls->thread1)
+                blist = 1;
+        while (!kthread_should_stop()) {
+                /* go INTERRUPTIBLE and join the wait queue before testing
+                   for work, so a wake_up() in between is not lost */
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue(&ls->thread_wait, &wait);
+                if (no_work(ls, blist))
+                        schedule();
+                remove_wait_queue(&ls->thread_wait, &wait);
+                set_current_state(TASK_RUNNING);
+                complete = blocking = submit = drop = 0;
+                spin_lock(&ls->async_lock);
+                if (blist && !list_empty(&ls->blocking)) {
+                        lp = list_entry(ls->blocking.next, struct gdlm_lock,
+                                        blist);
+                        list_del_init(&lp->blist);
+                        /* take the mode and reset the field under the lock */
+                        blocking = lp->bast_mode;
+                        lp->bast_mode = 0;
+                } else if (!list_empty(&ls->complete)) {
+                        lp = list_entry(ls->complete.next, struct gdlm_lock,
+                                        clist);
+                        list_del_init(&lp->clist);
+                        complete = 1;
+                } else if (!list_empty(&ls->submit)) {
+                        lp = list_entry(ls->submit.next, struct gdlm_lock,
+                                        delay_list);
+                        list_del_init(&lp->delay_list);
+                        submit = 1;
+                }
+                drop = check_drop(ls);
+                spin_unlock(&ls->async_lock);
+                /* the handlers run without async_lock held */
+                if (complete)
+                        process_complete(lp);
+                else if (blocking)
+                        process_blocking(lp, blocking);
+                else if (submit)
+                        gdlm_do_lock(lp);
+                if (drop)
+                        ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
+                /* the task is TASK_RUNNING here, so this only yields the CPU */
+                schedule();
+        }
+        return 0;
+}
+/*
+ * gdlm_init_threads - start the two worker threads of a lockspace
+ * @ls: lockspace; thread1 and thread2 are filled in on success
+ *
+ * Returns 0 on success or the negative errno reported by the kthread
+ * API.  If the second thread cannot be started the first one is stopped
+ * again.
+ */
+int gdlm_init_threads(struct gdlm_ls *ls)
+{
+        struct task_struct *p;
+        int error;
+        /*
+         * Create thread1 without running it: gdlm_thread() compares
+         * current with ls->thread1 once at startup to decide whether it
+         * handles blocking callbacks, so the pointer must be stored before
+         * the thread is first scheduled.
+         */
+        p = kthread_create(gdlm_thread, ls, "lock_dlm1");
+        if (IS_ERR(p)) {
+                /* PTR_ERR() yields the errno; IS_ERR() would only yield 1 */
+                error = PTR_ERR(p);
+                log_error("can't start lock_dlm1 thread %d", error);
+                return error;
+        }
+        ls->thread1 = p;
+        wake_up_process(p);
+        p = kthread_run(gdlm_thread, ls, "lock_dlm2");
+        if (IS_ERR(p)) {
+                error = PTR_ERR(p);
+                log_error("can't start lock_dlm2 thread %d", error);
+                kthread_stop(ls->thread1);
+                return error;
+        }
+        ls->thread2 = p;
+        return 0;
+}
+/* Stop both worker threads started by gdlm_init_threads(). */
+void gdlm_release_threads(struct gdlm_ls *ls)
+{
+        kthread_stop(ls->thread1);
+        kthread_stop(ls->thread2);
+}
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
+lock_nolock-y := main.o
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/lm_interface.h>
+/* Per-mount state for lock_nolock; allocated in nolock_mount(). */
+struct nolock_lockspace {
+        unsigned int nl_lvb_size; /* byte count passed to kzalloc() per LVB */
+};
+static const struct lm_lockops nolock_ops;
+/**
+ * nolock_mount - set up a lock_nolock lockspace for one mount
+ * @table_name: unused
+ * @host_data: option string; an optional "jid=<n>" selects the journal id
+ * @cb: unused
+ * @cb_data: unused
+ * @min_lvb_size: size recorded for later LVB allocations
+ * @flags: unused
+ * @lockstruct: filled in with the resulting lockspace description
+ * @fskobj: unused
+ *
+ * The journal id defaults to 0 when "jid=" is absent or is not followed
+ * by a parsable unsigned number.
+ *
+ * Returns: 0 on success, -ENOMEM if the lockspace cannot be allocated
+ */
+static int nolock_mount(char *table_name, char *host_data,
+                        lm_callback_t cb, void *cb_data,
+                        unsigned int min_lvb_size, int flags,
+                        struct lm_lockstruct *lockstruct,
+                        struct kobject *fskobj)
+{
+        char *c;
+        unsigned int jid = 0;
+        struct nolock_lockspace *nl;
+        c = strstr(host_data, "jid=");
+        /* Never hand an unparsed (previously uninitialised) jid to GFS2 */
+        if (c && sscanf(c + 4, "%u", &jid) != 1)
+                jid = 0;
+        nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
+        if (!nl)
+                return -ENOMEM;
+        nl->nl_lvb_size = min_lvb_size;
+        lockstruct->ls_jid = jid;
+        lockstruct->ls_first = 1;
+        lockstruct->ls_lvb_size = min_lvb_size;
+        lockstruct->ls_lockspace = nl;
+        lockstruct->ls_ops = &nolock_ops;
+        lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+        return 0;
+}
+/* lm_others_may_mount hook: does nothing; @lockspace is unused. */
+static void nolock_others_may_mount(void *lockspace)
+{
+}
+/* lm_unmount hook: free the lockspace allocated by nolock_mount(). */
+static void nolock_unmount(void *lockspace)
+{
+        kfree(lockspace);
+}
+/* lm_withdraw hook: does nothing; @lockspace is unused. */
+static void nolock_withdraw(void *lockspace)
+{
+}
+/**
+ * nolock_get_lock - get a lm_lock_t given a description of the lock
+ * @lockspace: the lockspace the lock lives in
+ * @name: the name of the lock (unused)
+ * @lockp: return the lm_lock_t here
+ *
+ * No per-lock object is allocated: the lockspace pointer itself is handed
+ * back as the lock handle.
+ *
+ * Returns: always 0
+ */
+static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
+                           void **lockp)
+{
+        *lockp = lockspace;
+        return 0;
+}
+/**
+ * nolock_put_lock - get rid of a lock structure
+ * @lock: the lock to throw away
+ *
+ * Does nothing: nolock_get_lock() allocates nothing per lock.
+ */
+static void nolock_put_lock(void *lock)
+{
+}
+/**
+ * nolock_lock - acquire a lock
+ * @lock: the lock to manipulate (unused)
+ * @cur_state: the current state (unused)
+ * @req_state: the requested state
+ * @flags: modifier flags (unused)
+ *
+ * Returns: @req_state with LM_OUT_CACHEABLE ORed in
+ */
+static unsigned int nolock_lock(void *lock, unsigned int cur_state,
+                                unsigned int req_state, unsigned int flags)
+{
+        return req_state | LM_OUT_CACHEABLE;
+}
+/**
+ * nolock_unlock - unlock a lock
+ * @lock: the lock to manipulate (unused)
+ * @cur_state: the current state (unused)
+ *
+ * Returns: always 0
+ */
+static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
+{
+        return 0;
+}
+/* lm_cancel hook: does nothing; @lock is unused. */
+static void nolock_cancel(void *lock)
+{
+}
+/**
+ * nolock_hold_lvb - hold on to a lock value block
+ * @lock: the lock the LVB is associated with (really the lockspace)
+ * @lvbp: receives a freshly zeroed buffer of nl_lvb_size bytes, or NULL
+ *
+ * Returns: 0 on success, -ENOMEM if the buffer cannot be allocated
+ */
+static int nolock_hold_lvb(void *lock, char **lvbp)
+{
+        struct nolock_lockspace *nl = lock;
+        char *buf = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+        *lvbp = buf;
+        return buf ? 0 : -ENOMEM;
+}
+/**
+ * nolock_unhold_lvb - release a LVB
+ * @lock: the lock the LVB is associated with (unused)
+ * @lvb: the lock value block, as allocated by nolock_hold_lvb()
+ *
+ */
+static void nolock_unhold_lvb(void *lock, char *lvb)
+{
+        kfree(lvb);
+}
+/*
+ * lm_plock_get hook: test a POSIX lock through the local VFS lock code.
+ * When posix_test_lock() returns non-zero, *fl is overwritten with the
+ * lock it filled in (presumably the conflicting one -- confirm against
+ * posix_test_lock()); otherwise only fl->fl_type is set to F_UNLCK.
+ * Always returns 0.
+ */
+static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
+                            struct file *file, struct file_lock *fl)
+{
+        struct file_lock conflict;
+        if (posix_test_lock(file, fl, &conflict))
+                memcpy(fl, &conflict, sizeof(struct file_lock));
+        else
+                fl->fl_type = F_UNLCK;
+        return 0;
+}
+/*
+ * lm_plock hook: hand the request straight to posix_lock_file_wait() and
+ * return its result. @lockspace, @name and @cmd are unused.
+ */
+static int nolock_plock(void *lockspace, struct lm_lockname *name,
+                        struct file *file, int cmd, struct file_lock *fl)
+{
+        return posix_lock_file_wait(file, fl);
+}
+/*
+ * lm_punlock hook: hand the request straight to posix_lock_file_wait()
+ * and return its result. @lockspace and @name are unused.
+ */
+static int nolock_punlock(void *lockspace, struct lm_lockname *name,
+                          struct file *file, struct file_lock *fl)
+{
+        return posix_lock_file_wait(file, fl);
+}
+/* lm_recovery_done hook: does nothing; all arguments are unused. */
+static void nolock_recovery_done(void *lockspace, unsigned int jid,
+                                 unsigned int message)
+{
+}
+/*
+ * Operations table for the "lock_nolock" protocol. It is registered by
+ * init_nolock() and installed into lockstruct->ls_ops by nolock_mount().
+ */
+static const struct lm_lockops nolock_ops = {
+        .lm_proto_name = "lock_nolock",
+        .lm_mount = nolock_mount,
+        .lm_others_may_mount = nolock_others_may_mount,
+        .lm_unmount = nolock_unmount,
+        .lm_withdraw = nolock_withdraw,
+        .lm_get_lock = nolock_get_lock,
+        .lm_put_lock = nolock_put_lock,
+        .lm_lock = nolock_lock,
+        .lm_unlock = nolock_unlock,
+        .lm_cancel = nolock_cancel,
+        .lm_hold_lvb = nolock_hold_lvb,
+        .lm_unhold_lvb = nolock_unhold_lvb,
+        .lm_plock_get = nolock_plock_get,
+        .lm_plock = nolock_plock,
+        .lm_punlock = nolock_punlock,
+        .lm_recovery_done = nolock_recovery_done,
+        .lm_owner = THIS_MODULE,
+};
+/*
+ * Module init: register nolock_ops as a GFS2 lock protocol.
+ * Returns 0 on success or the error from gfs2_register_lockproto().
+ */
+static int __init init_nolock(void)
+{
+        int error = gfs2_register_lockproto(&nolock_ops);
+        if (!error) {
+                printk(KERN_INFO
+                       "Lock_Nolock (built %s %s) installed\n",
+                       __DATE__, __TIME__);
+                return 0;
+        }
+        printk(KERN_WARNING
+               "lock_nolock: can't register protocol: %d\n", error);
+        return error;
+}
+/* Module exit: unregister the protocol registered by init_nolock(). */
+static void __exit exit_nolock(void)
+{
+        gfs2_unregister_lockproto(&nolock_ops);
+}
+module_init(init_nolock);
+module_exit(exit_nolock);
+MODULE_DESCRIPTION("GFS Nolock Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..554fe5bd1b72
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "util.h"
+#include "dir.h"
+#define PULL 1
+/**
+ * gfs2_struct2blk - number of log blocks needed for an array of structures
+ * @sdp: the filesystem
+ * @nstruct: the number of structures
+ * @ssize: the size of the structures
+ *
+ * The first block loses sizeof(struct gfs2_log_descriptor) bytes of its
+ * sb_bsize to a header; every further block loses
+ * sizeof(struct gfs2_meta_header) bytes. The remainder of each block is
+ * filled with whole structures of @ssize bytes.
+ *
+ * Returns: the number of blocks needed (minimum is always 1)
+ */
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+                             unsigned int ssize)
+{
+        unsigned int per_first, per_next;
+        per_first = (sdp->sd_sb.sb_bsize -
+                     sizeof(struct gfs2_log_descriptor)) / ssize;
+        /* Everything fits behind the descriptor in a single block */
+        if (nstruct <= per_first)
+                return 1;
+        per_next = (sdp->sd_sb.sb_bsize -
+                    sizeof(struct gfs2_meta_header)) / ssize;
+        return 1 + DIV_ROUND_UP(nstruct - per_first, per_next);
+}
+/**
+ * gfs2_ail1_start_one - Start I/O on a part of the AIL
+ * @sdp: the filesystem
+ * @ai: the part of the AIL
+ *
+ * Must be called with sd_log_lock held (enforced by the BUG_ON below).
+ * The lock is dropped and retaken around gfs2_io_error_bh() and around
+ * the wait_on_buffer()/ll_rw_block() pair; after the latter the walk of
+ * ai_ail1_list is restarted.
+ *
+ * Buffers that are no longer busy are moved to ai_ail2_list; busy but
+ * clean buffers are skipped; busy and dirty buffers are submitted for
+ * writing.
+ */
+static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct gfs2_bufdata *bd, *s;
+        struct buffer_head *bh;
+        int retry;
+        BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
+        do {
+                retry = 0;
+                list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+                                                 bd_ail_st_list) {
+                        bh = bd->bd_bh;
+                        gfs2_assert(sdp, bd->bd_ail == ai);
+                        if (!buffer_busy(bh)) {
+                                if (!buffer_uptodate(bh)) {
+                                        gfs2_log_unlock(sdp);
+                                        gfs2_io_error_bh(sdp, bh);
+                                        gfs2_log_lock(sdp);
+                                }
+                                list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+                                continue;
+                        }
+                        if (!buffer_dirty(bh))
+                                continue;
+                        list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+                        gfs2_log_unlock(sdp);
+                        wait_on_buffer(bh);
+                        ll_rw_block(WRITE, 1, &bh);
+                        gfs2_log_lock(sdp);
+                        retry = 1;
+                        break;
+                }
+        } while (retry);
+}
+/**
+ * gfs2_ail1_empty_one - move finished buffers of one AIL entry to its ail2 list
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ * @flags: with DIO_ALL, busy buffers are skipped and the walk continues;
+ *         without it, the walk stops at the first busy buffer
+ *
+ * Returns: non-zero if ai_ail1_list is empty afterwards
+ */
+static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
+{
+        struct gfs2_bufdata *bd, *next;
+        struct buffer_head *bh;
+        list_for_each_entry_safe_reverse(bd, next, &ai->ai_ail1_list,
+                                         bd_ail_st_list) {
+                bh = bd->bd_bh;
+                gfs2_assert(sdp, bd->bd_ail == ai);
+                if (buffer_busy(bh)) {
+                        if (!(flags & DIO_ALL))
+                                break;
+                        continue;
+                }
+                /* No longer busy but not up to date: report an I/O error */
+                if (!buffer_uptodate(bh))
+                        gfs2_io_error_bh(sdp, bh);
+                list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+        }
+        return list_empty(&ai->ai_ail1_list);
+}
+/**
+ * gfs2_ail1_start - start I/O on entries of the filesystem's ail1 list
+ * @sdp: the filesystem
+ * @flags: with DIO_ALL, every entry not yet stamped with this pass's
+ *         sync generation is started; otherwise the loop ends as soon as
+ *         the entry at head->prev has changed or has been emptied
+ *
+ * Takes sd_log_lock for the duration; gfs2_ail1_start_one() may drop and
+ * retake it. Each entry is stamped with the current sync generation
+ * before I/O is started on it, so it is not picked again by this pass.
+ */
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+{
+        struct list_head *head = &sdp->sd_ail1_list;
+        u64 sync_gen;
+        struct list_head *first;
+        struct gfs2_ail *first_ai, *ai, *tmp;
+        int done = 0;
+        gfs2_log_lock(sdp);
+        if (list_empty(head)) {
+                gfs2_log_unlock(sdp);
+                return;
+        }
+        sync_gen = sdp->sd_ail_sync_gen++;
+        first = head->prev;
+        first_ai = list_entry(first, struct gfs2_ail, ai_list);
+        first_ai->ai_sync_gen = sync_gen;
+        gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
+        if (flags & DIO_ALL)
+                first = NULL;
+        while(!done) {
+                if (first && (head->prev != first ||
+                              gfs2_ail1_empty_one(sdp, first_ai, 0)))
+                        break;
+                done = 1;
+                list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+                        if (ai->ai_sync_gen >= sync_gen)
+                                continue;
+                        ai->ai_sync_gen = sync_gen;
+                        gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+                        done = 0;
+                        break;
+                }
+        }
+        gfs2_log_unlock(sdp);
+}
+/**
+ * gfs2_ail1_empty - move fully emptied AIL entries from ail1 to ail2
+ * @sdp: the filesystem
+ * @flags: passed to gfs2_ail1_empty_one(); without DIO_ALL the walk stops
+ *         at the first entry that still has buffers on its ail1 list
+ *
+ * Returns: non-zero if sd_ail1_list is empty afterwards
+ */
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+{
+        struct gfs2_ail *ai, *next;
+        int empty;
+        gfs2_log_lock(sdp);
+        list_for_each_entry_safe_reverse(ai, next, &sdp->sd_ail1_list, ai_list) {
+                if (!gfs2_ail1_empty_one(sdp, ai, flags)) {
+                        if (flags & DIO_ALL)
+                                continue;
+                        break;
+                }
+                list_move(&ai->ai_list, &sdp->sd_ail2_list);
+        }
+        empty = list_empty(&sdp->sd_ail1_list);
+        gfs2_log_unlock(sdp);
+        return empty;
+}
+/**
+ * gfs2_ail2_empty_one - release every buffer on an AIL entry's ail2 list
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ * Each buffer is detached from @ai, unlinked from both of its AIL lists,
+ * its glock's gl_ail_count is decremented and the buffer head is released
+ * with brelse().
+ */
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *head = &ai->ai_ail2_list;
+        struct gfs2_bufdata *bd;
+        while (!list_empty(head)) {
+                bd = list_entry(head->prev, struct gfs2_bufdata,
+                                bd_ail_st_list);
+                gfs2_assert(sdp, bd->bd_ail == ai);
+                bd->bd_ail = NULL;
+                list_del(&bd->bd_ail_st_list);
+                list_del(&bd->bd_ail_gl_list);
+                atomic_dec(&bd->bd_gl->gl_ail_count);
+                brelse(bd->bd_bh);
+        }
+}
+/**
+ * ail2_empty - free ail2 entries that the log tail has moved past
+ * @sdp: the filesystem
+ * @new_tail: the new log tail block
+ *
+ * An entry is freed when its ai_first lies in the half-open range from
+ * the current sd_log_tail up to @new_tail. If @new_tail is numerically
+ * below the old tail, the range is treated as wrapping around.
+ */
+static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
+{
+        struct gfs2_ail *ai, *safe;
+        unsigned int old_tail = sdp->sd_log_tail;
+        int wrap = (new_tail < old_tail);
+        int a, b, rm;
+        gfs2_log_lock(sdp);
+        list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
+                a = (old_tail <= ai->ai_first);
+                b = (ai->ai_first < new_tail);
+                rm = (wrap) ? (a || b) : (a && b);
+                if (!rm)
+                        continue;
+                gfs2_ail2_empty_one(sdp, ai);
+                list_del(&ai->ai_list);
+                gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
+                gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
+                kfree(ai);
+        }
+        gfs2_log_unlock(sdp);
+}
+/**
+ * gfs2_log_reserve - Make a log reservation
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks to reserve
+ *
+ * Loops, flushing the log and pushing the AIL, until sd_log_blks_free
+ * exceeds @blks, then subtracts @blks from it. On success the function
+ * returns with sd_log_flush_lock held for reading.
+ *
+ * Returns: 0 on success, -EINVAL if @blks is zero or larger than the
+ * journal (jd_blocks)
+ */
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
+{
+        unsigned int try = 0;
+        if (gfs2_assert_warn(sdp, blks) ||
+            gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
+                return -EINVAL;
+        mutex_lock(&sdp->sd_log_reserve_mutex);
+        gfs2_log_lock(sdp);
+        while(sdp->sd_log_blks_free <= blks) {
+                gfs2_log_unlock(sdp);
+                gfs2_ail1_empty(sdp, 0);
+                gfs2_log_flush(sdp, NULL);
+                if (try++)
+                        gfs2_ail1_start(sdp, 0);
+                gfs2_log_lock(sdp);
+        }
+        sdp->sd_log_blks_free -= blks;
+        gfs2_log_unlock(sdp);
+        mutex_unlock(&sdp->sd_log_reserve_mutex);
+        down_read(&sdp->sd_log_flush_lock);
+        return 0;
+}
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ * Adds @blks back to sd_log_blks_free and drops the read hold on
+ * sd_log_flush_lock.
+ */
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+        gfs2_log_lock(sdp);
+        sdp->sd_log_blks_free += blks;
+        gfs2_assert_withdraw(sdp,
+                             sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+        gfs2_log_unlock(sdp);
+        up_read(&sdp->sd_log_flush_lock);
+}
+/**
+ * log_bmap - look up the block that backs a journal block
+ * @sdp: The GFS2 superblock
+ * @lbn: block number within the journal inode
+ *
+ * Maps @lbn through gfs2_block_map() on the journal inode. A mapping
+ * error or a zero block number is logged and triggers a withdraw
+ * assertion.
+ *
+ * Returns: the b_blocknr reported by gfs2_block_map()
+ */
+static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
+{
+        int error;
+        /* Start from a known state so the error path never prints garbage */
+        struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+        error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, 0, &bh_map, 1);
+        if (error || !bh_map.b_blocknr)
+                /* b_blocknr is a sector_t, whose width varies; cast for %llu */
+                printk(KERN_INFO "error=%d, dbn=%llu lbn=%u\n", error,
+                       (unsigned long long)bh_map.b_blocknr, lbn);
+        gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
+        return bh_map.b_blocknr;
+}
+/**
+ * log_distance - Compute distance between two journal blocks
+ * @sdp: The GFS2 superblock
+ * @newer: The most recent journal block of the pair
+ * @older: The older journal block of the pair
+ *
+ *   The distance is measured in the journal direction: when @newer is
+ *   numerically below @older, jd_blocks is added to account for the wrap.
+ *
+ * Returns: the distance in blocks
+ */
+static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
+                                        unsigned int older)
+{
+        int delta = newer - older;
+        if (delta >= 0)
+                return delta;
+        delta += sdp->sd_jdesc->jd_blocks;
+        return delta;
+}
+/**
+ * current_tail - find where the log tail currently is
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: ai_first of the entry at sd_ail1_list.prev, or sd_log_head if
+ * the ail1 list is empty
+ */
+static unsigned int current_tail(struct gfs2_sbd *sdp)
+{
+        struct gfs2_ail *oldest;
+        unsigned int tail;
+        gfs2_log_lock(sdp);
+        if (!list_empty(&sdp->sd_ail1_list)) {
+                oldest = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail,
+                                    ai_list);
+                tail = oldest->ai_first;
+        } else {
+                tail = sdp->sd_log_head;
+        }
+        gfs2_log_unlock(sdp);
+        return tail;
+}
+/**
+ * log_incr_head - advance the flush head by one journal block
+ * @sdp: The GFS2 superblock
+ *
+ * If the flush head is at the log tail, it must also be at the log head,
+ * otherwise a withdraw assertion fires. On reaching jd_blocks the flush
+ * head wraps to 0 and sd_log_flush_wrapped is set.
+ */
+static inline void log_incr_head(struct gfs2_sbd *sdp)
+{
+        if (sdp->sd_log_flush_head == sdp->sd_log_tail)
+                gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
+        if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+                sdp->sd_log_flush_head = 0;
+                sdp->sd_log_flush_wrapped = 1;
+        }
+}
+/**
+ * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
+ * @sdp: The GFS2 superblock
+ *
+ * The buffer is taken at the current flush head, zeroed, marked up to
+ * date and clean, and tracked on sd_log_flush_list through a newly
+ * allocated struct gfs2_log_buf. The flush head is then advanced.
+ *
+ * Returns: the buffer_head
+ */
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
+{
+        u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+        struct gfs2_log_buf *lb;
+        struct buffer_head *bh;
+        lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+        list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+        bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
+        lock_buffer(bh);
+        memset(bh->b_data, 0, bh->b_size);
+        set_buffer_uptodate(bh);
+        clear_buffer_dirty(bh);
+        unlock_buffer(bh);
+        log_incr_head(sdp);
+        return bh;
+}
+/**
+ * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
+ * @sdp: the filesystem
+ * @real: the buffer_head whose page and offset the fake one should share
+ *
+ * The fake buffer head points at @real's data but carries the block
+ * number of the current flush head, so writing it sends the data to the
+ * journal. It is tracked on sd_log_flush_list with lb_real set to @real,
+ * and the flush head is then advanced.
+ *
+ * Returns: the newly allocated fake buffer_head
+ */
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+                                      struct buffer_head *real)
+{
+        u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+        struct gfs2_log_buf *lb;
+        struct buffer_head *bh;
+        lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+        list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+        lb->lb_real = real;
+        bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+        atomic_set(&bh->b_count, 1);
+        bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
+        set_bh_page(bh, real->b_page, bh_offset(real));
+        bh->b_blocknr = blkno;
+        bh->b_size = sdp->sd_sb.sb_bsize;
+        bh->b_bdev = sdp->sd_vfs->s_bdev;
+        log_incr_head(sdp);
+        return bh;
+}
+/**
+ * log_pull_tail - move the log tail forward and reclaim the freed blocks
+ * @sdp: The GFS2 superblock
+ * @new_tail: the new tail block
+ * @pull: if non-zero, one block fewer is credited to sd_log_blks_free
+ *
+ * Frees the ail2 entries the tail has moved past, credits the distance
+ * between the old and new tail to sd_log_blks_free, and records
+ * @new_tail in sd_log_tail.
+ */
+static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
+{
+        unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
+        ail2_empty(sdp, new_tail);
+        gfs2_log_lock(sdp);
+        sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
+        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+        gfs2_log_unlock(sdp);
+        sdp->sd_log_tail = new_tail;
+}
+/**
+ * log_write_header - build a journal header and write it synchronously
+ * @sdp: The GFS2 superblock
+ * @flags: value stored in the header's lh_flags field
+ * @pull: passed on to log_pull_tail() when the tail has moved; must be
+ *        zero when the tail has not moved (asserted)
+ *
+ * The header is written at the current flush head with
+ * sync_dirty_buffer(). Afterwards the log tail is pulled forward if it
+ * has moved, sd_log_idle is updated and the flush head is advanced.
+ */
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
+{
+        u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+        struct buffer_head *bh;
+        struct gfs2_log_header *lh;
+        unsigned int tail;
+        u32 hash;
+        bh = sb_getblk(sdp->sd_vfs, blkno);
+        lock_buffer(bh);
+        memset(bh->b_data, 0, bh->b_size);
+        set_buffer_uptodate(bh);
+        clear_buffer_dirty(bh);
+        unlock_buffer(bh);
+        gfs2_ail1_empty(sdp, 0);
+        tail = current_tail(sdp);
+        lh = (struct gfs2_log_header *)bh->b_data;
+        memset(lh, 0, sizeof(struct gfs2_log_header));
+        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
+        lh->lh_flags = cpu_to_be32(flags);
+        lh->lh_tail = cpu_to_be32(tail);
+        lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+        hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
+        lh->lh_hash = cpu_to_be32(hash);
+        set_buffer_dirty(bh);
+        if (sync_dirty_buffer(bh))
+                gfs2_io_error_bh(sdp, bh);
+        brelse(bh);
+        if (sdp->sd_log_tail != tail)
+                log_pull_tail(sdp, tail, pull);
+        else
+                gfs2_assert_withdraw(sdp, !pull);
+        sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+        log_incr_head(sdp);
+}
+/**
+ * log_flush_commit - wait for the queued log buffers, then write a header
+ * @sdp: The GFS2 superblock
+ *
+ * Drains sd_log_flush_list: waits for I/O on each buffer, reports buffers
+ * that are not up to date, and frees them. Fake buffer heads (lb_real
+ * set) are freed once their b_count has dropped back to 1; real ones are
+ * released with brelse(). Finally a log header is written.
+ */
+static void log_flush_commit(struct gfs2_sbd *sdp)
+{
+        struct list_head *head = &sdp->sd_log_flush_list;
+        struct gfs2_log_buf *lb;
+        struct buffer_head *bh;
+        while (!list_empty(head)) {
+                lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
+                list_del(&lb->lb_list);
+                bh = lb->lb_bh;
+                wait_on_buffer(bh);
+                if (!buffer_uptodate(bh))
+                        gfs2_io_error_bh(sdp, bh);
+                if (lb->lb_real) {
+                        while (atomic_read(&bh->b_count) != 1)  /* Grrrr... */
+                                schedule();
+                        free_buffer_head(bh);
+                } else
+                        brelse(bh);
+                kfree(lb);
+        }
+        log_write_header(sdp, 0, 0);
+}
+/**
+ * gfs2_log_flush - flush incore transaction(s)
+ * @sdp: the filesystem
+ * @gl: The glock structure to flush.  If NULL, flush the whole incore log
+ *
+ * Runs with sd_log_flush_lock held for writing. When @gl is given and its
+ * log element is not on any list, the function returns without flushing.
+ * Otherwise a new AIL entry is allocated for this flush; it is added to
+ * sd_ail1_list only if it ended up with buffers on its ail1 list, and is
+ * freed otherwise.
+ */
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+{
+        struct gfs2_ail *ai;
+        down_write(&sdp->sd_log_flush_lock);
+        if (gl) {
+                gfs2_log_lock(sdp);
+                if (list_empty(&gl->gl_le.le_list)) {
+                        gfs2_log_unlock(sdp);
+                        up_write(&sdp->sd_log_flush_lock);
+                        return;
+                }
+                gfs2_log_unlock(sdp);
+        }
+        ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+        INIT_LIST_HEAD(&ai->ai_ail1_list);
+        INIT_LIST_HEAD(&ai->ai_ail2_list);
+        gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
+        gfs2_assert_withdraw(sdp,
+                        sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
+        sdp->sd_log_flush_head = sdp->sd_log_head;
+        sdp->sd_log_flush_wrapped = 0;
+        ai->ai_first = sdp->sd_log_flush_head;
+        lops_before_commit(sdp);
+        if (!list_empty(&sdp->sd_log_flush_list))
+                log_flush_commit(sdp);
+        else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
+                log_write_header(sdp, 0, PULL);
+        lops_after_commit(sdp, ai);
+        sdp->sd_log_head = sdp->sd_log_flush_head;
+        sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
+        sdp->sd_log_blks_reserved = 0;
+        sdp->sd_log_commited_buf = 0;
+        sdp->sd_log_num_hdrs = 0;
+        sdp->sd_log_commited_revoke = 0;
+        gfs2_log_lock(sdp);
+        if (!list_empty(&ai->ai_ail1_list)) {
+                list_add(&ai->ai_list, &sdp->sd_ail1_list);
+                ai = NULL;
+        }
+        gfs2_log_unlock(sdp);
+        sdp->sd_vfs->s_dirt = 0;
+        up_write(&sdp->sd_log_flush_lock);
+        kfree(ai);
+}
+/**
+ * log_refund - return the unused part of a transaction's log reservation
+ * @sdp: the filesystem
+ * @tr: the transaction being committed
+ *
+ * Folds the transaction's buffer and revoke counts into the committed
+ * totals and recomputes how many blocks must stay reserved: the committed
+ * buffers, plus the blocks needed for the committed revokes, plus one
+ * more if that sum is non-zero. sd_log_blks_free is then credited with
+ * tr_reserved minus the growth of the reservation.
+ */
+static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+        unsigned int reserved = 0;
+        unsigned int old;
+        gfs2_log_lock(sdp);
+        sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
+        gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
+        sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
+        gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
+        if (sdp->sd_log_commited_buf)
+                reserved += sdp->sd_log_commited_buf;
+        if (sdp->sd_log_commited_revoke)
+                reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
+                                            sizeof(u64));
+        if (reserved)
+                reserved++;
+        old = sdp->sd_log_blks_free;
+        sdp->sd_log_blks_free += tr->tr_reserved -
+                                 (reserved - sdp->sd_log_blks_reserved);
+        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
+        gfs2_assert_withdraw(sdp,
+                             sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
+                             sdp->sd_log_num_hdrs);
+        sdp->sd_log_blks_reserved = reserved;
+        gfs2_log_unlock(sdp);
+}
+/**
+ * gfs2_log_commit - Commit a transaction to the log
+ * @sdp: the filesystem
+ * @tr: the transaction
+ *
+ * Refunds the unused reservation, commits the transaction in core, marks
+ * the superblock dirty and drops the read hold on sd_log_flush_lock
+ * (presumably the one taken by gfs2_log_reserve() -- confirm against the
+ * callers). If more than gt_incore_log_blocks buffers are now pending,
+ * the whole log is flushed.
+ */
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+        log_refund(sdp, tr);
+        lops_incore_commit(sdp, tr);
+        sdp->sd_vfs->s_dirt = 1;
+        up_read(&sdp->sd_log_flush_lock);
+        gfs2_log_lock(sdp);
+        if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
+                gfs2_log_unlock(sdp);
+                gfs2_log_flush(sdp, NULL);
+        } else {
+                gfs2_log_unlock(sdp);
+        }
+}
+/**
+ * gfs2_log_shutdown - write a shutdown header into a journal
+ * @sdp: the filesystem
+ *
+ * Asserts that nothing is reserved or pending in the log and that the
+ * ail1 list is empty, writes a header flagged GFS2_LOG_HEAD_UNMOUNT, and
+ * then sets both the log head and the log tail to the new flush head.
+ * Runs with sd_log_flush_lock held for writing.
+ */
+void gfs2_log_shutdown(struct gfs2_sbd *sdp)
+{
+        down_write(&sdp->sd_log_flush_lock);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
+        gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
+        sdp->sd_log_flush_head = sdp->sd_log_head;
+        sdp->sd_log_flush_wrapped = 0;
+        log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
+        gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+        gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
+        gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
+        sdp->sd_log_head = sdp->sd_log_flush_head;
+        sdp->sd_log_tail = sdp->sd_log_head;
+        up_write(&sdp->sd_log_flush_lock);
+}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __LOG_DOT_H__
+#define __LOG_DOT_H__
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include "incore.h"
+/**
+ * gfs2_log_lock - acquire the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ * Takes the sd_log_lock spinlock.
+ */
+static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+{
+        spin_lock(&sdp->sd_log_lock);
+}
+/**
+ * gfs2_log_unlock - release the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ * Releases the sd_log_lock spinlock.
+ */
+static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+{
+        spin_unlock(&sdp->sd_log_lock);
+}
+/**
+ * gfs2_log_pointers_init - set the log head and tail to the block after @value
+ * @sdp: the filesystem
+ * @value: a journal block number; the block following it (wrapping to 0
+ *         at jd_blocks) becomes both sd_log_head and sd_log_tail
+ */
+static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+                                          unsigned int value)
+{
+        value++;
+        if (value == sdp->sd_jdesc->jd_blocks)
+                value = 0;
+        sdp->sd_log_head = sdp->sd_log_tail = value;
+}
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+                            unsigned int ssize);
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+                                      struct buffer_head *real);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..881e337b6a70
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+/**
+ * glock_lo_add - queue a glock on the filesystem's log glock list
+ * @sdp: the filesystem
+ * @le: the log element embedded in the glock (gl_le)
+ *
+ * Always marks the current transaction as touched. If the element is not
+ * already on a list and the glock passes the held-exclusively assertion,
+ * a reference is taken on the glock, GLF_DIRTY is set, and the element is
+ * added to sd_log_le_gl under the log lock.
+ */
+static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        struct gfs2_trans *trans = current->journal_info;
+        struct gfs2_glock *gl = container_of(le, struct gfs2_glock, gl_le);
+        trans->tr_touched = 1;
+        if (list_empty(&le->le_list) &&
+            !gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) {
+                gfs2_glock_hold(gl);
+                set_bit(GLF_DIRTY, &gl->gl_flags);
+                gfs2_log_lock(sdp);
+                sdp->sd_log_num_gl += 1;
+                list_add(&le->le_list, &sdp->sd_log_le_gl);
+                gfs2_log_unlock(sdp);
+        }
+}
+/**
+ * glock_lo_after_commit - empty the log glock list after a commit
+ * @sdp: the filesystem
+ * @ai: unused by this hook
+ *
+ * Unlinks every glock from sd_log_le_gl, decrementing sd_log_num_gl and
+ * dropping one glock reference per entry, then warns if the counter did
+ * not reach zero.
+ */
+static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *pending = &sdp->sd_log_le_gl;
+        for (;;) {
+                struct gfs2_glock *gl;
+                if (list_empty(pending))
+                        break;
+                gl = list_entry(pending->next, struct gfs2_glock,
+                                gl_le.le_list);
+                list_del_init(&gl->gl_le.le_list);
+                sdp->sd_log_num_gl -= 1;
+                gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
+                gfs2_glock_put(gl);
+        }
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
+}
+/**
+ * buf_lo_add - add a metadata buffer to the current transaction and the log
+ * @sdp: the filesystem
+ * @le: the log element embedded in the buffer's gfs2_bufdata (bd_le)
+ *
+ * A bufdata already linked on a transaction list is left alone. Otherwise
+ * it is linked onto the current transaction; if it is not yet on the log's
+ * buffer list either, its glock is added to the transaction, the buffer is
+ * checked and pinned, and the element is queued on sd_log_le_buf under the
+ * log lock.
+ */
+static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        struct gfs2_bufdata *bufdata = container_of(le, struct gfs2_bufdata,
+                                                    bd_le);
+        struct gfs2_trans *trans = current->journal_info;
+        if (list_empty(&bufdata->bd_list_tr)) {
+                trans->tr_touched = 1;
+                trans->tr_num_buf += 1;
+                list_add(&bufdata->bd_list_tr, &trans->tr_list_buf);
+                if (list_empty(&le->le_list)) {
+                        gfs2_trans_add_gl(bufdata->bd_gl);
+                        gfs2_meta_check(sdp, bufdata->bd_bh);
+                        gfs2_pin(sdp, bufdata->bd_bh);
+                        gfs2_log_lock(sdp);
+                        sdp->sd_log_num_buf += 1;
+                        list_add(&le->le_list, &sdp->sd_log_le_buf);
+                        gfs2_log_unlock(sdp);
+                        trans->tr_num_buf_new += 1;
+                }
+        }
+}
+/**
+ * buf_lo_incore_commit - detach all buffers from a transaction
+ * @sdp: the filesystem
+ * @tr: the transaction whose tr_list_buf is emptied
+ *
+ * Each bufdata is unlinked from the transaction list and tr_num_buf is
+ * decremented; a warning is raised if the count is non-zero afterwards.
+ */
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+        struct list_head *queue = &tr->tr_list_buf;
+        for (;;) {
+                struct gfs2_bufdata *bufdata;
+                if (list_empty(queue))
+                        break;
+                bufdata = list_entry(queue->next, struct gfs2_bufdata,
+                                     bd_list_tr);
+                list_del_init(&bufdata->bd_list_tr);
+                tr->tr_num_buf -= 1;
+        }
+        gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
+/**
+ * buf_lo_before_commit - write the queued metadata buffers into the log
+ * @sdp: the filesystem
+ *
+ * The buffers on sd_log_le_buf are written in batches of at most "limit"
+ * entries. For each batch a GFS2_LOG_DESC_METADATA descriptor block is
+ * filled with the batch's block numbers and submitted, followed by the
+ * buffers themselves (via gfs2_log_fake_buf()). Two cursors, bd1 and bd2,
+ * walk the same list: bd1 supplies block numbers for the descriptor, bd2
+ * supplies the buffers that are written after it.
+ */
+static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+        struct buffer_head *bh;
+        struct gfs2_log_descriptor *ld;
+        struct gfs2_bufdata *bd1 = NULL, *bd2;
+        unsigned int total = sdp->sd_log_num_buf;
+        unsigned int offset = sizeof(struct gfs2_log_descriptor);
+        unsigned int limit;
+        unsigned int num;
+        unsigned n;
+        __be64 *ptr;
+        /* Round the start of the block-number array up to a __be64 boundary */
+        offset += sizeof(__be64) - 1;
+        offset &= ~(sizeof(__be64) - 1);
+        /* Number of __be64 block numbers that fit after the descriptor */
+        limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+        /* for 4k blocks, limit = 503 */
+        /* Both cursors start at the list head (bd1 is NULL here) */
+        bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
+        while(total) {
+                /* Size of this batch: whatever is left, capped at limit */
+                num = total;
+                if (total > limit)
+                        num = limit;
+                bh = gfs2_log_get_buf(sdp);
+                sdp->sd_log_num_hdrs++;
+                ld = (struct gfs2_log_descriptor *)bh->b_data;
+                ptr = (__be64 *)(bh->b_data + offset);
+                ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+                ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+                ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+                ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
+                /* Length counts the descriptor block plus num data blocks */
+                ld->ld_length = cpu_to_be32(num + 1);
+                ld->ld_data1 = cpu_to_be32(num);
+                ld->ld_data2 = cpu_to_be32(0);
+                memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+                /* First walk: record the block number of each buffer */
+                n = 0;
+                list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
+                                             bd_le.le_list) {
+                        *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+                        if (++n >= num)
+                                break;
+                }
+                set_buffer_dirty(bh);
+                ll_rw_block(WRITE, 1, &bh);
+                /* Second walk: submit the same buffers after the descriptor */
+                n = 0;
+                list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
+                                             bd_le.le_list) {
+                        bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+                        set_buffer_dirty(bh);
+                        ll_rw_block(WRITE, 1, &bh);
+                        if (++n >= num)
+                                break;
+                }
+                total -= num;
+        }
+}
+/**
+ * buf_lo_after_commit - unpin and unqueue the logged metadata buffers
+ * @sdp: the filesystem
+ * @ai: passed through to gfs2_unpin() for every buffer
+ *
+ * Drains sd_log_le_buf, decrementing sd_log_num_buf per entry, and warns
+ * if the counter is not zero once the list is empty.
+ */
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *queue = &sdp->sd_log_le_buf;
+        for (;;) {
+                struct gfs2_bufdata *bufdata;
+                if (list_empty(queue))
+                        break;
+                bufdata = list_entry(queue->next, struct gfs2_bufdata,
+                                     bd_le.le_list);
+                list_del_init(&bufdata->bd_le.le_list);
+                sdp->sd_log_num_buf -= 1;
+                gfs2_unpin(sdp, bufdata->bd_bh, ai);
+        }
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
+}
+/**
+ * buf_lo_before_scan - reset the replay block counters
+ * @jd: the journal about to be scanned
+ * @head: unused by this hook
+ * @pass: the recovery pass number; only pass 0 resets the counters
+ */
+static void buf_lo_before_scan(struct gfs2_jdesc *jd,
+                               struct gfs2_log_header *head, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        if (pass == 0) {
+                sdp->sd_found_blocks = 0;
+                sdp->sd_replayed_blocks = 0;
+        }
+}
+/**
+ * buf_lo_scan_elements - replay the metadata blocks of one log descriptor
+ * @jd: the journal being replayed
+ * @start: journal position; advanced by one before the first block is read
+ * @ld: the log descriptor; ld_data1 gives the number of blocks
+ * @ptr: array of big-endian block numbers, one per logged block
+ * @pass: the recovery pass number
+ *
+ * Only acts on pass 1 and on GFS2_LOG_DESC_METADATA descriptors. Every
+ * block is counted in sd_found_blocks; blocks for which
+ * gfs2_revoke_check() returns non-zero are skipped, the rest are copied
+ * from the journal into a buffer obtained from gfs2_meta_new() and marked
+ * dirty.
+ *
+ * Returns: 0 on success, the error from gfs2_replay_read_block(), or -EIO
+ *          if gfs2_meta_check() rejects a copied block
+ */
+static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+                                struct gfs2_log_descriptor *ld, __be64 *ptr,
+                                int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct gfs2_glock *gl = GFS2_I(jd->jd_inode)->i_gl;
+        unsigned int remaining = be32_to_cpu(ld->ld_data1);
+        struct buffer_head *log_bh, *dest_bh;
+        int error;
+        if (pass != 1)
+                return 0;
+        if (be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
+                return 0;
+        gfs2_replay_incr_blk(sdp, &start);
+        for (; remaining; gfs2_replay_incr_blk(sdp, &start), remaining--) {
+                u64 blkno = be64_to_cpu(*ptr);
+                ptr++;
+                sdp->sd_found_blocks++;
+                if (gfs2_revoke_check(sdp, blkno, start))
+                        continue;
+                error = gfs2_replay_read_block(jd, start, &log_bh);
+                if (error)
+                        return error;
+                dest_bh = gfs2_meta_new(gl, blkno);
+                memcpy(dest_bh->b_data, log_bh->b_data, log_bh->b_size);
+                if (gfs2_meta_check(sdp, dest_bh))
+                        error = -EIO;
+                else
+                        mark_buffer_dirty(dest_bh);
+                brelse(log_bh);
+                brelse(dest_bh);
+                if (error)
+                        return error;
+                sdp->sd_replayed_blocks++;
+        }
+        return 0;
+}
+/**
+ * buf_lo_after_scan - sync replayed metadata and report the totals
+ * @jd: the journal that was scanned
+ * @error: non-zero if the scan failed
+ * @pass: the recovery pass number
+ *
+ * On error the journal inode's glock metadata is synced and nothing is
+ * reported. Without an error, only pass 1 syncs and logs the counters.
+ */
+static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        if (!error && pass != 1)
+                return;
+        gfs2_meta_sync(ip->i_gl);
+        if (error)
+                return;
+        fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
+                jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+/**
+ * revoke_lo_add - queue a revoke on the filesystem's revoke list
+ * @sdp: the filesystem
+ * @le: the log element of the revoke
+ *
+ * Marks the current transaction as touched, bumps its revoke count, and
+ * links the element onto sd_log_le_revoke under the log lock.
+ */
+static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        struct gfs2_trans *trans = current->journal_info;
+        trans->tr_touched = 1;
+        trans->tr_num_revoke += 1;
+        gfs2_log_lock(sdp);
+        sdp->sd_log_num_revoke += 1;
+        list_add(&le->le_list, &sdp->sd_log_le_revoke);
+        gfs2_log_unlock(sdp);
+}
+/**
+ * revoke_lo_before_commit - write the queued revokes into the log
+ * @sdp: the filesystem
+ *
+ * Does nothing when no revokes are queued. Otherwise a
+ * GFS2_LOG_DESC_REVOKE descriptor block is started and the revoked block
+ * numbers are appended as big-endian 64-bit values. When a block is full
+ * it is submitted and a continuation block with a GFS2_METATYPE_LB header
+ * is started. Each revoke is unlinked and freed as it is written.
+ */
+static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
+{
+        struct list_head *queue = &sdp->sd_log_le_revoke;
+        struct gfs2_log_descriptor *desc;
+        struct buffer_head *bh;
+        unsigned int pos;
+        if (!sdp->sd_log_num_revoke)
+                return;
+        bh = gfs2_log_get_buf(sdp);
+        desc = (struct gfs2_log_descriptor *)bh->b_data;
+        desc->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+        desc->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+        desc->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+        desc->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
+        desc->ld_length = cpu_to_be32(gfs2_struct2blk(sdp,
+                                                      sdp->sd_log_num_revoke,
+                                                      sizeof(u64)));
+        desc->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
+        desc->ld_data2 = cpu_to_be32(0);
+        memset(desc->ld_reserved, 0, sizeof(desc->ld_reserved));
+        pos = sizeof(struct gfs2_log_descriptor);
+        for (;;) {
+                struct gfs2_revoke *revoke;
+                __be64 *slot;
+                if (list_empty(queue))
+                        break;
+                revoke = list_entry(queue->next, struct gfs2_revoke,
+                                    rv_le.le_list);
+                list_del_init(&revoke->rv_le.le_list);
+                sdp->sd_log_num_revoke -= 1;
+                if (pos + sizeof(u64) > sdp->sd_sb.sb_bsize) {
+                        struct gfs2_meta_header *cont;
+                        /* Block is full: flush it and open a continuation */
+                        set_buffer_dirty(bh);
+                        ll_rw_block(WRITE, 1, &bh);
+                        bh = gfs2_log_get_buf(sdp);
+                        cont = (struct gfs2_meta_header *)bh->b_data;
+                        cont->mh_magic = cpu_to_be32(GFS2_MAGIC);
+                        cont->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
+                        cont->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
+                        pos = sizeof(struct gfs2_meta_header);
+                }
+                slot = (__be64 *)(bh->b_data + pos);
+                *slot = cpu_to_be64(revoke->rv_blkno);
+                kfree(revoke);
+                pos += sizeof(u64);
+        }
+        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+        set_buffer_dirty(bh);
+        ll_rw_block(WRITE, 1, &bh);
+}
+/**
+ * revoke_lo_before_scan - prepare revoke bookkeeping for a journal scan
+ * @jd: the journal about to be scanned
+ * @head: the log header; its lh_tail is stored in sd_replay_tail
+ * @pass: the recovery pass number; only pass 0 does any work
+ */
+static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
+                                  struct gfs2_log_header *head, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        if (pass == 0) {
+                sdp->sd_found_revokes = 0;
+                sdp->sd_replay_tail = head->lh_tail;
+        }
+}
+/**
+ * revoke_lo_scan_elements - collect the revoke tags of one log descriptor
+ * @jd: the journal being replayed
+ * @start: journal block at which reading begins
+ * @ld: the log descriptor; ld_length is the number of journal blocks to
+ *      read and ld_data1 the number of revoke tags
+ * @ptr: not used by this hook; tags are read from the journal blocks
+ * @pass: the recovery pass number
+ *
+ * Only acts on pass 0 and on GFS2_LOG_DESC_REVOKE descriptors. Tags start
+ * after the descriptor in the first block and after a gfs2_meta_header in
+ * each following block. Every tag is handed to gfs2_revoke_add(); a
+ * positive return from it is counted in sd_found_revokes.
+ *
+ * Returns: 0 on success, otherwise the error from
+ *          gfs2_replay_read_block() or gfs2_revoke_add()
+ */
+static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+                                   struct gfs2_log_descriptor *ld, __be64 *ptr,
+                                   int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        unsigned int blks = be32_to_cpu(ld->ld_length);
+        unsigned int revokes = be32_to_cpu(ld->ld_data1);
+        struct buffer_head *bh;
+        unsigned int offset;
+        u64 blkno;
+        int first = 1;
+        int error;
+        if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
+                return 0;
+        offset = sizeof(struct gfs2_log_descriptor);
+        for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+                error = gfs2_replay_read_block(jd, start, &bh);
+                if (error)
+                        return error;
+                /* Continuation blocks must carry a log-block header */
+                if (!first)
+                        gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
+                while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
+                        blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
+                        error = gfs2_revoke_add(sdp, blkno, start);
+                        if (error < 0) {
+                                /* Drop the journal block before bailing out */
+                                brelse(bh);
+                                return error;
+                        } else if (error)
+                                sdp->sd_found_revokes++;
+                        if (!--revokes)
+                                break;
+                        offset += sizeof(u64);
+                }
+                brelse(bh);
+                offset = sizeof(struct gfs2_meta_header);
+                first = 0;
+        }
+        return 0;
+}
+/**
+ * revoke_lo_after_scan - report and discard the collected revokes
+ * @jd: the journal that was scanned
+ * @error: non-zero if the scan failed
+ * @pass: the recovery pass number
+ *
+ * On error the revoke records are cleaned up without a report. Without an
+ * error, only pass 1 logs the revoke count and cleans up.
+ */
+static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        if (!error) {
+                if (pass != 1)
+                        return;
+                fs_info(sdp, "jid=%u: Found %u revoke tags\n",
+                        jd->jd_jid, sdp->sd_found_revokes);
+        }
+        gfs2_revoke_clean(sdp);
+}
+/**
+ * rg_lo_add - queue a resource group on the filesystem's log rgrp list
+ * @sdp: the filesystem
+ * @le: the log element embedded in the resource group (rd_le)
+ *
+ * Always marks the current transaction as touched. If the element is not
+ * yet on a list, gfs2_rgrp_bh_hold() is called for the resource group and
+ * the element is linked onto sd_log_le_rg under the log lock.
+ */
+static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        struct gfs2_trans *trans = current->journal_info;
+        trans->tr_touched = 1;
+        if (list_empty(&le->le_list)) {
+                struct gfs2_rgrpd *rgrp = container_of(le, struct gfs2_rgrpd,
+                                                       rd_le);
+                gfs2_rgrp_bh_hold(rgrp);
+                gfs2_log_lock(sdp);
+                sdp->sd_log_num_rg += 1;
+                list_add(&le->le_list, &sdp->sd_log_le_rg);
+                gfs2_log_unlock(sdp);
+        }
+}
+/**
+ * rg_lo_after_commit - release the resource groups queued for this commit
+ * @sdp: the filesystem
+ * @ai: unused by this hook
+ *
+ * For every resource group on sd_log_le_rg: unlink it, decrement
+ * sd_log_num_rg, call gfs2_rgrp_repolish_clones() and then
+ * gfs2_rgrp_bh_put(). Warns if the counter is non-zero afterwards.
+ */
+static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *queue = &sdp->sd_log_le_rg;
+        for (;;) {
+                struct gfs2_rgrpd *rgrp;
+                if (list_empty(queue))
+                        break;
+                rgrp = list_entry(queue->next, struct gfs2_rgrpd,
+                                  rd_le.le_list);
+                list_del_init(&rgrp->rd_le.le_list);
+                sdp->sd_log_num_rg -= 1;
+                gfs2_rgrp_repolish_clones(rgrp);
+                gfs2_rgrp_bh_put(rgrp);
+        }
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
+}
+/**
+ * databuf_lo_add - Add a data buffer to the transaction.
+ * @sdp: the filesystem
+ * @le: the log element embedded in the buffer's gfs2_bufdata (bd_le)
+ *
+ * Data buffers arrive here in two distinct situations:
+ * i) Ordered write mode
+ *    The buffer is only queued, so that it can be synced to disk at the
+ *    right time.
+ * ii) Journaled data mode (inode has GFS2_DIF_JDATA set)
+ *    The data block is journaled much like metadata. The difference is
+ *    that each tag consists of two __be64 values: the block number (as
+ *    for metadata) and a flag saying whether the data block needs
+ *    escaping. A new log entry is therefore needed for every 251 or so
+ *    data blocks, which is not an enormous overhead but is twice that of
+ *    ordinary metadata blocks.
+ */
+static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        struct gfs2_bufdata *bufdata = container_of(le, struct gfs2_bufdata,
+                                                    bd_le);
+        struct gfs2_trans *trans = current->journal_info;
+        struct gfs2_inode *ip =
+                GFS2_I(bufdata->bd_bh->b_page->mapping->host);
+        trans->tr_touched = 1;
+        if (list_empty(&bufdata->bd_list_tr)) {
+                if (ip->i_di.di_flags & GFS2_DIF_JDATA) {
+                        /* Journaled data joins the transaction and is pinned */
+                        trans->tr_num_buf += 1;
+                        list_add(&bufdata->bd_list_tr, &trans->tr_list_buf);
+                        gfs2_pin(sdp, bufdata->bd_bh);
+                        trans->tr_num_buf_new += 1;
+                }
+        }
+        gfs2_trans_add_gl(bufdata->bd_gl);
+        gfs2_log_lock(sdp);
+        if (list_empty(&le->le_list)) {
+                if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+                        sdp->sd_log_num_jdata += 1;
+                sdp->sd_log_num_databuf += 1;
+                list_add(&le->le_list, &sdp->sd_log_le_databuf);
+        }
+        gfs2_log_unlock(sdp);
+}
+/**
+ * gfs2_check_magic - test whether a buffer starts with the GFS2 magic number
+ * @bh: the buffer to inspect
+ *
+ * Temporarily maps the buffer's page and compares the 32-bit word at the
+ * buffer's offset within the page against big-endian GFS2_MAGIC.
+ *
+ * Returns: 1 if the buffer begins with GFS2_MAGIC, 0 otherwise
+ */
+static int gfs2_check_magic(struct buffer_head *bh)
+{
+        struct page *page = bh->b_page;
+        void *kaddr;
+        __be32 *ptr;
+        int rv = 0;
+        kaddr = kmap_atomic(page, KM_USER0);
+        ptr = kaddr + bh_offset(bh);
+        if (*ptr == cpu_to_be32(GFS2_MAGIC))
+                rv = 1;
+        /* kunmap_atomic() takes the mapped address, not the struct page */
+        kunmap_atomic(kaddr, KM_USER0);
+        return rv;
+}
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ * @sdp: the filesystem
+ *
+ * Here we scan through the lists of buffers and make the assumption
+ * that any buffer that's been pinned is being journaled, and that
+ * any unpinned buffer is an ordered write data buffer and therefore
+ * will be written back rather than journaled.
+ *
+ * Ordered buffers are moved to a private list, submitted for write if
+ * dirty, and waited on at the end, after which their gfs2_bufdata is
+ * freed. Journaled buffers get a (block number, escape flag) tag in a
+ * GFS2_LOG_DESC_JDATA descriptor block and are then written to the log;
+ * a buffer whose first word equals GFS2_MAGIC is copied and has that
+ * word zeroed in the log copy. Entries with no buffer_head attached are
+ * unlinked and freed.
+ */
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+        LIST_HEAD(started);
+        struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
+        struct buffer_head *bh = NULL;
+        unsigned int offset = sizeof(struct gfs2_log_descriptor);
+        struct gfs2_log_descriptor *ld;
+        unsigned int limit;
+        unsigned int total_dbuf = sdp->sd_log_num_databuf;
+        unsigned int total_jdata = sdp->sd_log_num_jdata;
+        unsigned int num, n;
+        __be64 *ptr = NULL;
+        /* Round the start of the tag array up to a 2 * __be64 boundary */
+        offset += 2*sizeof(__be64) - 1;
+        offset &= ~(2*sizeof(__be64) - 1);
+        /*
+         * NOTE(review): each tag written below occupies two __be64 slots,
+         * but limit is computed in single-__be64 units - confirm that a
+         * full batch cannot run past the end of the descriptor block.
+         */
+        limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+        /*
+         * Start writing ordered buffers, write journaled buffers
+         * into the log along with a header
+         */
+        gfs2_log_lock(sdp);
+        bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
+                                       bd_le.le_list);
+        while(total_dbuf) {
+                num = total_jdata;
+                if (num > limit)
+                        num = limit;
+                n = 0;
+                list_for_each_entry_safe_continue(bd1, bdt,
+                                                  &sdp->sd_log_le_databuf,
+                                                  bd_le.le_list) {
+                        /* An ordered write buffer */
+                        if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
+                                list_move(&bd1->bd_le.le_list, &started);
+                                /* Keep the second cursor off the moved entry */
+                                if (bd1 == bd2) {
+                                        bd2 = NULL;
+                                        bd2 = list_prepare_entry(bd2,
+                                                        &sdp->sd_log_le_databuf,
+                                                        bd_le.le_list);
+                                }
+                                total_dbuf--;
+                                if (bd1->bd_bh) {
+                                        get_bh(bd1->bd_bh);
+                                        if (buffer_dirty(bd1->bd_bh)) {
+                                                gfs2_log_unlock(sdp);
+                                                wait_on_buffer(bd1->bd_bh);
+                                                ll_rw_block(WRITE, 1,
+                                                            &bd1->bd_bh);
+                                                gfs2_log_lock(sdp);
+                                        }
+                                        brelse(bd1->bd_bh);
+                                        continue;
+                                }
+                                continue;
+                        } else if (bd1->bd_bh) { /* A journaled buffer */
+                                int magic;
+                                gfs2_log_unlock(sdp);
+                                /* Start a descriptor block on first use */
+                                if (!bh) {
+                                        bh = gfs2_log_get_buf(sdp);
+                                        sdp->sd_log_num_hdrs++;
+                                        ld = (struct gfs2_log_descriptor *)
+                                             bh->b_data;
+                                        ptr = (__be64 *)(bh->b_data + offset);
+                                        ld->ld_header.mh_magic =
+                                                cpu_to_be32(GFS2_MAGIC);
+                                        ld->ld_header.mh_type =
+                                                cpu_to_be32(GFS2_METATYPE_LD);
+                                        ld->ld_header.mh_format =
+                                                cpu_to_be32(GFS2_FORMAT_LD);
+                                        ld->ld_type =
+                                                cpu_to_be32(GFS2_LOG_DESC_JDATA);
+                                        ld->ld_length = cpu_to_be32(num + 1);
+                                        ld->ld_data1 = cpu_to_be32(num);
+                                        ld->ld_data2 = cpu_to_be32(0);
+                                        memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+                                }
+                                /* Tag: block number, then the escape flag */
+                                magic = gfs2_check_magic(bd1->bd_bh);
+                                *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+                                *ptr++ = cpu_to_be64((__u64)magic);
+                                clear_buffer_escaped(bd1->bd_bh);
+                                if (unlikely(magic != 0))
+                                        set_buffer_escaped(bd1->bd_bh);
+                                gfs2_log_lock(sdp);
+                                /*
+                                 * NOTE(review): the other batch loops stop
+                                 * with ++n >= num; confirm this bound.
+                                 */
+                                if (n++ > num)
+                                        break;
+                        } else if (!bd1->bd_bh) {
+                                /* No buffer attached: drop the bufdata */
+                                total_dbuf--;
+                                sdp->sd_log_num_databuf--;
+                                list_del_init(&bd1->bd_le.le_list);
+                                if (bd1 == bd2) {
+                                        bd2 = NULL;
+                                        bd2 = list_prepare_entry(bd2,
+                                                &sdp->sd_log_le_databuf,
+                                                bd_le.le_list);
+                                }
+                                kmem_cache_free(gfs2_bufdata_cachep, bd1);
+                        }
+                }
+                gfs2_log_unlock(sdp);
+                /* Submit the descriptor block, if one was started */
+                if (bh) {
+                        set_buffer_dirty(bh);
+                        ll_rw_block(WRITE, 1, &bh);
+                        bh = NULL;
+                }
+                n = 0;
+                gfs2_log_lock(sdp);
+                list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
+                                             bd_le.le_list) {
+                        if (!bd2->bd_bh)
+                                continue;
+                        /* copy buffer if it needs escaping */
+                        gfs2_log_unlock(sdp);
+                        if (unlikely(buffer_escaped(bd2->bd_bh))) {
+                                void *kaddr;
+                                struct page *page = bd2->bd_bh->b_page;
+                                bh = gfs2_log_get_buf(sdp);
+                                kaddr = kmap_atomic(page, KM_USER0);
+                                memcpy(bh->b_data,
+                                       kaddr + bh_offset(bd2->bd_bh),
+                                       sdp->sd_sb.sb_bsize);
+                                /* Unmap by mapped address, not by page */
+                                kunmap_atomic(kaddr, KM_USER0);
+                                /* Zero the leading magic in the log copy */
+                                *(__be32 *)bh->b_data = 0;
+                        } else {
+                                bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+                        }
+                        set_buffer_dirty(bh);
+                        ll_rw_block(WRITE, 1, &bh);
+                        gfs2_log_lock(sdp);
+                        if (++n >= num)
+                                break;
+                }
+                bh = NULL;
+                total_dbuf -= num;
+                total_jdata -= num;
+        }
+        gfs2_log_unlock(sdp);
+        /* Wait on all ordered buffers */
+        while (!list_empty(&started)) {
+                gfs2_log_lock(sdp);
+                bd1 = list_entry(started.next, struct gfs2_bufdata,
+                                 bd_le.le_list);
+                list_del_init(&bd1->bd_le.le_list);
+                sdp->sd_log_num_databuf--;
+                bh = bd1->bd_bh;
+                if (bh) {
+                        bh->b_private = NULL;
+                        get_bh(bh);
+                        gfs2_log_unlock(sdp);
+                        wait_on_buffer(bh);
+                        brelse(bh);
+                } else
+                        gfs2_log_unlock(sdp);
+                kmem_cache_free(gfs2_bufdata_cachep, bd1);
+        }
+        /* We've removed all the ordered write bufs here, so only jdata left */
+        gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
+}
+/**
+ * databuf_lo_scan_elements - replay the journaled data of one descriptor
+ * @jd: the journal being replayed
+ * @start: journal position; advanced by one before the first block is read
+ * @ld: the log descriptor; ld_data1 gives the number of blocks
+ * @ptr: array of big-endian tag pairs (block number, escape flag)
+ * @pass: the recovery pass number
+ *
+ * Only acts on pass 1 and on GFS2_LOG_DESC_JDATA descriptors. Blocks for
+ * which gfs2_revoke_check() returns non-zero are skipped. The rest are
+ * copied from the journal into a buffer obtained from gfs2_meta_new();
+ * if the escape flag is set the first word is restored to GFS2_MAGIC.
+ *
+ * Returns: 0 on success or the error from gfs2_replay_read_block()
+ */
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+                                    struct gfs2_log_descriptor *ld,
+                                    __be64 *ptr, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct gfs2_glock *gl = GFS2_I(jd->jd_inode)->i_gl;
+        unsigned int remaining = be32_to_cpu(ld->ld_data1);
+        struct buffer_head *log_bh, *dest_bh;
+        int error;
+        if (pass != 1)
+                return 0;
+        if (be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
+                return 0;
+        gfs2_replay_incr_blk(sdp, &start);
+        for (; remaining; gfs2_replay_incr_blk(sdp, &start), remaining--) {
+                u64 blkno = be64_to_cpu(ptr[0]);
+                u64 escaped = be64_to_cpu(ptr[1]);
+                ptr += 2;
+                sdp->sd_found_blocks++;
+                if (gfs2_revoke_check(sdp, blkno, start))
+                        continue;
+                error = gfs2_replay_read_block(jd, start, &log_bh);
+                if (error)
+                        return error;
+                dest_bh = gfs2_meta_new(gl, blkno);
+                memcpy(dest_bh->b_data, log_bh->b_data, log_bh->b_size);
+                /* Unescape */
+                if (escaped)
+                        *(__be32 *)dest_bh->b_data = cpu_to_be32(GFS2_MAGIC);
+                mark_buffer_dirty(dest_bh);
+                brelse(log_bh);
+                brelse(dest_bh);
+                sdp->sd_replayed_blocks++;
+        }
+        return 0;
+}
+/* FIXME: sort out accounting for log blocks etc. */
+/**
+ * databuf_lo_after_scan - sync replayed data blocks and report the totals
+ * @jd: the journal that was scanned
+ * @error: non-zero if the scan failed
+ * @pass: the recovery pass number
+ *
+ * On error the journal inode's glock metadata is synced and nothing is
+ * reported. Without an error, only pass 1 syncs and logs the counters.
+ */
+static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        if (!error && pass != 1)
+                return;
+        /* data sync? */
+        gfs2_meta_sync(ip->i_gl);
+        if (error)
+                return;
+        fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
+                jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+/**
+ * databuf_lo_after_commit - unpin and unqueue the logged data buffers
+ * @sdp: the filesystem
+ * @ai: passed through to gfs2_unpin() for every buffer
+ *
+ * Drains sd_log_le_databuf, decrementing both sd_log_num_databuf and
+ * sd_log_num_jdata per entry, then warns if either counter is non-zero.
+ */
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *queue = &sdp->sd_log_le_databuf;
+        for (;;) {
+                struct gfs2_bufdata *bufdata;
+                if (list_empty(queue))
+                        break;
+                bufdata = list_entry(queue->next, struct gfs2_bufdata,
+                                     bd_le.le_list);
+                list_del_init(&bufdata->bd_le.le_list);
+                sdp->sd_log_num_databuf -= 1;
+                sdp->sd_log_num_jdata -= 1;
+                gfs2_unpin(sdp, bufdata->bd_bh, ai);
+        }
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
+}
+/* Glock log operations: only the add and after-commit hooks are set. */
+const struct gfs2_log_operations gfs2_glock_lops = {
+        .lo_add = glock_lo_add,
+        .lo_after_commit = glock_lo_after_commit,
+        .lo_name = "glock",
+};
+/* Metadata buffer log operations: commit hooks plus all three scan hooks. */
+const struct gfs2_log_operations gfs2_buf_lops = {
+        .lo_add = buf_lo_add,
+        .lo_incore_commit = buf_lo_incore_commit,
+        .lo_before_commit = buf_lo_before_commit,
+        .lo_after_commit = buf_lo_after_commit,
+        .lo_before_scan = buf_lo_before_scan,
+        .lo_scan_elements = buf_lo_scan_elements,
+        .lo_after_scan = buf_lo_after_scan,
+        .lo_name = "buf",
+};
+/* Revoke log operations: no incore-commit or after-commit hook is set. */
+const struct gfs2_log_operations gfs2_revoke_lops = {
+        .lo_add = revoke_lo_add,
+        .lo_before_commit = revoke_lo_before_commit,
+        .lo_before_scan = revoke_lo_before_scan,
+        .lo_scan_elements = revoke_lo_scan_elements,
+        .lo_after_scan = revoke_lo_after_scan,
+        .lo_name = "revoke",
+};
+/* Resource group log operations: only add and after-commit hooks are set. */
+const struct gfs2_log_operations gfs2_rg_lops = {
+        .lo_add = rg_lo_add,
+        .lo_after_commit = rg_lo_after_commit,
+        .lo_name = "rg",
+};
+/*
+ * Data buffer log operations. The incore-commit hook is shared with the
+ * metadata buffers (buf_lo_incore_commit). Note that lo_after_scan is set
+ * while lo_before_scan is not.
+ */
+const struct gfs2_log_operations gfs2_databuf_lops = {
+        .lo_add = databuf_lo_add,
+        .lo_incore_commit = buf_lo_incore_commit,
+        .lo_before_commit = databuf_lo_before_commit,
+        .lo_after_commit = databuf_lo_after_commit,
+        .lo_scan_elements = databuf_lo_scan_elements,
+        .lo_after_scan = databuf_lo_after_scan,
+        .lo_name = "databuf",
+};
+/* NULL-terminated table of every log operation set, in dispatch order. */
+const struct gfs2_log_operations *gfs2_log_ops[] = {
+        &gfs2_glock_lops,
+        &gfs2_buf_lops,
+        &gfs2_revoke_lops,
+        &gfs2_rg_lops,
+        &gfs2_databuf_lops,
+        NULL,
+};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __LOPS_DOT_H__
+#define __LOPS_DOT_H__
+#include <linux/list.h>
+#include "incore.h"
+extern const struct gfs2_log_operations gfs2_glock_lops;
+extern const struct gfs2_log_operations gfs2_buf_lops;
+extern const struct gfs2_log_operations gfs2_revoke_lops;
+extern const struct gfs2_log_operations gfs2_rg_lops;
+extern const struct gfs2_log_operations gfs2_databuf_lops;
+extern const struct gfs2_log_operations *gfs2_log_ops[];
+/**
+ * lops_init_le - initialise a log element
+ * @le: the log element
+ * @lops: the operations table the element will dispatch through
+ *
+ * Leaves the element with an empty list link and le_ops set to @lops.
+ */
+static inline void lops_init_le(struct gfs2_log_element *le,
+                                const struct gfs2_log_operations *lops)
+{
+        le->le_ops = lops;
+        INIT_LIST_HEAD(&le->le_list);
+}
+/**
+ * lops_add - invoke the lo_add hook of a log element, if it has one
+ * @sdp: the filesystem
+ * @le: the log element
+ */
+static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+        const struct gfs2_log_operations *ops = le->le_ops;
+        if (!ops->lo_add)
+                return;
+        ops->lo_add(sdp, le);
+}
+/**
+ * lops_incore_commit - run every registered lo_incore_commit hook
+ * @sdp: the filesystem
+ * @tr: the transaction being committed
+ *
+ * Walks gfs2_log_ops up to its NULL terminator, skipping unset hooks.
+ */
+static inline void lops_incore_commit(struct gfs2_sbd *sdp,
+                                      struct gfs2_trans *tr)
+{
+        const struct gfs2_log_operations **opp;
+        for (opp = gfs2_log_ops; *opp; opp++) {
+                if (!(*opp)->lo_incore_commit)
+                        continue;
+                (*opp)->lo_incore_commit(sdp, tr);
+        }
+}
+/**
+ * lops_before_commit - run every registered lo_before_commit hook
+ * @sdp: the filesystem
+ *
+ * Walks gfs2_log_ops up to its NULL terminator, skipping unset hooks.
+ */
+static inline void lops_before_commit(struct gfs2_sbd *sdp)
+{
+        const struct gfs2_log_operations **opp;
+        for (opp = gfs2_log_ops; *opp; opp++) {
+                if (!(*opp)->lo_before_commit)
+                        continue;
+                (*opp)->lo_before_commit(sdp);
+        }
+}
+/**
+ * lops_after_commit - run every registered lo_after_commit hook
+ * @sdp: the filesystem
+ * @ai: handed unchanged to each hook
+ *
+ * Walks gfs2_log_ops up to its NULL terminator, skipping unset hooks.
+ */
+static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        const struct gfs2_log_operations **opp;
+        for (opp = gfs2_log_ops; *opp; opp++) {
+                if (!(*opp)->lo_after_commit)
+                        continue;
+                (*opp)->lo_after_commit(sdp, ai);
+        }
+}
+/**
+ * lops_before_scan - run every registered lo_before_scan hook
+ * @jd: the journal about to be scanned
+ * @head: the log header handed to each hook
+ * @pass: the recovery pass number
+ *
+ * Walks gfs2_log_ops up to its NULL terminator, skipping unset hooks.
+ */
+static inline void lops_before_scan(struct gfs2_jdesc *jd,
+                                    struct gfs2_log_header *head,
+                                    unsigned int pass)
+{
+        const struct gfs2_log_operations **opp;
+        for (opp = gfs2_log_ops; *opp; opp++) {
+                if (!(*opp)->lo_before_scan)
+                        continue;
+                (*opp)->lo_before_scan(jd, head, pass);
+        }
+}
+/**
+ * lops_scan_elements - offer a log descriptor to every lo_scan_elements hook
+ * @jd: the journal being scanned
+ * @start: journal position handed to each hook
+ * @ld: the log descriptor
+ * @ptr: the tag array handed to each hook
+ * @pass: the recovery pass number
+ *
+ * Hooks are called in gfs2_log_ops order; the first non-zero result stops
+ * the walk.
+ *
+ * Returns: 0 if every hook returned 0, otherwise the first non-zero result
+ */
+static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+                                     struct gfs2_log_descriptor *ld,
+                                     __be64 *ptr,
+                                     unsigned int pass)
+{
+        const struct gfs2_log_operations **opp;
+        for (opp = gfs2_log_ops; *opp; opp++) {
+                int error;
+                if (!(*opp)->lo_scan_elements)
+                        continue;
+                error = (*opp)->lo_scan_elements(jd, start, ld, ptr, pass);
+                if (error)
+                        return error;
+        }
+        return 0;
+}
+/**
+ * lops_after_scan - run every registered lo_after_scan hook
+ * @jd: the journal that was scanned
+ * @error: the scan result handed to each hook
+ * @pass: the recovery pass number
+ *
+ * Walks gfs2_log_ops up to its NULL terminator. The hook that is tested
+ * is the hook that is called: testing lo_before_scan instead would skip
+ * tables that set only lo_after_scan and would call a NULL pointer for a
+ * table that sets only lo_before_scan.
+ */
+static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
+                                   unsigned int pass)
+{
+        int x;
+        for (x = 0; gfs2_log_ops[x]; x++)
+                if (gfs2_log_ops[x]->lo_after_scan)
+                        gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
+}
+#endif /* __LOPS_DOT_H__ */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/atomic.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "ops_fstype.h"
+#include "sys.h"
+#include "util.h"
+#include "glock.h"
+/**
+ * gfs2_init_inode_once - constructor handed to the "gfs2_inode" slab cache
+ * @foo: the object to set up, used as a struct gfs2_inode
+ * @cachep: not used here
+ * @flags: slab constructor flags
+ *
+ * Only acts when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is clear.
+ */
+static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+        const unsigned long ctor_bits = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
+        struct gfs2_inode *ip = foo;
+        if ((flags & ctor_bits) != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        inode_init_once(&ip->i_inode);
+        spin_lock_init(&ip->i_spin);
+        init_rwsem(&ip->i_rw_mutex);
+        /* Start with an empty indirect-block buffer cache */
+        memset(ip->i_cache, 0, sizeof(ip->i_cache));
+}
+/**
+ * gfs2_init_glock_once - constructor handed to the "gfs2_glock" slab cache
+ * @foo: the object to set up, used as a struct gfs2_glock
+ * @cachep: not used here
+ * @flags: slab constructor flags
+ *
+ * Only acts when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is clear.
+ */
+static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+        const unsigned long ctor_bits = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
+        struct gfs2_glock *gl = foo;
+        if ((flags & ctor_bits) != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        INIT_HLIST_NODE(&gl->gl_list);
+        spin_lock_init(&gl->gl_spin);
+        INIT_LIST_HEAD(&gl->gl_holders);
+        INIT_LIST_HEAD(&gl->gl_waiters1);
+        INIT_LIST_HEAD(&gl->gl_waiters2);
+        INIT_LIST_HEAD(&gl->gl_waiters3);
+        /* No lock value block attached yet */
+        gl->gl_lvb = NULL;
+        atomic_set(&gl->gl_lvb_count, 0);
+        INIT_LIST_HEAD(&gl->gl_reclaim);
+        INIT_LIST_HEAD(&gl->gl_ail_list);
+        atomic_set(&gl->gl_ail_count, 0);
+}
+/**
+ * init_gfs2_fs - Register GFS2 as a filesystem
+ *
+ * Calls gfs2_sys_init() and gfs2_glock_init(), creates the glock, inode
+ * and bufdata slab caches, then registers the gfs2 and gfs2meta filesystem
+ * types.  On failure the caches created so far are destroyed and
+ * gfs2_sys_uninit() is called.
+ *
+ * Returns: 0 on success, error code on failure
+ */
+static int __init init_gfs2_fs(void)
+{
+        int error;
+        error = gfs2_sys_init();
+        if (error)
+                return error;
+        error = gfs2_glock_init();
+        if (error)
+                goto fail;
+        /* Any cache creation failure below is reported as -ENOMEM */
+        error = -ENOMEM;
+        gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
+                                              sizeof(struct gfs2_glock),
+                                              0, 0,
+                                              gfs2_init_glock_once, NULL);
+        if (!gfs2_glock_cachep)
+                goto fail;
+        gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
+                                              sizeof(struct gfs2_inode),
+                                              0, (SLAB_RECLAIM_ACCOUNT|
+                                              SLAB_PANIC|SLAB_MEM_SPREAD),
+                                              gfs2_init_inode_once, NULL);
+        if (!gfs2_inode_cachep)
+                goto fail;
+        /* The bufdata cache has no constructor */
+        gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
+                                                sizeof(struct gfs2_bufdata),
+                                                0, 0, NULL, NULL);
+        if (!gfs2_bufdata_cachep)
+                goto fail;
+        error = register_filesystem(&gfs2_fs_type);
+        if (error)
+                goto fail;
+        error = register_filesystem(&gfs2meta_fs_type);
+        if (error)
+                goto fail_unregister;
+        printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
+        return 0;
+fail_unregister:
+        unregister_filesystem(&gfs2_fs_type);
+fail:
+        /* Only caches that were actually created are destroyed.
+           NOTE(review): nothing here undoes gfs2_glock_init(); confirm it
+           needs no teardown. */
+        if (gfs2_bufdata_cachep)
+                kmem_cache_destroy(gfs2_bufdata_cachep);
+        if (gfs2_inode_cachep)
+                kmem_cache_destroy(gfs2_inode_cachep);
+        if (gfs2_glock_cachep)
+                kmem_cache_destroy(gfs2_glock_cachep);
+        gfs2_sys_uninit();
+        return error;
+}
+/**
+ * exit_gfs2_fs - Unregister the file system
+ *
+ * Unregisters both filesystem types, destroys the three slab caches
+ * created by init_gfs2_fs() and calls gfs2_sys_uninit().
+ */
+static void __exit exit_gfs2_fs(void)
+{
+        unregister_filesystem(&gfs2_fs_type);
+        unregister_filesystem(&gfs2meta_fs_type);
+        /* Caches are destroyed in the reverse of their creation order */
+        kmem_cache_destroy(gfs2_bufdata_cachep);
+        kmem_cache_destroy(gfs2_inode_cachep);
+        kmem_cache_destroy(gfs2_glock_cachep);
+        gfs2_sys_uninit();
+}
+/* Module metadata, and the init/exit entry points defined above */
+MODULE_DESCRIPTION("Global File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+module_init(init_gfs2_fs);
+module_exit(exit_gfs2_fs);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "ops_address.h"
+/**
+ * aspace_get_block - get_block callback given to block_write_full_page()
+ * @inode: the aspace inode
+ * @lblock: not used
+ * @bh_result: not used
+ * @create: not used
+ *
+ * Never maps anything: it raises an assertion warning against the
+ * filesystem found in inode->i_sb->s_fs_info and fails.
+ *
+ * Returns: -EOPNOTSUPP, always
+ */
+static int aspace_get_block(struct inode *inode, sector_t lblock,
+                            struct buffer_head *bh_result, int create)
+{
+        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        gfs2_assert_warn(sdp, 0);
+        return -EOPNOTSUPP;
+}
+/**
+ * gfs2_aspace_writepage - writepage operation of the metadata aspace
+ * @page: the page to write
+ * @wbc: the writeback control
+ *
+ * Returns: the result of block_write_full_page()
+ */
+static int gfs2_aspace_writepage(struct page *page,
+                                 struct writeback_control *wbc)
+{
+        /* aspace_get_block always fails (see its definition above) */
+        return block_write_full_page(page, aspace_get_block, wbc);
+}
+/* Address space operations installed on each aspace by gfs2_aspace_get() */
+static const struct address_space_operations aspace_aops = {
+        .writepage = gfs2_aspace_writepage,
+        .releasepage = gfs2_releasepage,
+};
+/**
+ * gfs2_aspace_get - Allocate an inode to serve as a metadata address space
+ * @sdp: the filesystem the aspace is in
+ *
+ * An aspace is an ordinary struct inode obtained from new_inode(); Linux
+ * may one day offer a lighter-weight address space construct.
+ *
+ * The mapping's gfp mask is set to GFP_NOFS so that pages/buffers in this
+ * aspace aren't in high memory.
+ *
+ * Returns: the aspace, or NULL if new_inode() failed
+ */
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
+{
+        struct inode *inode = new_inode(sdp->sd_vfs);
+        if (!inode)
+                return NULL;
+        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+        inode->i_mapping->a_ops = &aspace_aops;
+        /* Largest possible size */
+        inode->i_size = ~0ULL;
+        inode->i_private = NULL;
+        insert_inode_hash(inode);
+        return inode;
+}
+/**
+ * gfs2_aspace_put - Release an aspace obtained from gfs2_aspace_get()
+ * @aspace: the aspace inode
+ *
+ * Takes the inode out of the inode hash and drops a reference to it.
+ */
+void gfs2_aspace_put(struct inode *aspace)
+{
+        remove_inode_hash(aspace);
+        iput(aspace);
+}
+/**
+ * gfs2_meta_inval - Invalidate all buffers associated with a glock
+ * @gl: the glock
+ *
+ * Truncates every page out of the glock's aspace mapping.  The glock must
+ * have no buffers counted in gl_ail_count on entry, and the mapping must be
+ * empty afterwards; either violation triggers gfs2_assert_withdraw().
+ */
+void gfs2_meta_inval(struct gfs2_glock *gl)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct inode *aspace = gl->gl_aspace;
+        struct address_space *mapping = gl->gl_aspace->i_mapping;
+        gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+        /* i_writecount is held raised for the duration of the truncate.
+           NOTE(review): the reason is not visible here - confirm. */
+        atomic_inc(&aspace->i_writecount);
+        truncate_inode_pages(mapping, 0);
+        atomic_dec(&aspace->i_writecount);
+        gfs2_assert_withdraw(sdp, !mapping->nrpages);
+}
+/**
+ * gfs2_meta_sync - Write out and wait for all buffers of a glock's aspace
+ * @gl: The glock
+ *
+ * Writeback is started on the aspace mapping and then waited for.  Only
+ * the result of the wait is examined; a failure there is reported through
+ * gfs2_io_error().
+ */
+void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+        struct address_space *mapping = gl->gl_aspace->i_mapping;
+        filemap_fdatawrite(mapping);
+        if (filemap_fdatawait(mapping))
+                gfs2_io_error(gl->gl_sbd);
+}
+/**
+ * getbuf - Get a buffer with a given address space
+ * @sdp: the filesystem
+ * @aspace: the address space
+ * @blkno: the block number (filesystem scope)
+ * @create: 1 if the buffer should be created
+ *
+ * With @create set, the page lookup is retried (yielding in between) until
+ * a page is obtained.  Without it, only an existing page is used.
+ *
+ * Returns: the buffer, with a reference taken via get_bh(), or NULL when
+ *          @create is 0 and find_lock_page() found no page
+ */
+static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
+                                  u64 blkno, int create)
+{
+        struct page *page;
+        struct buffer_head *bh;
+        unsigned int shift;
+        unsigned long index;
+        unsigned int bufnum;
+        /* log2 of the number of filesystem blocks per page */
+        shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+        index = blkno >> shift;             /* convert block to page */
+        bufnum = blkno - (index << shift);  /* block buf index within page */
+        if (create) {
+                /* Keep trying until the page cache gives us a page */
+                for (;;) {
+                        page = grab_cache_page(aspace->i_mapping, index);
+                        if (page)
+                                break;
+                        yield();
+                }
+        } else {
+                page = find_lock_page(aspace->i_mapping, index);
+                if (!page)
+                        return NULL;
+        }
+        /* Both lookups above hand back the page locked; it is unlocked
+           below once the buffer has been found and mapped */
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+        /* Locate header for our buffer within our page */
+        for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
+                /* Do nothing */;
+        get_bh(bh);
+        if (!buffer_mapped(bh))
+                map_bh(bh, sdp->sd_vfs, blkno);
+        unlock_page(page);
+        mark_page_accessed(page);
+        /* Drop the page reference taken by the lookup; the bh reference
+           taken by get_bh() stays with the caller */
+        page_cache_release(page);
+        return bh;
+}
+/**
+ * meta_prep_new - Prepare a buffer for use as a brand new metadata block
+ * @bh: the buffer
+ *
+ * Under the buffer lock the buffer is marked clean and uptodate (so it is
+ * not read from disk); the GFS2 magic number is then stored in its
+ * metadata header.
+ */
+static void meta_prep_new(struct buffer_head *bh)
+{
+        struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+        lock_buffer(bh);
+        clear_buffer_dirty(bh);
+        set_buffer_uptodate(bh);
+        unlock_buffer(bh);
+        /* On-disk fields are big-endian */
+        mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+}
+/**
+ * gfs2_meta_new - Get a buffer for a new metadata block
+ * @gl: The glock associated with this block
+ * @blkno: The block number
+ *
+ * The buffer is looked up (or created) in the glock's aspace and prepared
+ * by meta_prep_new(); nothing is read from disk.
+ *
+ * Returns: The buffer
+ */
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
+{
+        struct buffer_head *new_bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno,
+                                            CREATE);
+        meta_prep_new(new_bh);
+        return new_bh;
+}
+/**
+ * gfs2_meta_read - Read a block from disk
+ * @gl: The glock covering the block
+ * @blkno: The block number
+ * @flags: flags; with DIO_WAIT set the read is waited for
+ * @bhp: the place where the buffer is returned (NULL on failure)
+ *
+ * A read is submitted only if the buffer is not already uptodate.  Without
+ * DIO_WAIT the function returns 0 straight after submission and the caller
+ * has to wait for the buffer itself.
+ *
+ * Returns: errno
+ */
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+                   struct buffer_head **bhp)
+{
+        *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+        if (!buffer_uptodate(*bhp))
+                ll_rw_block(READ_META, 1, bhp);
+        if (flags & DIO_WAIT) {
+                int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+                if (error) {
+                        brelse(*bhp);
+                        /* Honour the documented contract instead of leaving
+                           a pointer to the buffer we just released */
+                        *bhp = NULL;
+                        return error;
+                }
+        }
+        return 0;
+}
+/**
+ * gfs2_meta_wait - Wait for I/O on a block to finish
+ * @sdp: the filesystem
+ * @bh: The block to wait for
+ *
+ * Fails with -EIO if SDF_SHUTDOWN is set before or after the wait, or if
+ * the buffer is not uptodate once the wait is over.  In the latter case
+ * gfs2_io_error_bh() is also called when the current transaction has
+ * tr_touched set.
+ *
+ * Returns: errno
+ */
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+        struct gfs2_trans *tr;
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                return -EIO;
+        wait_on_buffer(bh);
+        if (buffer_uptodate(bh)) {
+                /* The I/O worked; re-check for a shutdown during the wait */
+                if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                        return -EIO;
+                return 0;
+        }
+        tr = current->journal_info;
+        if (tr && tr->tr_touched)
+                gfs2_io_error_bh(sdp, bh);
+        return -EIO;
+}
+/**
+ * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to be attached to
+ * @meta: Flag to indicate whether its metadata or not
+ *
+ * Does nothing if the buffer already has b_private set.  For metadata
+ * buffers the check and the attach are done under the page lock; for data
+ * buffers no lock is taken here.
+ */
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+                         int meta)
+{
+        struct gfs2_bufdata *bd;
+        if (meta)
+                lock_page(bh->b_page);
+        if (bh->b_private) {
+                if (meta)
+                        unlock_page(bh->b_page);
+                return;
+        }
+        /* __GFP_NOFAIL: the result is used without a NULL check */
+        bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+        memset(bd, 0, sizeof(struct gfs2_bufdata));
+        bd->bd_bh = bh;
+        bd->bd_gl = gl;
+        INIT_LIST_HEAD(&bd->bd_list_tr);
+        /* Metadata and data buffers use different log operations */
+        if (meta)
+                lops_init_le(&bd->bd_le, &gfs2_buf_lops);
+        else
+                lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
+        bh->b_private = bd;
+        if (meta)
+                unlock_page(bh->b_page);
+}
+/**
+ * gfs2_pin - Pin a buffer in memory
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to be pinned
+ *
+ * Sets the pinned flag, clears the dirty flag and takes a reference on
+ * the buffer.  The journal must be live and the buffer must not already be
+ * pinned; otherwise gfs2_assert_withdraw() fires.  @bh must already have a
+ * gfs2_bufdata attached, as b_private is dereferenced without a check.
+ */
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+        struct gfs2_bufdata *bd = bh->b_private;
+        gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+        /* Pinning an already pinned buffer is a bug */
+        if (test_set_buffer_pinned(bh))
+                gfs2_assert_withdraw(sdp, 0);
+        wait_on_buffer(bh);
+        /* If this buffer is in the AIL and it has already been written
+           to in-place disk block, remove it from the AIL. */
+        gfs2_log_lock(sdp);
+        if (bd->bd_ail && !buffer_in_io(bh))
+                list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+        gfs2_log_unlock(sdp);
+        clear_buffer_dirty(bh);
+        wait_on_buffer(bh);
+        if (!buffer_uptodate(bh))
+                gfs2_io_error_bh(sdp, bh);
+        /* Reference held on behalf of the pin */
+        get_bh(bh);
+}
+/**
+ * gfs2_unpin - Unpin a buffer
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to unpin
+ * @ai: the AIL entry whose ai_ail1_list the buffer is added to
+ *
+ * Marks the buffer dirty, clears the pinned flag and, under the log lock,
+ * records @ai as the buffer's bd_ail.  The buffer must be uptodate and
+ * pinned; otherwise gfs2_assert_withdraw() fires.
+ */
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                struct gfs2_ail *ai)
+{
+        struct gfs2_bufdata *bd = bh->b_private;
+        gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
+        if (!buffer_pinned(bh))
+                gfs2_assert_withdraw(sdp, 0);
+        mark_buffer_dirty(bh);
+        clear_buffer_pinned(bh);
+        gfs2_log_lock(sdp);
+        if (bd->bd_ail) {
+                /* Already attached to an AIL entry: take it off that list
+                   and drop one buffer reference */
+                list_del(&bd->bd_ail_st_list);
+                brelse(bh);
+        } else {
+                /* First time in the AIL: also track it on its glock */
+                struct gfs2_glock *gl = bd->bd_gl;
+                list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
+                atomic_inc(&gl->gl_ail_count);
+        }
+        bd->bd_ail = ai;
+        list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+        gfs2_log_unlock(sdp);
+}
+/**
+ * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
+ * @ip: the inode who owns the buffers
+ * @bstart: the first buffer in the run
+ * @blen: the number of buffers in the run
+ *
+ * Only blocks that already have a page in the glock's aspace are touched
+ * (the lookup uses NO_CREATE).  Each such buffer is unpinned, detached
+ * from the AIL (adding a revoke for its block) and left neither dirty nor
+ * uptodate.
+ */
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct inode *aspace = ip->i_gl->gl_aspace;
+        struct buffer_head *bh;
+        while (blen) {
+                bh = getbuf(sdp, aspace, bstart, NO_CREATE);
+                if (bh) {
+                        struct gfs2_bufdata *bd = bh->b_private;
+                        /* NOTE(review): this branch dereferences bd and tr
+                           unchecked; presumably a pinned buffer always has a
+                           bufdata and we always run inside a transaction -
+                           confirm against callers. */
+                        if (test_clear_buffer_pinned(bh)) {
+                                struct gfs2_trans *tr = current->journal_info;
+                                gfs2_log_lock(sdp);
+                                list_del_init(&bd->bd_le.le_list);
+                                gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
+                                sdp->sd_log_num_buf--;
+                                gfs2_log_unlock(sdp);
+                                tr->tr_num_buf_rm++;
+                                /* Drops the reference taken by gfs2_pin() */
+                                brelse(bh);
+                        }
+                        if (bd) {
+                                gfs2_log_lock(sdp);
+                                if (bd->bd_ail) {
+                                        /* Saved before the lock is dropped */
+                                        u64 blkno = bh->b_blocknr;
+                                        bd->bd_ail = NULL;
+                                        list_del(&bd->bd_ail_st_list);
+                                        list_del(&bd->bd_ail_gl_list);
+                                        atomic_dec(&bd->bd_gl->gl_ail_count);
+                                        brelse(bh);
+                                        gfs2_log_unlock(sdp);
+                                        gfs2_trans_add_revoke(sdp, blkno);
+                                } else
+                                        gfs2_log_unlock(sdp);
+                        }
+                        lock_buffer(bh);
+                        clear_buffer_dirty(bh);
+                        clear_buffer_uptodate(bh);
+                        unlock_buffer(bh);
+                        /* Drops the reference taken by getbuf() above */
+                        brelse(bh);
+                }
+                bstart++;
+                blen--;
+        }
+}
+/**
+ * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
+ * @ip: The GFS2 inode
+ *
+ * Releases the buffers held in ip->i_cache[], the most-recently-used array
+ * of blocks used for indirect block addressing for this inode.  The walk
+ * runs under ip->i_spin and stops at the first empty slot.
+ */
+void gfs2_meta_cache_flush(struct gfs2_inode *ip)
+{
+        unsigned int height;
+        spin_lock(&ip->i_spin);
+        for (height = 0; height < GFS2_MAX_META_HEIGHT; height++) {
+                if (!ip->i_cache[height])
+                        break;
+                brelse(ip->i_cache[height]);
+                ip->i_cache[height] = NULL;
+        }
+        spin_unlock(&ip->i_spin);
+}
+/**
+ * gfs2_meta_indirect_buffer - Get a metadata buffer
+ * @ip: The GFS2 inode
+ * @height: The level of this buf in the metadata (indir addr) tree (if any)
+ * @num: The block number (device relative) of the buffer
+ * @new: Non-zero if we may create a new buffer
+ * @bhp: the buffer is returned here
+ *
+ * Try to use the gfs2_inode's MRU metadata tree cache.
+ *
+ * With @new set the block is initialised as an indirect block and added to
+ * the current transaction.  Otherwise it is read if necessary and its
+ * metadata type is checked: GFS2_METATYPE_DI at height 0,
+ * GFS2_METATYPE_IN above that.  @bhp is written only on success.
+ *
+ * Returns: errno
+ */
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+                              int new, struct buffer_head **bhp)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_glock *gl = ip->i_gl;
+        struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
+        int in_cache = 0;
+        /* Cache hit only if the slot for this height holds block @num */
+        spin_lock(&ip->i_spin);
+        if (*bh_slot && (*bh_slot)->b_blocknr == num) {
+                bh = *bh_slot;
+                get_bh(bh);
+                in_cache = 1;
+        }
+        spin_unlock(&ip->i_spin);
+        if (!bh)
+                bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
+        if (!bh)
+                return -ENOBUFS;
+        if (new) {
+                /* A new block must be above height 0 */
+                if (gfs2_assert_warn(sdp, height))
+                        goto err;
+                meta_prep_new(bh);
+                gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+                gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+        } else {
+                u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+                if (!buffer_uptodate(bh)) {
+                        ll_rw_block(READ_META, 1, &bh);
+                        if (gfs2_meta_wait(sdp, bh))
+                                goto err;
+                }
+                if (gfs2_metatype_check(sdp, bh, mtype))
+                        goto err;
+        }
+        if (!in_cache) {
+                /* Replace whatever the slot held; the cache keeps its own
+                   reference in addition to the one returned in @bhp */
+                spin_lock(&ip->i_spin);
+                if (*bh_slot)
+                        brelse(*bh_slot);
+                *bh_slot = bh;
+                get_bh(bh);
+                spin_unlock(&ip->i_spin);
+        }
+        *bhp = bh;
+        return 0;
+err:
+        brelse(bh);
+        return -EIO;
+}
+/**
+ * gfs2_meta_ra - start readahead on an extent of a file
+ * @gl: the glock the blocks belong to
+ * @dblock: the starting disk block
+ * @extlen: the number of blocks in the extent
+ *
+ * @extlen must be non-zero.  It is capped at the gt_max_readahead tunable
+ * converted to blocks, with a minimum of one block.
+ *
+ * returns: the first buffer in the extent
+ */
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct inode *aspace = gl->gl_aspace;
+        struct buffer_head *first_bh, *bh;
+        u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
+                          sdp->sd_sb.sb_bsize_shift;
+        BUG_ON(!extlen);
+        if (max_ra < 1)
+                max_ra = 1;
+        if (extlen > max_ra)
+                extlen = max_ra;
+        first_bh = getbuf(sdp, aspace, dblock, CREATE);
+        /* Nothing to do if the first block is already in memory */
+        if (buffer_uptodate(first_bh))
+                goto out;
+        if (!buffer_locked(first_bh))
+                ll_rw_block(READ_META, 1, &first_bh);
+        dblock++;
+        extlen--;
+        /* Issue READA for the rest of the extent */
+        while (extlen) {
+                bh = getbuf(sdp, aspace, dblock, CREATE);
+                if (!buffer_uptodate(bh) && !buffer_locked(bh))
+                        ll_rw_block(READA, 1, &bh);
+                brelse(bh);
+                dblock++;
+                extlen--;
+                /* Stop early once the first block has arrived */
+                if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
+                        goto out;
+        }
+        wait_on_buffer(first_bh);
+out:
+        return first_bh;
+}
+/**
+ * gfs2_meta_syncfs - sync all the buffers in a filesystem
+ * @sdp: the filesystem
+ *
+ * Flushes the log, then keeps starting AIL1 writeback, sleeping 10ms
+ * between rounds, until gfs2_ail1_empty() reports success.
+ */
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
+{
+        gfs2_log_flush(sdp, NULL);
+        gfs2_ail1_start(sdp, DIO_ALL);
+        while (!gfs2_ail1_empty(sdp, DIO_ALL)) {
+                msleep(10);
+                gfs2_ail1_start(sdp, DIO_ALL);
+        }
+}
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __DIO_DOT_H__
+#define __DIO_DOT_H__
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include "incore.h"
+/* Zero the whole data area (b_size bytes) of @bh */
+static inline void gfs2_buffer_clear(struct buffer_head *bh)
+{
+        memset(bh->b_data, 0, bh->b_size);
+}
+/*
+ * Zero the data area of @bh from byte offset @head to the end, leaving the
+ * first @head bytes untouched.  BUGs if @head lies beyond the buffer.
+ */
+static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
+{
+        BUG_ON(head > bh->b_size);
+        memset(bh->b_data + head, 0, bh->b_size - head);
+}
+/*
+ * Copy everything after the first @from_head bytes of @from_bh to @to_bh,
+ * starting at offset @to_head, then zero the last (@from_head - @to_head)
+ * bytes of @to_bh.  BUGs if @from_head is smaller than @to_head.
+ * NOTE(review): the zeroed region starts where the copy ended only if both
+ * buffers have the same b_size - presumably always true; confirm.
+ */
+static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
+                                         int to_head,
+                                         struct buffer_head *from_bh,
+                                         int from_head)
+{
+        BUG_ON(from_head < to_head);
+        memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
+               from_bh->b_size - from_head);
+        memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
+               0, from_head - to_head);
+}
+/* Functions implemented in meta_io.c */
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
+void gfs2_aspace_put(struct inode *aspace);
+void gfs2_meta_inval(struct gfs2_glock *gl);
+void gfs2_meta_sync(struct gfs2_glock *gl);
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+                   int flags, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+                         int meta);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                struct gfs2_ail *ai);
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_meta_cache_flush(struct gfs2_inode *ip);
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+                              int new, struct buffer_head **bhp);
+/*
+ * Get the buffer holding the inode's own block (ip->i_num.no_addr): a
+ * height 0, non-"new" call to gfs2_meta_indirect_buffer().
+ * Returns: errno, with the buffer stored in @bhp on success
+ */
+static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
+                                         struct buffer_head **bhp)
+{
+        return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
+}
+/* Also implemented in meta_io.c */
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+/* Non-zero if any of the Dirty, Lock or Pinned state bits is set on bh */
+#define buffer_busy(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
+/* Non-zero if the Dirty or Lock state bit is set on bh */
+#define buffer_in_io(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
+#endif /* __DIO_DOT_H__ */
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "mount.h"
+#include "sys.h"
+#include "util.h"
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp: the filesystem; parsed settings are stored in sdp->sd_args
+ * @data_arg: comma separated option string (modified in place by parsing)
+ * @remount: non-zero for a remount; options that cannot be changed by a
+ *           remount are then rejected
+ *
+ * On a first mount, options preloaded in gfs2_sys_margs are parsed instead
+ * of @data_arg; that preloaded string is freed before returning, on every
+ * exit path.
+ *
+ * Return: errno
+ */
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
+{
+        struct gfs2_args *args = &sdp->sd_args;
+        char *data = data_arg;
+        char *options, *o, *v;
+        int error = 0;
+        if (!remount) {
+                /*  If someone preloaded options, use those instead  */
+                spin_lock(&gfs2_sys_margs_lock);
+                if (gfs2_sys_margs) {
+                        data = gfs2_sys_margs;
+                        gfs2_sys_margs = NULL;
+                }
+                spin_unlock(&gfs2_sys_margs_lock);
+                /*  Set some defaults  */
+                args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
+                args->ar_quota = GFS2_QUOTA_DEFAULT;
+                args->ar_data = GFS2_DATA_DEFAULT;
+        }
+        /* Split the options into tokens with the "," character and
+           process them */
+        for (options = data; (o = strsep(&options, ",")); ) {
+                if (!*o)
+                        continue;
+                /* Split "name=value" in place; v stays NULL without a '=' */
+                v = strchr(o, '=');
+                if (v)
+                        *v++ = 0;
+                if (!strcmp(o, "lockproto")) {
+                        if (!v)
+                                goto need_value;
+                        if (remount && strcmp(v, args->ar_lockproto))
+                                goto cant_remount;
+                        strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
+                        args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
+                }
+                else if (!strcmp(o, "locktable")) {
+                        if (!v)
+                                goto need_value;
+                        if (remount && strcmp(v, args->ar_locktable))
+                                goto cant_remount;
+                        strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
+                        args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
+                }
+                else if (!strcmp(o, "hostdata")) {
+                        if (!v)
+                                goto need_value;
+                        if (remount && strcmp(v, args->ar_hostdata))
+                                goto cant_remount;
+                        strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
+                        args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
+                }
+                else if (!strcmp(o, "spectator")) {
+                        if (remount && !args->ar_spectator)
+                                goto cant_remount;
+                        args->ar_spectator = 1;
+                        sdp->sd_vfs->s_flags |= MS_RDONLY;
+                }
+                else if (!strcmp(o, "ignore_local_fs")) {
+                        if (remount && !args->ar_ignore_local_fs)
+                                goto cant_remount;
+                        args->ar_ignore_local_fs = 1;
+                }
+                else if (!strcmp(o, "localflocks")) {
+                        if (remount && !args->ar_localflocks)
+                                goto cant_remount;
+                        args->ar_localflocks = 1;
+                }
+                else if (!strcmp(o, "localcaching")) {
+                        if (remount && !args->ar_localcaching)
+                                goto cant_remount;
+                        args->ar_localcaching = 1;
+                }
+                else if (!strcmp(o, "debug"))
+                        args->ar_debug = 1;
+                else if (!strcmp(o, "nodebug"))
+                        args->ar_debug = 0;
+                else if (!strcmp(o, "upgrade")) {
+                        if (remount && !args->ar_upgrade)
+                                goto cant_remount;
+                        args->ar_upgrade = 1;
+                }
+                else if (!strcmp(o, "num_glockd")) {
+                        unsigned int x;
+                        if (!v)
+                                goto need_value;
+                        /* x is only defined when the conversion succeeded */
+                        if (sscanf(v, "%u", &x) != 1) {
+                                fs_info(sdp, "invalid value for num_glockd\n");
+                                error = -EINVAL;
+                                break;
+                        }
+                        if (remount && x != args->ar_num_glockd)
+                                goto cant_remount;
+                        if (!x || x > GFS2_GLOCKD_MAX) {
+                                fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
+                                        GFS2_GLOCKD_MAX, x);
+                                error = -EINVAL;
+                                break;
+                        }
+                        args->ar_num_glockd = x;
+                }
+                else if (!strcmp(o, "acl")) {
+                        args->ar_posix_acl = 1;
+                        sdp->sd_vfs->s_flags |= MS_POSIXACL;
+                }
+                else if (!strcmp(o, "noacl")) {
+                        args->ar_posix_acl = 0;
+                        sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
+                }
+                else if (!strcmp(o, "quota")) {
+                        if (!v)
+                                goto need_value;
+                        if (!strcmp(v, "off"))
+                                args->ar_quota = GFS2_QUOTA_OFF;
+                        else if (!strcmp(v, "account"))
+                                args->ar_quota = GFS2_QUOTA_ACCOUNT;
+                        else if (!strcmp(v, "on"))
+                                args->ar_quota = GFS2_QUOTA_ON;
+                        else {
+                                fs_info(sdp, "invalid value for quota\n");
+                                error = -EINVAL;
+                                break;
+                        }
+                }
+                else if (!strcmp(o, "suiddir"))
+                        args->ar_suiddir = 1;
+                else if (!strcmp(o, "nosuiddir"))
+                        args->ar_suiddir = 0;
+                else if (!strcmp(o, "data")) {
+                        if (!v)
+                                goto need_value;
+                        if (!strcmp(v, "writeback"))
+                                args->ar_data = GFS2_DATA_WRITEBACK;
+                        else if (!strcmp(v, "ordered"))
+                                args->ar_data = GFS2_DATA_ORDERED;
+                        else {
+                                fs_info(sdp, "invalid value for data\n");
+                                error = -EINVAL;
+                                break;
+                        }
+                }
+                else {
+                        fs_info(sdp, "unknown option: %s\n", o);
+                        error = -EINVAL;
+                        break;
+                }
+        }
+        if (error)
+                fs_info(sdp, "invalid mount option(s)\n");
+out:
+        /* A preloaded option string is ours to free, whatever the outcome */
+        if (data != data_arg)
+                kfree(data);
+        return error;
+need_value:
+        fs_info(sdp, "need value for option %s\n", o);
+        error = -EINVAL;
+        goto out;
+cant_remount:
+        fs_info(sdp, "can't remount with option %s\n", o);
+        error = -EINVAL;
+        goto out;
+}
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __MOUNT_DOT_H__
+#define __MOUNT_DOT_H__
+struct gfs2_sbd;
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
+#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include "gfs2.h"
+#include <linux/gfs2_ondisk.h>
+/*
+ * pv - print one member of a structure with printk(KERN_INFO ...)
+ * ptr: pointer to the structure
+ * member: name of the member; also used as the printed label
+ * fmt: printk conversion for the member, e.g. "%u"
+ *
+ * Expands to a single expression with no trailing semicolon, so the
+ * caller's own ';' ends the statement and the macro stays safe inside
+ * unbraced if/else bodies.
+ */
+#define pv(ptr, member, fmt) \
+        printk(KERN_INFO "  "#member" = "fmt"\n", (ptr)->member)
+/*
+ * gfs2_xxx_in - read in an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_out - write out an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_print - print out an xxx struct
+ * first arg: the cpu-order structure
+ */
+/* Read a big-endian inode number from @buf into the CPU-order @no. */
+void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
+{
+        const struct gfs2_inum *disk = buf;
+        u64 formal_ino = be64_to_cpu(disk->no_formal_ino);
+        u64 addr = be64_to_cpu(disk->no_addr);
+        no->no_formal_ino = formal_ino;
+        no->no_addr = addr;
+}
+/* Write the CPU-order inode number @no to @buf in big-endian form. */
+void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
+{
+        struct gfs2_inum *disk = buf;
+        u64 formal_ino = no->no_formal_ino;
+        u64 addr = no->no_addr;
+        disk->no_formal_ino = cpu_to_be64(formal_ino);
+        disk->no_addr = cpu_to_be64(addr);
+}
+/* Log both fields of the CPU-order inode number @no at KERN_INFO. */
+static void gfs2_inum_print(const struct gfs2_inum *no)
+{
+        unsigned long long formal_ino = no->no_formal_ino;
+        unsigned long long addr = no->no_addr;
+        printk(KERN_INFO "  no_formal_ino = %llu\n", formal_ino);
+        printk(KERN_INFO "  no_addr = %llu\n", addr);
+}
+/* Read a big-endian metadata header from @buf into the CPU-order @mh. */
+static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
+{
+        const struct gfs2_meta_header *disk = buf;
+        u32 magic = be32_to_cpu(disk->mh_magic);
+        u32 type = be32_to_cpu(disk->mh_type);
+        u32 format = be32_to_cpu(disk->mh_format);
+        mh->mh_magic = magic;
+        mh->mh_type = type;
+        mh->mh_format = format;
+}
+/* Write the CPU-order metadata header @mh to @buf in big-endian form. */
+static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
+{
+        struct gfs2_meta_header *disk = buf;
+        u32 magic = mh->mh_magic;
+        u32 type = mh->mh_type;
+        u32 format = mh->mh_format;
+        disk->mh_magic = cpu_to_be32(magic);
+        disk->mh_type = cpu_to_be32(type);
+        disk->mh_format = cpu_to_be32(format);
+}
+/* Log the three fields of the CPU-order metadata header @mh at KERN_INFO. */
+static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
+{
+        printk(KERN_INFO "  mh_magic = 0x%.8X\n", mh->mh_magic);
+        printk(KERN_INFO "  mh_type = %u\n", mh->mh_type);
+        printk(KERN_INFO "  mh_format = %u\n", mh->mh_format);
+}
+/*
+ * Read a big-endian superblock from @buf into the CPU-order @sb.
+ * The lock protocol and lock table names are copied as raw bytes
+ * (GFS2_LOCKNAME_LEN each); only the fields assigned here are filled in.
+ */
+void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
+{
+        const struct gfs2_sb *disk = buf;
+        gfs2_meta_header_in(&sb->sb_header, buf);
+        sb->sb_fs_format = be32_to_cpu(disk->sb_fs_format);
+        sb->sb_multihost_format = be32_to_cpu(disk->sb_multihost_format);
+        sb->sb_bsize = be32_to_cpu(disk->sb_bsize);
+        sb->sb_bsize_shift = be32_to_cpu(disk->sb_bsize_shift);
+        gfs2_inum_in(&sb->sb_master_dir, &disk->sb_master_dir);
+        gfs2_inum_in(&sb->sb_root_dir, &disk->sb_root_dir);
+        memcpy(sb->sb_lockproto, disk->sb_lockproto, GFS2_LOCKNAME_LEN);
+        memcpy(sb->sb_locktable, disk->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+/* Read a big-endian resource index entry from @buf into the CPU-order @ri. */
+void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
+{
+        const struct gfs2_rindex *disk = buf;
+        /* 64-bit fields */
+        ri->ri_addr = be64_to_cpu(disk->ri_addr);
+        ri->ri_data0 = be64_to_cpu(disk->ri_data0);
+        /* 32-bit fields */
+        ri->ri_length = be32_to_cpu(disk->ri_length);
+        ri->ri_data = be32_to_cpu(disk->ri_data);
+        ri->ri_bitbytes = be32_to_cpu(disk->ri_bitbytes);
+}
+/* Log every field of the CPU-order resource index entry @ri at KERN_INFO. */
+void gfs2_rindex_print(const struct gfs2_rindex *ri)
+{
+        unsigned long long addr = ri->ri_addr;
+        unsigned long long data0 = ri->ri_data0;
+        printk(KERN_INFO "  ri_addr = %llu\n", addr);
+        printk(KERN_INFO "  ri_length = %u\n", ri->ri_length);
+        printk(KERN_INFO "  ri_data0 = %llu\n", data0);
+        printk(KERN_INFO "  ri_data = %u\n", ri->ri_data);
+        printk(KERN_INFO "  ri_bitbytes = %u\n", ri->ri_bitbytes);
+}
+/* Read a big-endian resource group header from @buf into the CPU-order @rg. */
+void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
+{
+        const struct gfs2_rgrp *disk = buf;
+        /* The metadata header sits at the start of the buffer */
+        gfs2_meta_header_in(&rg->rg_header, buf);
+        rg->rg_flags = be32_to_cpu(disk->rg_flags);
+        rg->rg_free = be32_to_cpu(disk->rg_free);
+        rg->rg_dinodes = be32_to_cpu(disk->rg_dinodes);
+        rg->rg_igeneration = be64_to_cpu(disk->rg_igeneration);
+}
+/*
+ * Write the CPU-order resource group header @rg to @buf in big-endian
+ * form.  The padding word and the reserved area are written as zeroes.
+ */
+void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
+{
+        struct gfs2_rgrp *disk = buf;
+        gfs2_meta_header_out(&rg->rg_header, buf);
+        disk->rg_flags = cpu_to_be32(rg->rg_flags);
+        disk->rg_free = cpu_to_be32(rg->rg_free);
+        disk->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+        disk->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+        /* Nothing meaningful lives in these; keep them zero on disk */
+        disk->__pad = cpu_to_be32(0);
+        memset(&disk->rg_reserved, 0, sizeof(disk->rg_reserved));
+}
+/* Read a big-endian quota record from @buf into the CPU-order @qu. */
+void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
+{
+        const struct gfs2_quota *disk = buf;
+        u64 limit = be64_to_cpu(disk->qu_limit);
+        u64 warn = be64_to_cpu(disk->qu_warn);
+        u64 value = be64_to_cpu(disk->qu_value);
+        qu->qu_limit = limit;
+        qu->qu_warn = warn;
+        qu->qu_value = value;
+}
+/*
+ * Read a big-endian on-disk inode from @buf into the CPU-order @di.
+ * Only the fields assigned below are filled in.
+ */
+void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
+{
+        const struct gfs2_dinode *disk = buf;
+        /* Header and inode number */
+        gfs2_meta_header_in(&di->di_header, buf);
+        gfs2_inum_in(&di->di_num, &disk->di_num);
+        /* Ownership, mode and link count */
+        di->di_mode = be32_to_cpu(disk->di_mode);
+        di->di_uid = be32_to_cpu(disk->di_uid);
+        di->di_gid = be32_to_cpu(disk->di_gid);
+        di->di_nlink = be32_to_cpu(disk->di_nlink);
+        /* Size and timestamps */
+        di->di_size = be64_to_cpu(disk->di_size);
+        di->di_blocks = be64_to_cpu(disk->di_blocks);
+        di->di_atime = be64_to_cpu(disk->di_atime);
+        di->di_mtime = be64_to_cpu(disk->di_mtime);
+        di->di_ctime = be64_to_cpu(disk->di_ctime);
+        /* Device numbers */
+        di->di_major = be32_to_cpu(disk->di_major);
+        di->di_minor = be32_to_cpu(disk->di_minor);
+        /* Allocation goals and generation */
+        di->di_goal_meta = be64_to_cpu(disk->di_goal_meta);
+        di->di_goal_data = be64_to_cpu(disk->di_goal_data);
+        di->di_generation = be64_to_cpu(disk->di_generation);
+        /* Flags and layout */
+        di->di_flags = be32_to_cpu(disk->di_flags);
+        di->di_payload_format = be32_to_cpu(disk->di_payload_format);
+        di->di_height = be16_to_cpu(disk->di_height);
+        di->di_depth = be16_to_cpu(disk->di_depth);
+        di->di_entries = be32_to_cpu(disk->di_entries);
+        di->di_eattr = be64_to_cpu(disk->di_eattr);
+}
+/*
+ * Write the CPU-order inode @di to @buf in big-endian form.
+ * Only the fields assigned below are written; anything else in the
+ * destination buffer is left as it was.
+ */
+void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
+{
+        struct gfs2_dinode *disk = buf;
+        /* Header and inode number */
+        gfs2_meta_header_out(&di->di_header, buf);
+        gfs2_inum_out(&di->di_num, &disk->di_num);
+        /* Ownership, mode and link count */
+        disk->di_mode = cpu_to_be32(di->di_mode);
+        disk->di_uid = cpu_to_be32(di->di_uid);
+        disk->di_gid = cpu_to_be32(di->di_gid);
+        disk->di_nlink = cpu_to_be32(di->di_nlink);
+        /* Size and timestamps */
+        disk->di_size = cpu_to_be64(di->di_size);
+        disk->di_blocks = cpu_to_be64(di->di_blocks);
+        disk->di_atime = cpu_to_be64(di->di_atime);
+        disk->di_mtime = cpu_to_be64(di->di_mtime);
+        disk->di_ctime = cpu_to_be64(di->di_ctime);
+        /* Device numbers */
+        disk->di_major = cpu_to_be32(di->di_major);
+        disk->di_minor = cpu_to_be32(di->di_minor);
+        /* Allocation goals and generation */
+        disk->di_goal_meta = cpu_to_be64(di->di_goal_meta);
+        disk->di_goal_data = cpu_to_be64(di->di_goal_data);
+        disk->di_generation = cpu_to_be64(di->di_generation);
+        /* Flags and layout */
+        disk->di_flags = cpu_to_be32(di->di_flags);
+        disk->di_payload_format = cpu_to_be32(di->di_payload_format);
+        disk->di_height = cpu_to_be16(di->di_height);
+        disk->di_depth = cpu_to_be16(di->di_depth);
+        disk->di_entries = cpu_to_be32(di->di_entries);
+        disk->di_eattr = cpu_to_be64(di->di_eattr);
+}
+/*
+ * Log the CPU-order inode @di at KERN_INFO, one field per line, after
+ * its metadata header and inode number.  di_generation is not printed.
+ */
+void gfs2_dinode_print(const struct gfs2_dinode *di)
+{
+        gfs2_meta_header_print(&di->di_header);
+        gfs2_inum_print(&di->di_num);
+        printk(KERN_INFO "  di_mode = 0%o\n", di->di_mode);
+        printk(KERN_INFO "  di_uid = %u\n", di->di_uid);
+        printk(KERN_INFO "  di_gid = %u\n", di->di_gid);
+        printk(KERN_INFO "  di_nlink = %u\n", di->di_nlink);
+        printk(KERN_INFO "  di_size = %llu\n",
+               (unsigned long long)di->di_size);
+        printk(KERN_INFO "  di_blocks = %llu\n",
+               (unsigned long long)di->di_blocks);
+        printk(KERN_INFO "  di_atime = %lld\n",
+               (long long)di->di_atime);
+        printk(KERN_INFO "  di_mtime = %lld\n",
+               (long long)di->di_mtime);
+        printk(KERN_INFO "  di_ctime = %lld\n",
+               (long long)di->di_ctime);
+        printk(KERN_INFO "  di_major = %u\n", di->di_major);
+        printk(KERN_INFO "  di_minor = %u\n", di->di_minor);
+        printk(KERN_INFO "  di_goal_meta = %llu\n",
+               (unsigned long long)di->di_goal_meta);
+        printk(KERN_INFO "  di_goal_data = %llu\n",
+               (unsigned long long)di->di_goal_data);
+        printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
+        printk(KERN_INFO "  di_payload_format = %u\n", di->di_payload_format);
+        printk(KERN_INFO "  di_height = %u\n", di->di_height);
+        printk(KERN_INFO "  di_depth = %u\n", di->di_depth);
+        printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
+        printk(KERN_INFO "  di_eattr = %llu\n",
+               (unsigned long long)di->di_eattr);
+}
+/* Read a big-endian log header from @buf into the CPU-order @lh. */
+void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
+{
+        const struct gfs2_log_header *disk = buf;
+        /* The metadata header sits at the start of the buffer */
+        gfs2_meta_header_in(&lh->lh_header, buf);
+        lh->lh_sequence = be64_to_cpu(disk->lh_sequence);
+        lh->lh_flags = be32_to_cpu(disk->lh_flags);
+        lh->lh_tail = be32_to_cpu(disk->lh_tail);
+        lh->lh_blkno = be32_to_cpu(disk->lh_blkno);
+        lh->lh_hash = be32_to_cpu(disk->lh_hash);
+}
+/* Read a big-endian inode number range from @buf into the CPU-order @ir. */
+void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
+{
+        const struct gfs2_inum_range *disk = buf;
+        u64 start = be64_to_cpu(disk->ir_start);
+        u64 length = be64_to_cpu(disk->ir_length);
+        ir->ir_start = start;
+        ir->ir_length = length;
+}
+/* Write the CPU-order inode number range @ir to @buf in big-endian form. */
+void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
+{
+        struct gfs2_inum_range *disk = buf;
+        u64 start = ir->ir_start;
+        u64 length = ir->ir_length;
+        disk->ir_start = cpu_to_be64(start);
+        disk->ir_length = cpu_to_be64(length);
+}
+/* Read a big-endian statfs change record from @buf into the CPU-order @sc. */
+void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
+{
+        const struct gfs2_statfs_change *disk = buf;
+        u64 total = be64_to_cpu(disk->sc_total);
+        u64 free_blocks = be64_to_cpu(disk->sc_free);
+        u64 dinodes = be64_to_cpu(disk->sc_dinodes);
+        sc->sc_total = total;
+        sc->sc_free = free_blocks;
+        sc->sc_dinodes = dinodes;
+}
+/* Write the CPU-order statfs change record @sc to @buf in big-endian form. */
+void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
+{
+        struct gfs2_statfs_change *disk = buf;
+        u64 total = sc->sc_total;
+        u64 free_blocks = sc->sc_free;
+        u64 dinodes = sc->sc_dinodes;
+        disk->sc_total = cpu_to_be64(total);
+        disk->sc_free = cpu_to_be64(free_blocks);
+        disk->sc_dinodes = cpu_to_be64(dinodes);
+}
+/* Read a big-endian quota change record from @buf into the CPU-order @qc. */
+void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
+{
+        const struct gfs2_quota_change *disk = buf;
+        /* One 64-bit delta followed by two 32-bit words */
+        qc->qc_change = be64_to_cpu(disk->qc_change);
+        qc->qc_flags = be32_to_cpu(disk->qc_flags);
+        qc->qc_id = be32_to_cpu(disk->qc_id);
+}
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..4fb743f4e4a4
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "quota.h"
+#include "trans.h"
+#include "rgrp.h"
+#include "ops_file.h"
+#include "util.h"
+#include "glops.h"
+/**
+ * gfs2_page_add_databufs - Add a page's buffers to the current transaction
+ * @ip: The inode; buffers are added under its glock
+ * @page: The page whose buffer list is walked
+ * @from: Start of the byte range within the page
+ * @to: End of the byte range within the page
+ *
+ * Walks the page's circular buffer list and hands every buffer that
+ * overlaps [@from, @to) to gfs2_trans_add_bh().
+ */
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+                                   unsigned int from, unsigned int to)
+{
+        struct buffer_head *first = page_buffers(page);
+        struct buffer_head *cur;
+        unsigned int blksz = first->b_size;
+        unsigned int lo, hi;
+        /* "!lo" lets the first pass run while cur still equals first */
+        for (cur = first, lo = 0; cur != first || !lo;
+             cur = cur->b_this_page, lo = hi) {
+                hi = lo + blksz;
+                if (hi > from && lo < to)
+                        gfs2_trans_add_bh(ip->i_gl, cur, 0);
+        }
+}
+/**
+ * gfs2_get_block - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Thin wrapper that forwards to gfs2_block_map(), passing @create through
+ * unchanged and the constant 32 as the final argument.
+ * NOTE(review): 32 is presumably the maximum extent length in blocks;
+ * confirm against gfs2_block_map().
+ *
+ * Returns: errno
+ */
+int gfs2_get_block(struct inode *inode, sector_t lblock,
+                   struct buffer_head *bh_result, int create)
+{
+        return gfs2_block_map(inode, lblock, create, bh_result, 32);
+}
+/**
+ * gfs2_get_block_noalloc - Look up an existing block without allocating
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Ignored; the lookup always passes 0 to gfs2_block_map()
+ *
+ * Returns: errno from gfs2_block_map(), or -EIO if the lookup succeeded
+ *          but left @bh_result->b_blocknr at 0
+ */
+static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
+                                  struct buffer_head *bh_result, int create)
+{
+        int ret = gfs2_block_map(inode, lblock, 0, bh_result, 1);
+        /* A block number of zero after a successful lookup is an error */
+        if (!ret && bh_result->b_blocknr == 0)
+                ret = -EIO;
+        return ret;
+}
+/*
+ * get_block callback handed to the direct I/O code by gfs2_direct_IO().
+ * @create is ignored: gfs2_block_map() is always called with 0 in its
+ * create position, and 32 as the final argument.
+ */
+static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
+                                 struct buffer_head *bh_result, int create)
+{
+        return gfs2_block_map(inode, lblock, 0, bh_result, 32);
+}
+/**
+ * gfs2_writepage - Write complete page
+ * @page: Page to write
+ * @wbc: Writeback control, passed on to block_write_full_page() and
+ *       redirty_page_for_writepage()
+ *
+ * Returns: errno
+ *
+ * Some of this is copied from block_write_full_page() although we still
+ * call it to do most of the work.
+ *
+ * The page is redirtied, unlocked and 0 is returned ("ignored") when the
+ * caller is already inside a transaction or when a new transaction
+ * cannot be started.
+ */
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        loff_t i_size = i_size_read(inode);
+        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+        unsigned offset;
+        int error;
+        int done_trans = 0;
+        /* The inode glock must be held exclusively on entry */
+        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
+                unlock_page(page);
+                return -EIO;
+        }
+        /* Already inside a transaction: do not nest, retry the page later */
+        if (current->journal_info)
+                goto out_ignore;
+        /* Is the page fully outside i_size? (truncate in progress) */
+        offset = i_size & (PAGE_CACHE_SIZE-1);
+        if (page->index > end_index || (page->index == end_index && !offset)) {
+                page->mapping->a_ops->invalidatepage(page, 0);
+                unlock_page(page);
+                return 0; /* don't care */
+        }
+        /* Ordered-data mounts and jdata inodes add the buffers to a transaction */
+        if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
+                error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+                if (error)
+                        goto out_ignore;
+                if (!page_has_buffers(page)) {
+                        create_empty_buffers(page, inode->i_sb->s_blocksize,
+                                             (1 << BH_Dirty)|(1 << BH_Uptodate));
+                }
+                /*
+                 * NOTE(review): with to == s_blocksize-1 only the first
+                 * buffer of the page overlaps the range, so on a block size
+                 * smaller than the page size the remaining buffers are not
+                 * added.  Widening the range would also need a larger
+                 * reservation than RES_DINODE + 1 above - confirm intent.
+                 */
+                gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+                done_trans = 1;
+        }
+        error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+        if (done_trans)
+                gfs2_trans_end(sdp);
+        gfs2_meta_cache_flush(ip);
+        return error;
+out_ignore:
+        redirty_page_for_writepage(wbc, page);
+        unlock_page(page);
+        return 0;
+}
+/**
+ * zero_readpage - Fill a page with zeroes and mark it up to date
+ * @page: the page
+ *
+ * Returns: 0
+ */
+static int zero_readpage(struct page *page)
+{
+        void *kaddr;
+        kaddr = kmap_atomic(page, KM_USER0);
+        memset(kaddr, 0, PAGE_CACHE_SIZE);
+        /* kunmap_atomic() takes the mapped address, not the struct page */
+        kunmap_atomic(kaddr, KM_USER0);
+        SetPageUptodate(page);
+        return 0;
+}
+/**
+ * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * @ip: the inode
+ * @page: the page
+ *
+ * Copies di_size bytes that follow the on-disk dinode header in the inode
+ * buffer into the page and zeroes the rest of the page.  Any page other
+ * than the first is simply zero-filled.
+ * NOTE(review): assumes di_size never exceeds PAGE_CACHE_SIZE for a
+ * stuffed inode - it is not checked here.
+ *
+ * Returns: errno
+ */
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+{
+        struct buffer_head *dibh;
+        void *kaddr;
+        int error;
+        /* Only the first page of a stuffed file might contain data */
+        if (unlikely(page->index))
+                return zero_readpage(page);
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                return error;
+        kaddr = kmap_atomic(page, KM_USER0);
+        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+               ip->i_di.di_size);
+        memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+        /* kunmap_atomic() takes the mapped address, not the struct page */
+        kunmap_atomic(kaddr, KM_USER0);
+        brelse(dibh);
+        SetPageUptodate(page);
+        return 0;
+}
+/**
+ * gfs2_readpage - readpage with locking
+ * @file: The file to read a page for. N.B. This may be NULL if we are
+ * reading an internal file.
+ * @page: The page to read
+ *
+ * Takes a shared glock on the inode around the read unless @file is the
+ * internal-file sentinel or the file has GFF_EXLOCK set.
+ *
+ * Returns: errno
+ */
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+        struct gfs2_file *gf = NULL;
+        struct gfs2_holder gh;
+        int error;
+        /* Set once gh has been initialised, so the exit paths know to undo it */
+        int do_unlock = 0;
+        if (likely(file != &gfs2_internal_file_sentinel)) {
+                if (file) {
+                        gf = file->private_data;
+                        if (test_bit(GFF_EXLOCK, &gf->f_flags))
+                                /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+                                goto skip_lock;
+                }
+                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
+                do_unlock = 1;
+                error = gfs2_glock_nq_m_atime(1, &gh);
+                if (unlikely(error))
+                        goto out_unlock;
+        }
+skip_lock:
+        if (gfs2_is_stuffed(ip)) {
+                /* The stuffed path unlocks the page here itself */
+                error = stuffed_readpage(ip, page);
+                unlock_page(page);
+        } else
+                error = mpage_readpage(page, gfs2_get_block);
+        /* A shut-down filesystem overrides whatever the read returned */
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = -EIO;
+        if (do_unlock) {
+                gfs2_glock_dq_m(1, &gh);
+                gfs2_holder_uninit(&gh);
+        }
+out:
+        return error;
+out_unlock:
+        /* The glock was never acquired: unlock the page, drop only the holder */
+        unlock_page(page);
+        if (do_unlock)
+                gfs2_holder_uninit(&gh);
+        goto out;
+}
+/**
+ * gfs2_readpages - Read a bunch of pages at once
+ * @file: The file being read (may be NULL or the internal-file sentinel)
+ * @mapping: The address space the pages belong to
+ * @pages: List of pages to read, linked through page->lru
+ * @nr_pages: Number of pages on @pages
+ *
+ * Some notes:
+ * 1. This is only for readahead, so we can simply ignore any things
+ *    which are slightly inconvenient (such as locking conflicts between
+ *    the page lock and the glock) and return having done no I/O. It's
+ *    obviously not something we'd want to do on too regular a basis.
+ *    Any I/O we ignore at this time will be done via readpage later.
+ * 2. We have to handle stuffed files here too.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
+ * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
+ *    well as read-ahead.
+ *
+ * Returns: errno; a failed "try" lock is reported as 0
+ */
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+                          struct list_head *pages, unsigned nr_pages)
+{
+        struct inode *inode = mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_holder gh;
+        unsigned page_idx;
+        int ret;
+        /* Set once gh has been initialised, so the exit paths know to undo it */
+        int do_unlock = 0;
+        if (likely(file != &gfs2_internal_file_sentinel)) {
+                if (file) {
+                        struct gfs2_file *gf = file->private_data;
+                        if (test_bit(GFF_EXLOCK, &gf->f_flags))
+                                goto skip_lock;
+                }
+                gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
+                                 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
+                do_unlock = 1;
+                ret = gfs2_glock_nq_m_atime(1, &gh);
+                if (ret == GLR_TRYFAILED)
+                        goto out_noerror;
+                if (unlikely(ret))
+                        goto out_unlock;
+        }
+skip_lock:
+        if (gfs2_is_stuffed(ip)) {
+                struct pagevec lru_pvec;
+                pagevec_init(&lru_pvec, 0);
+                for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+                        struct page *page = list_entry(pages->prev, struct page, lru);
+                        prefetchw(&page->flags);
+                        list_del(&page->lru);
+                        if (!add_to_page_cache(page, mapping,
+                                               page->index, GFP_KERNEL)) {
+                                /* Readahead is best effort: the result is not propagated */
+                                ret = stuffed_readpage(ip, page);
+                                unlock_page(page);
+                                if (!pagevec_add(&lru_pvec, page))
+                                         __pagevec_lru_add(&lru_pvec);
+                        } else {
+                                page_cache_release(page);
+                        }
+                }
+                pagevec_lru_add(&lru_pvec);
+                ret = 0;
+        } else {
+                /* What we really want to do .... */
+                ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
+        }
+        if (do_unlock) {
+                gfs2_glock_dq_m(1, &gh);
+                gfs2_holder_uninit(&gh);
+        }
+out:
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                ret = -EIO;
+        return ret;
+out_noerror:
+        ret = 0;
+out_unlock:
+        /*
+         * We can't do any I/O right now, so drop every page still on the
+         * list.  These pages were never added to the page cache and so
+         * were never locked; they must not be passed to unlock_page().
+         */
+        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+                struct page *page = list_entry(pages->prev, struct page, lru);
+                list_del(&page->lru);
+                page_cache_release(page);
+        }
+        if (do_unlock)
+                gfs2_holder_uninit(&gh);
+        goto out;
+}
+/**
+ * gfs2_prepare_write - Prepare to write a page to a file
+ * @file: The file to write to
+ * @page: The page which is to be prepared for writing
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * On success the exclusive glock held through ip->i_gh, the transaction
+ * and (if allocation was required) the quota lock and block reservation
+ * are all left in place for gfs2_commit_write() to release.  On failure
+ * everything acquired so far is undone before returning.
+ *
+ * Returns: errno
+ */
+static int gfs2_prepare_write(struct file *file, struct page *page,
+                              unsigned from, unsigned to)
+{
+        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+        unsigned int data_blocks, ind_blocks, rblocks;
+        int alloc_required;
+        int error = 0;
+        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
+        loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+        struct gfs2_alloc *al;
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
+        error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
+        if (error)
+                goto out_uninit;
+        gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
+        /* The length of the write is to - from; from - to would wrap */
+        error = gfs2_write_alloc_required(ip, pos, to - from, &alloc_required);
+        if (error)
+                goto out_unlock;
+        if (alloc_required) {
+                al = gfs2_alloc_get(ip);
+                error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                if (error)
+                        goto out_alloc_put;
+                error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+                if (error)
+                        goto out_qunlock;
+                al->al_requested = data_blocks + ind_blocks;
+                error = gfs2_inplace_reserve(ip);
+                if (error)
+                        goto out_qunlock;
+        }
+        rblocks = RES_DINODE + ind_blocks;
+        if (gfs2_is_jdata(ip))
+                rblocks += data_blocks ? data_blocks : 1;
+        if (ind_blocks || data_blocks)
+                rblocks += RES_STATFS + RES_QUOTA;
+        error = gfs2_trans_begin(sdp, rblocks, 0);
+        if (error)
+                /* No transaction was started, so none must be ended */
+                goto out_trans_fail;
+        if (gfs2_is_stuffed(ip)) {
+                /* Unstuff if the write ends beyond the space after the dinode */
+                if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+                        error = gfs2_unstuff_dinode(ip, page);
+                        if (error == 0)
+                                goto prepare_write;
+                } else if (!PageUptodate(page))
+                        error = stuffed_readpage(ip, page);
+                goto out;
+        }
+prepare_write:
+        error = block_prepare_write(page, from, to, gfs2_get_block);
+out:
+        /* The labels below unwind in reverse order of acquisition */
+        if (error) {
+                gfs2_trans_end(sdp);
+out_trans_fail:
+                if (alloc_required) {
+                        gfs2_inplace_release(ip);
+out_qunlock:
+                        gfs2_quota_unlock(ip);
+out_alloc_put:
+                        gfs2_alloc_put(ip);
+                }
+out_unlock:
+                gfs2_glock_dq_m(1, &ip->i_gh);
+out_uninit:
+                gfs2_holder_uninit(&ip->i_gh);
+        }
+        return error;
+}
+/**
+ * gfs2_commit_write - Commit write to a file
+ * @file: The file to write to
+ * @page: The page containing the data
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * Copies the data into the dinode block for a stuffed inode, or hands the
+ * page to generic_commit_write() otherwise, then updates size, mode and
+ * timestamps in the on-disk dinode.  Ends the transaction and releases the
+ * reservation, quota lock and glock (ip->i_gh) on both the success and
+ * the failure paths, except when the glock assertion itself fails.
+ *
+ * Returns: errno
+ */
+static int gfs2_commit_write(struct file *file, struct page *page,
+                             unsigned from, unsigned to)
+{
+        struct inode *inode = page->mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        int error = -EOPNOTSUPP;
+        struct buffer_head *dibh;
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_dinode *di;
+        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
+                goto fail_nounlock;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto fail_endtrans;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        di = (struct gfs2_dinode *)dibh->b_data;
+        if (gfs2_is_stuffed(ip)) {
+                u64 file_size;
+                void *kaddr;
+                file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
+                kaddr = kmap_atomic(page, KM_USER0);
+                /* Stuffed data lives directly after the dinode header */
+                memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
+                       kaddr + from, to - from);
+                /* kunmap_atomic() takes the mapped address, not the struct page */
+                kunmap_atomic(kaddr, KM_USER0);
+                SetPageUptodate(page);
+                if (inode->i_size < file_size)
+                        i_size_write(inode, file_size);
+        } else {
+                if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
+                    gfs2_is_jdata(ip))
+                        gfs2_page_add_databufs(ip, page, from, to);
+                error = generic_commit_write(file, page, from, to);
+                if (error)
+                        goto fail;
+        }
+        /* The on-disk size only ever grows here */
+        if (ip->i_di.di_size < inode->i_size) {
+                ip->i_di.di_size = inode->i_size;
+                di->di_size = cpu_to_be64(inode->i_size);
+        }
+        di->di_mode = cpu_to_be32(inode->i_mode);
+        di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+        di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+        di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+        brelse(dibh);
+        gfs2_trans_end(sdp);
+        /* A non-zero al_requested means gfs2_prepare_write() reserved blocks */
+        if (al->al_requested) {
+                gfs2_inplace_release(ip);
+                gfs2_quota_unlock(ip);
+                gfs2_alloc_put(ip);
+        }
+        gfs2_glock_dq_m(1, &ip->i_gh);
+        gfs2_holder_uninit(&ip->i_gh);
+        return 0;
+fail:
+        brelse(dibh);
+fail_endtrans:
+        gfs2_trans_end(sdp);
+        if (al->al_requested) {
+                gfs2_inplace_release(ip);
+                gfs2_quota_unlock(ip);
+                gfs2_alloc_put(ip);
+        }
+        gfs2_glock_dq_m(1, &ip->i_gh);
+        gfs2_holder_uninit(&ip->i_gh);
+fail_nounlock:
+        ClearPageUptodate(page);
+        return error;
+}
+/**
+ * gfs2_bmap - Block map function
+ * @mapping: Address space info
+ * @lblock: The block to map
+ *
+ * The lookup is done under a shared glock on the inode.  A stuffed inode
+ * is never passed to generic_block_bmap() and always reports 0.
+ *
+ * Returns: The disk address for the block or 0 on hole or error
+ */
+static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
+{
+        struct gfs2_inode *ip = GFS2_I(mapping->host);
+        struct gfs2_holder holder;
+        sector_t result;
+        if (gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &holder))
+                return 0;
+        if (gfs2_is_stuffed(ip))
+                result = 0;
+        else
+                result = generic_block_bmap(mapping, lblock, gfs2_get_block);
+        gfs2_glock_dq_uninit(&holder);
+        return result;
+}
+/*
+ * discard_buffer - detach a buffer from its bufdata and reset its state
+ * @sdp: the filesystem
+ * @bh: the buffer
+ *
+ * Under the log lock, breaks the link in both directions between @bh and
+ * its gfs2_bufdata (if it has one).  Then, under the buffer lock, clears
+ * the dirty, mapped, req, new and delay bits and the block device pointer.
+ */
+static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+        struct gfs2_bufdata *bd;
+        gfs2_log_lock(sdp);
+        bd = bh->b_private;
+        if (bd) {
+                /* The bufdata itself is not freed here, only unlinked */
+                bd->bd_bh = NULL;
+                bh->b_private = NULL;
+        }
+        gfs2_log_unlock(sdp);
+        lock_buffer(bh);
+        clear_buffer_dirty(bh);
+        bh->b_bdev = NULL;
+        clear_buffer_mapped(bh);
+        clear_buffer_req(bh);
+        clear_buffer_new(bh);
+        clear_buffer_delay(bh);
+        unlock_buffer(bh);
+}
+/*
+ * gfs2_invalidatepage - discard a page's buffers from @offset onwards
+ * @page: the locked page
+ * @offset: byte offset within the page; buffers starting at or after it
+ *          are discarded
+ *
+ * When @offset is 0 the page is also passed to try_to_release_page().
+ */
+static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+        struct buffer_head *first, *cur;
+        unsigned int pos = 0;
+        BUG_ON(!PageLocked(page));
+        if (!page_has_buffers(page))
+                return;
+        first = page_buffers(page);
+        cur = first;
+        do {
+                /* Pick up the link and size before the buffer is discarded */
+                struct buffer_head *following = cur->b_this_page;
+                unsigned int size = cur->b_size;
+                if (offset <= pos)
+                        discard_buffer(sdp, cur);
+                pos += size;
+                cur = following;
+        } while (cur != first);
+        if (offset == 0)
+                try_to_release_page(page, 0);
+}
+/*
+ * gfs2_direct_IO - direct I/O under a shared glock
+ * @rw: READ or WRITE
+ * @iocb: the I/O control block; supplies the file
+ * @iov: the I/O vector
+ * @offset: file offset of the I/O
+ * @nr_segs: number of segments in @iov
+ *
+ * Returns the result of blockdev_direct_IO_own_locking(), the glock
+ * error if the lock could not be taken, or 0 when the request is past
+ * i_size or the inode is jdata or stuffed (no direct I/O is attempted).
+ */
+static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
+                              const struct iovec *iov, loff_t offset,
+                              unsigned long nr_segs)
+{
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int rv;
+        if (rw == READ)
+                mutex_lock(&inode->i_mutex);
+        /*
+         * Shared lock, even if it's a write, since we do no allocation
+         * on this path. All we need change is atime.
+         */
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        rv = gfs2_glock_nq_m_atime(1, &gh);
+        if (rv)
+                /* The glock was never acquired, so it must not be dequeued */
+                goto out_uninit;
+        if (offset > i_size_read(inode))
+                goto out;
+        /*
+         * Should we return an error here? I can't see that O_DIRECT for
+         * a journaled file makes any sense. For now we'll silently fall
+         * back to buffered I/O, likewise we do the same for stuffed
+         * files since they are (a) small and (b) unaligned.
+         */
+        if (gfs2_is_jdata(ip))
+                goto out;
+        if (gfs2_is_stuffed(ip))
+                goto out;
+        rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
+                                            inode->i_sb->s_bdev,
+                                            iov, offset, nr_segs,
+                                            gfs2_get_block_direct, NULL);
+out:
+        gfs2_glock_dq_m(1, &gh);
+out_uninit:
+        gfs2_holder_uninit(&gh);
+        if (rw == READ)
+                mutex_unlock(&inode->i_mutex);
+        return rv;
+}
+/**
+ * stuck_releasepage - report a buffer that gfs2_releasepage() cannot free
+ * @bh: the buffer we're stuck on
+ *
+ * Logs the state of the buffer, its gfs2_bufdata, its glock and, for
+ * inode glocks, the inode.  A static counter stops the reports once a
+ * handful have been printed.
+ */
+static void stuck_releasepage(struct buffer_head *bh)
+{
+        static unsigned limit = 0;
+        struct inode *inode = bh->b_page->mapping->host;
+        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_bufdata *bd = bh->b_private;
+        struct gfs2_glock *gl;
+        struct gfs2_inode *ip;
+        unsigned int height;
+        if (limit > 3)
+                return;
+        limit++;
+        fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
+        fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
+                (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
+        fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
+        fs_warn(sdp, "bh->b_private = %s\n", bd ? "!NULL" : "NULL");
+        if (bd == NULL)
+                return;
+        gl = bd->bd_gl;
+        fs_warn(sdp, "gl = (%u, %llu)\n",
+                gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
+        fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
+                list_empty(&bd->bd_list_tr) ? "no" : "yes",
+                list_empty(&bd->bd_le.le_list) ? "no" : "yes");
+        /* Only inode glocks have an inode worth describing */
+        if (gl->gl_ops != &gfs2_inode_glops)
+                return;
+        ip = gl->gl_object;
+        if (ip == NULL)
+                return;
+        fs_warn(sdp, "ip = %llu %llu\n",
+                (unsigned long long)ip->i_num.no_formal_ino,
+                (unsigned long long)ip->i_num.no_addr);
+        for (height = 0; height < GFS2_MAX_META_HEIGHT; height++)
+                fs_warn(sdp, "ip->i_cache[%u] = %s\n",
+                        height, ip->i_cache[height] ? "!NULL" : "NULL");
+}
+/**
+ * gfs2_releasepage - free the metadata associated with a page
+ * @page: the page that's being released
+ * @gfp_mask: passed from Linux VFS, ignored by us
+ *
+ * Detach the gfs2_bufdata from every buffer of the page, then call
+ * try_to_free_buffers() so that the buffers themselves can be released.
+ *
+ * Returns: 0 if a buffer was still referenced and we gave up waiting for
+ *          it, otherwise the result of try_to_free_buffers()
+ */
+int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+        struct inode *aspace = page->mapping->host;
+        struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+        struct buffer_head *bh, *head;
+        struct gfs2_bufdata *bd;
+        /* Deadline after which a still-referenced buffer is reported as stuck */
+        unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
+        if (!page_has_buffers(page))
+                goto out;
+        head = bh = page_buffers(page);
+        do {
+                /* Wait for every other reference to this buffer to be dropped */
+                while (atomic_read(&bh->b_count)) {
+                        /* Do not wait at all when i_writecount is zero */
+                        if (!atomic_read(&aspace->i_writecount))
+                                return 0;
+                        if (time_after_eq(jiffies, t)) {
+                                stuck_releasepage(bh);
+                                /* should we withdraw here? */
+                                return 0;
+                        }
+                        yield();
+                }
+                gfs2_assert_warn(sdp, !buffer_pinned(bh));
+                gfs2_assert_warn(sdp, !buffer_dirty(bh));
+                /* Break the bh <-> bd link under the log lock */
+                gfs2_log_lock(sdp);
+                bd = bh->b_private;
+                if (bd) {
+                        gfs2_assert_warn(sdp, bd->bd_bh == bh);
+                        gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
+                        gfs2_assert_warn(sdp, !bd->bd_ail);
+                        bd->bd_bh = NULL;
+                        /* Still on a log element list: must not be freed here */
+                        if (!list_empty(&bd->bd_le.le_list))
+                                bd = NULL;
+                        bh->b_private = NULL;
+                }
+                gfs2_log_unlock(sdp);
+                if (bd)
+                        kmem_cache_free(gfs2_bufdata_cachep, bd);
+                bh = bh->b_this_page;
+        } while (bh != head);
+out:
+        return try_to_free_buffers(page);
+}
+/*
+ * Address space operations table for GFS2 files; declared in
+ * ops_address.h for use outside this file.
+ */
+const struct address_space_operations gfs2_file_aops = {
+        .writepage = gfs2_writepage,
+        .readpage = gfs2_readpage,
+        .readpages = gfs2_readpages,
+        .sync_page = block_sync_page,
+        .prepare_write = gfs2_prepare_write,
+        .commit_write = gfs2_commit_write,
+        .bmap = gfs2_bmap,
+        .invalidatepage = gfs2_invalidatepage,
+        .releasepage = gfs2_releasepage,
+        .direct_IO = gfs2_direct_IO,
+};
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_ADDRESS_DOT_H__
+#define __OPS_ADDRESS_DOT_H__
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+/* Address space operations table, defined in ops_address.c */
+extern const struct address_space_operations gfs2_file_aops;
+extern int gfs2_get_block(struct inode *inode, sector_t lblock,
+                          struct buffer_head *bh_result, int create);
+/* Also installed as the .releasepage method of gfs2_file_aops */
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "ops_dentry.h"
+#include "util.h"
+/**
+ * gfs2_drevalidate - Check directory lookup consistency
+ * @dentry: the mapping to check
+ * @nd: not used by this function
+ *
+ * Check to make sure the lookup necessary to arrive at this inode from its
+ * parent is still good.
+ *
+ * Returns: 1 if the dentry is ok, 0 if it isn't or if the check itself
+ *          failed
+ */
+static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        struct dentry *parent = dget_parent(dentry);
+        struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+        struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+        struct inode *inode = dentry->d_inode;
+        struct gfs2_holder d_gh;
+        struct gfs2_inode *ip;
+        struct gfs2_inum inum;
+        unsigned int type;
+        int error;
+        if (inode && is_bad_inode(inode))
+                goto invalid;
+        /* With the localcaching argument set the directory is not re-read */
+        if (sdp->sd_args.ar_localcaching)
+                goto valid;
+        error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+        if (error)
+                goto fail;
+        error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
+        switch (error) {
+        case 0:
+                /* Name exists on disk but the dentry is negative */
+                if (!inode)
+                        goto invalid_gunlock;
+                break;
+        case -ENOENT:
+                /* Name is absent: only a negative dentry is still correct */
+                if (!inode)
+                        goto valid_gunlock;
+                goto invalid_gunlock;
+        default:
+                goto fail_gunlock;
+        }
+        /* Positive dentry and the name exists: it must name the same inode */
+        ip = GFS2_I(inode);
+        if (!gfs2_inum_equal(&ip->i_num, &inum))
+                goto invalid_gunlock;
+        if (IF2DT(ip->i_di.di_mode) != type) {
+                gfs2_consist_inode(dip);
+                goto fail_gunlock;
+        }
+valid_gunlock:
+        gfs2_glock_dq_uninit(&d_gh);
+valid:
+        dput(parent);
+        return 1;
+invalid_gunlock:
+        gfs2_glock_dq_uninit(&d_gh);
+invalid:
+        if (inode && S_ISDIR(inode->i_mode)) {
+                /* A directory with submounts is reported as valid */
+                if (have_submounts(dentry))
+                        goto valid;
+                shrink_dcache_parent(dentry);
+        }
+        d_drop(dentry);
+        dput(parent);
+        return 0;
+fail_gunlock:
+        gfs2_glock_dq_uninit(&d_gh);
+fail:
+        /* Unlike the invalid path, the dentry is not dropped here */
+        dput(parent);
+        return 0;
+}
+/*
+ * gfs2_dhash - d_hash method: hash a name with gfs2_disk_hash()
+ * @dentry: not used
+ * @str: the name to hash; its hash field is filled in
+ *
+ * Returns: 0
+ */
+static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+{
+        unsigned int hash = gfs2_disk_hash(str->name, str->len);
+        str->hash = hash;
+        return 0;
+}
+/* Dentry operations for GFS2: revalidation and name hashing */
+struct dentry_operations gfs2_dops = {
+        .d_revalidate = gfs2_drevalidate,
+        .d_hash = gfs2_dhash,
+};
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_DENTRY_DOT_H__
+#define __OPS_DENTRY_DOT_H__
+#include <linux/dcache.h>
+/* Dentry operations table, defined in ops_dentry.c */
+extern struct dentry_operations gfs2_dops;
+#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "ops_export.h"
+#include "rgrp.h"
+#include "util.h"
+/*
+ * gfs2_decode_fh - turn an exported file handle back into a dentry
+ * @sb: the super block
+ * @fh: the handle, an array of big-endian 32-bit words
+ * @fh_len: length of the handle in 32-bit words
+ * @fh_type: handle type; must be equal to @fh_len
+ * @acceptable: callback passed through to find_exported_dentry()
+ * @context: argument for @acceptable
+ *
+ * Words 0-3 hold the inode's own number.  A large handle additionally
+ * carries the parent's inode number in words 4-7 and a mode in word 8.
+ *
+ * Returns: NULL if the handle has an unrecognised size, otherwise the
+ *          result of find_exported_dentry()
+ */
+static struct dentry *gfs2_decode_fh(struct super_block *sb,
+                                     __u32 *fh,
+                                     int fh_len,
+                                     int fh_type,
+                                     int (*acceptable)(void *context,
+                                                       struct dentry *dentry),
+                                     void *context)
+{
+        struct gfs2_fh_obj fh_obj;
+        struct gfs2_inum *this, parent;
+        if (fh_type != fh_len)
+                return NULL;
+        this            = &fh_obj.this;
+        fh_obj.imode    = DT_UNKNOWN;
+        /* A small handle has no parent: leave it all zeroes */
+        memset(&parent, 0, sizeof(struct gfs2_inum));
+        switch (fh_type) {
+        case GFS2_LARGE_FH_SIZE:
+                parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
+                parent.no_formal_ino |= be32_to_cpu(fh[5]);
+                parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
+                parent.no_addr |= be32_to_cpu(fh[7]);
+                fh_obj.imode = be32_to_cpu(fh[8]);
+                /* fall through: a large handle starts with the small one */
+        case GFS2_SMALL_FH_SIZE:
+                this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
+                this->no_formal_ino |= be32_to_cpu(fh[1]);
+                this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
+                this->no_addr |= be32_to_cpu(fh[3]);
+                break;
+        default:
+                return NULL;
+        }
+        return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
+                                                    acceptable, context);
+}
+/*
+ * gfs2_encode_fh - build an exportable file handle for a dentry
+ * @dentry: the dentry to encode
+ * @fh: output buffer of 32-bit words, filled in big-endian
+ * @len: in: words available in @fh; out: words written
+ * @connectable: non-zero to also record the parent directory
+ *
+ * Returns: the handle size in words (also stored in @len), or 255 if
+ *          @fh is too small for the requested handle
+ */
+static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+                          int connectable)
+{
+        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = inode->i_sb;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        int room = *len;
+        if (room < GFS2_SMALL_FH_SIZE)
+                return 255;
+        if (connectable && room < GFS2_LARGE_FH_SIZE)
+                return 255;
+        /* Words 0-3: this inode's number, high word first */
+        fh[0] = cpu_to_be32((__u32)(ip->i_num.no_formal_ino >> 32));
+        fh[1] = cpu_to_be32((__u32)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
+        fh[2] = cpu_to_be32((__u32)(ip->i_num.no_addr >> 32));
+        fh[3] = cpu_to_be32((__u32)(ip->i_num.no_addr & 0xFFFFFFFF));
+        *len = GFS2_SMALL_FH_SIZE;
+        if (!connectable)
+                return *len;
+        if (inode == sb->s_root->d_inode)
+                return *len;
+        /* Pin the parent's inode while d_parent is stable under d_lock */
+        spin_lock(&dentry->d_lock);
+        inode = dentry->d_parent->d_inode;
+        ip = GFS2_I(inode);
+        igrab(inode);
+        spin_unlock(&dentry->d_lock);
+        /* Words 4-7: the parent's number; word 8: the parent's mode */
+        fh[4] = cpu_to_be32((__u32)(ip->i_num.no_formal_ino >> 32));
+        fh[5] = cpu_to_be32((__u32)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
+        fh[6] = cpu_to_be32((__u32)(ip->i_num.no_addr >> 32));
+        fh[7] = cpu_to_be32((__u32)(ip->i_num.no_addr & 0xFFFFFFFF));
+        fh[8]  = cpu_to_be32(inode->i_mode);
+        fh[9]  = 0;     /* pad to double word */
+        *len = GFS2_LARGE_FH_SIZE;
+        iput(inode);
+        return *len;
+}
+/* State passed from gfs2_get_name() to its get_name_filldir() callback */
+struct get_name_filldir {
+        struct gfs2_inum inum;  /* inode number being searched for */
+        char *name;             /* caller's buffer that receives the name */
+};
+/*
+ * get_name_filldir - gfs2_dir_read() callback that looks for one inode
+ * @opaque: really a struct get_name_filldir
+ * @name: name of the current directory entry (@length bytes)
+ * @length: length of @name
+ * @offset: not used
+ * @inum: inode number of the current entry
+ * @type: not used
+ *
+ * Returns: 1 once the wanted entry was found and its name copied out
+ *          with a terminating NUL, 0 for every other entry
+ */
+static int get_name_filldir(void *opaque, const char *name, unsigned int length,
+                            u64 offset, struct gfs2_inum *inum,
+                            unsigned int type)
+{
+        struct get_name_filldir *want = opaque;
+        if (gfs2_inum_equal(inum, &want->inum)) {
+                memcpy(want->name, name, length);
+                want->name[length] = 0;
+                return 1;
+        }
+        return 0;
+}
+/*
+ * gfs2_get_name - find the name under which @child appears in @parent
+ * @parent: the directory to search
+ * @name: buffer that receives the NUL terminated name
+ * @child: the dentry whose inode is looked for
+ *
+ * Returns: 0 on success, -EINVAL for unusable arguments, -ENOENT if no
+ *          entry refers to the child's inode, or another errno
+ */
+static int gfs2_get_name(struct dentry *parent, char *name,
+                         struct dentry *child)
+{
+        struct inode *dir = parent->d_inode;
+        struct inode *inode = child->d_inode;
+        struct get_name_filldir gnfd;
+        struct gfs2_holder gh;
+        u64 pos = 0;
+        int error;
+        if (!dir || !S_ISDIR(dir->i_mode) || !inode)
+                return -EINVAL;
+        /* An empty name afterwards means that nothing matched */
+        *name = 0;
+        gnfd.inum = GFS2_I(inode)->i_num;
+        gnfd.name = name;
+        error = gfs2_glock_nq_init(GFS2_I(dir)->i_gl, LM_ST_SHARED, 0, &gh);
+        if (error)
+                return error;
+        error = gfs2_dir_read(dir, &pos, &gnfd, get_name_filldir);
+        gfs2_glock_dq_uninit(&gh);
+        if (error)
+                return error;
+        return *name ? 0 : -ENOENT;
+}
+/*
+ * gfs2_get_parent - get a dentry for the parent directory of @child
+ * @child: the dentry whose ".." entry is looked up
+ *
+ * Returns: an anonymous dentry for the parent, or an ERR_PTR()
+ */
+static struct dentry *gfs2_get_parent(struct dentry *child)
+{
+        struct qstr dotdot;
+        struct inode *parent_inode;
+        struct dentry *result;
+        gfs2_str2qstr(&dotdot, "..");
+        parent_inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
+        if (parent_inode == NULL)
+                return ERR_PTR(-ENOENT);
+        /*
+         * A failed lookup hands back the error encoded in the inode
+         * pointer; re-encode it as a dentry pointer for our caller.
+         */
+        if (IS_ERR(parent_inode))
+                return ERR_PTR(PTR_ERR(parent_inode));
+        result = d_alloc_anon(parent_inode);
+        if (result)
+                return result;
+        iput(parent_inode);
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * gfs2_get_dentry - get a dentry for the inode named by a file handle
+ * @sb: the super block
+ * @inum_obj: really a struct gfs2_fh_obj (inode number plus mode)
+ *
+ * Returns: an anonymous dentry for the inode, or an ERR_PTR():
+ *          -ESTALE if a cached inode has a different formal inode number,
+ *          if the block is not a dinode, or if gfs2_inode_lookup() returns
+ *          NULL; -EINVAL if no resource group contains the block; -EIO for
+ *          an inode flagged GFS2_DIF_SYSTEM; -ENOMEM; or the error of a
+ *          failed lock or inode read
+ */
+static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
+        struct gfs2_inum *inum = &fh_obj->this;
+        struct gfs2_holder i_gh, ri_gh, rgd_gh;
+        struct gfs2_rgrpd *rgd;
+        struct inode *inode;
+        struct dentry *dentry;
+        int error;
+        /* System files? */
+        inode = gfs2_ilookup(sb, inum);
+        if (inode) {
+                if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
+                        iput(inode);
+                        return ERR_PTR(-ESTALE);
+                }
+                goto out_inode;
+        }
+        error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+                                  LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
+                                  &i_gh);
+        if (error)
+                return ERR_PTR(error);
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                goto fail;
+        error = -EINVAL;
+        rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
+        if (!rgd)
+                goto fail_rindex;
+        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+        if (error)
+                goto fail_rindex;
+        /* The handle is stale unless the block is still a dinode */
+        error = -ESTALE;
+        if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
+                goto fail_rgd;
+        gfs2_glock_dq_uninit(&rgd_gh);
+        gfs2_glock_dq_uninit(&ri_gh);
+        /* error is still -ESTALE here, which covers a NULL return */
+        inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
+        if (!inode)
+                goto fail;
+        if (IS_ERR(inode)) {
+                error = PTR_ERR(inode);
+                goto fail;
+        }
+        error = gfs2_inode_refresh(GFS2_I(inode));
+        if (error) {
+                iput(inode);
+                goto fail;
+        }
+        /* Inodes flagged as system inodes are refused */
+        error = -EIO;
+        if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+                iput(inode);
+                goto fail;
+        }
+        gfs2_glock_dq_uninit(&i_gh);
+out_inode:
+        dentry = d_alloc_anon(inode);
+        if (!dentry) {
+                iput(inode);
+                return ERR_PTR(-ENOMEM);
+        }
+        return dentry;
+fail_rgd:
+        gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+        gfs2_glock_dq_uninit(&ri_gh);
+fail:
+        gfs2_glock_dq_uninit(&i_gh);
+        return ERR_PTR(error);
+}
+/*
+ * Export operations for GFS2.  find_exported_dentry is not set here even
+ * though gfs2_decode_fh() calls it through this table -- presumably it is
+ * filled in elsewhere; verify against the export code.
+ */
+struct export_operations gfs2_export_ops = {
+        .decode_fh = gfs2_decode_fh,
+        .encode_fh = gfs2_encode_fh,
+        .get_name = gfs2_get_name,
+        .get_parent = gfs2_get_parent,
+        .get_dentry = gfs2_get_dentry,
+};
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_EXPORT_DOT_H__
+#define __OPS_EXPORT_DOT_H__
+/* File handle sizes, counted in 32-bit words */
+#define GFS2_SMALL_FH_SIZE 4
+#define GFS2_LARGE_FH_SIZE 10
+extern struct export_operations gfs2_export_ops;
+/* Decoded form of a file handle's own-inode part */
+struct gfs2_fh_obj {
+        struct gfs2_inum this;  /* inode number of the object itself */
+        __u32            imode; /* mode word from a large handle, else DT_UNKNOWN */
+};
+#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/ext2_fs.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_file.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "eaops.h"
+/* For regular, non-NFS */
+/* State handed by gfs2_readdir() to its filldir_func() callback */
+struct filldir_reg {
+        struct gfs2_sbd *fdr_sbd;       /* super block data of the directory */
+        int fdr_prefetch;               /* non-zero: prefetch glocks of entries */
+        filldir_t fdr_filldir;          /* the VFS callback to forward to */
+        void *fdr_opaque;               /* argument for fdr_filldir */
+};
+/*
+ * Most fields left uninitialised to catch anybody who tries to
+ * use them. f_flags set to prevent file_accessed() from touching
+ * any other part of this. Its use is purely as a flag so that we
+ * know (in readpage()) whether or not to do locking.
+ * gfs2_internal_read() passes its address as the file argument.
+ */
+struct file gfs2_internal_file_sentinel = {
+        .f_flags = O_NOATIME|O_RDONLY,
+};
+/*
+ * gfs2_read_actor - copy part of a page into a kernel buffer
+ * @desc: read descriptor; arg.buf is the destination
+ * @page: the source page
+ * @offset: offset of the data within @page
+ * @size: number of bytes offered by the caller
+ *
+ * At most desc->count bytes are copied; the descriptor is advanced by
+ * the amount copied.
+ *
+ * Returns: the number of bytes copied
+ */
+static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
+                           unsigned long offset, unsigned long size)
+{
+        unsigned long left = desc->count;
+        unsigned long chunk = (size > left) ? left : size;
+        char *src;
+        src = kmap(page);
+        memcpy(desc->arg.buf, src + offset, chunk);
+        kunmap(page);
+        desc->arg.buf += chunk;
+        desc->written += chunk;
+        desc->count = left - chunk;
+        return chunk;
+}
+/*
+ * gfs2_internal_read - read from an inode into a kernel buffer
+ * @ip: the inode to read from
+ * @ra_state: read-ahead state handed to do_generic_mapping_read()
+ * @buf: destination buffer
+ * @pos: file position; passed on to do_generic_mapping_read()
+ * @size: number of bytes wanted
+ *
+ * Returns: the number of bytes read if that is non-zero, otherwise the
+ *          descriptor's error value
+ */
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+                       char *buf, loff_t *pos, unsigned size)
+{
+        struct address_space *mapping = ip->i_inode.i_mapping;
+        read_descriptor_t desc;
+        desc.error = 0;
+        desc.count = size;
+        desc.arg.buf = buf;
+        desc.written = 0;
+        do_generic_mapping_read(mapping, ra_state,
+                                &gfs2_internal_file_sentinel, pos, &desc,
+                                gfs2_read_actor);
+        if (desc.written)
+                return desc.written;
+        return desc.error;
+}
+/**
+ * gfs2_llseek - seek to a location in a file
+ * @file: the file
+ * @offset: the offset
+ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ *
+ * Only SEEK_END looks at the file's size, so only that case takes the
+ * glock for the file around the seek.
+ *
+ * Returns: The new offset, or errno
+ */
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
+{
+        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+        struct gfs2_holder i_gh;
+        loff_t ret;
+        /* 2 is SEEK_END (see above); everything else needs no lock */
+        if (origin != 2)
+                return remote_llseek(file, offset, origin);
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+                                 &i_gh);
+        if (ret)
+                return ret;
+        ret = remote_llseek(file, offset, origin);
+        gfs2_glock_dq_uninit(&i_gh);
+        return ret;
+}
+/**
+ * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function (a struct filldir_reg)
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * After an entry has been passed on, the glocks of its inode are
+ * prefetched if prefetching is enabled; the "." entry is left out.
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+static int filldir_func(void *opaque, const char *name, unsigned int length,
+                        u64 offset, struct gfs2_inum *inum,
+                        unsigned int type)
+{
+        struct filldir_reg *fdr = opaque;
+        struct gfs2_sbd *sdp = fdr->fdr_sbd;
+        if (fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
+                             inum->no_addr, type))
+                return 1;
+        if (!fdr->fdr_prefetch)
+                return 0;
+        if (length == 1 && *name == '.')
+                return 0;
+        gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
+                                LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
+        gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
+                                LM_ST_SHARED, LM_FLAG_TRY);
+        return 0;
+}
+/**
+ * gfs2_readdir - Read directory entries from a directory
+ * @file: The directory to read from
+ * @dirent: Buffer for dirents
+ * @filldir: Function used to do the copying
+ *
+ * The directory is read under a shared glock, starting at the file's
+ * current position, which is updated afterwards.
+ *
+ * Returns: errno
+ */
+static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+        struct inode *dir = file->f_mapping->host;
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct filldir_reg fdr = {
+                .fdr_sbd = GFS2_SB(dir),
+                .fdr_prefetch = 1,
+                .fdr_filldir = filldir,
+                .fdr_opaque = dirent,
+        };
+        struct gfs2_holder d_gh;
+        u64 pos = file->f_pos;
+        int error;
+        gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
+        error = gfs2_glock_nq_atime(&d_gh);
+        if (!error) {
+                error = gfs2_dir_read(dir, &pos, &fdr, filldir_func);
+                gfs2_glock_dq_uninit(&d_gh);
+                file->f_pos = pos;
+                return error;
+        }
+        /* The lock was never obtained: only the holder needs undoing */
+        gfs2_holder_uninit(&d_gh);
+        return error;
+}
+/**
+ * fsflags_cvt - translate a flag word through a per-bit table
+ * @table: A table of 32 u32 flags
+ * @val: a 32 bit value to convert
+ *
+ * For every bit set in @val the table entry with that bit's number is
+ * ORed into the result.  This function can be used to convert between
+ * fsflags values and GFS2's own flags values.
+ *
+ * Returns: the converted flags
+ */
+static u32 fsflags_cvt(const u32 *table, u32 val)
+{
+        u32 out = 0;
+        unsigned int bit;
+        for (bit = 0; val != 0; bit++, val >>= 1) {
+                if (val & 1)
+                        out |= table[bit];
+        }
+        return out;
+}
+/*
+ * Table for fsflags_cvt(): indexed by the bit number of a generic fsflags
+ * bit, each populated slot holds the GFS2_DIF_* flag it translates to.
+ * The bit numbers are written out literally; the names in the trailing
+ * comments are the FS_*_FL constants they are believed to match (compare
+ * gfs2_to_fsflags) -- verify against linux/fs.h.
+ */
+static const u32 fsflags_to_gfs2[32] = {
+        [3] = GFS2_DIF_SYNC,            /* FS_SYNC_FL */
+        [4] = GFS2_DIF_IMMUTABLE,       /* FS_IMMUTABLE_FL */
+        [5] = GFS2_DIF_APPENDONLY,      /* FS_APPEND_FL */
+        [7] = GFS2_DIF_NOATIME,         /* FS_NOATIME_FL */
+        [12] = GFS2_DIF_EXHASH,         /* FS_INDEX_FL */
+        [14] = GFS2_DIF_JDATA,          /* FS_JOURNAL_DATA_FL */
+        [20] = GFS2_DIF_DIRECTIO,       /* FS_DIRECTIO_FL */
+};
+/*
+ * Table for fsflags_cvt() in the other direction: indexed by GFS2 flag
+ * bit number (gfs2fl_*), each slot holds the generic FS_*_FL flag that is
+ * reported for it.  The inherit flags map to the same FS_*_FL value as
+ * the flag they inherit.
+ */
+static const u32 gfs2_to_fsflags[32] = {
+        [gfs2fl_Sync] = FS_SYNC_FL,
+        [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
+        [gfs2fl_AppendOnly] = FS_APPEND_FL,
+        [gfs2fl_NoAtime] = FS_NOATIME_FL,
+        [gfs2fl_ExHash] = FS_INDEX_FL,
+        [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
+        [gfs2fl_Directio] = FS_DIRECTIO_FL,
+        [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
+        [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+};
+/*
+ * gfs2_get_flags - report an inode's flags to user space
+ * @filp: file pointer identifying the inode
+ * @ptr: user space location that receives the flags in FS_*_FL form
+ *
+ * Returns: 0 on success, -EFAULT if @ptr cannot be written, or the error
+ *          from taking the glock
+ */
+static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int error;
+        u32 fsflags;
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        error = gfs2_glock_nq_m_atime(1, &gh);
+        /* Without the lock there is nothing to dequeue, but the holder
+           set up above must still be uninitialised */
+        if (error)
+                goto out_uninit;
+        fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
+        if (put_user(fsflags, ptr))
+                error = -EFAULT;
+        gfs2_glock_dq_m(1, &gh);
+out_uninit:
+        gfs2_holder_uninit(&gh);
+        return error;
+}
+/*
+ * Flags that can be set by user space.  do_gfs2_set_flags() fails with
+ * -EINVAL when a request would change any flag outside this set.
+ */
+#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|                    \
+                             GFS2_DIF_DIRECTIO|                 \
+                             GFS2_DIF_IMMUTABLE|                \
+                             GFS2_DIF_APPENDONLY|               \
+                             GFS2_DIF_NOATIME|                  \
+                             GFS2_DIF_SYNC|                     \
+                             GFS2_DIF_SYSTEM|                   \
+                             GFS2_DIF_INHERIT_DIRECTIO|         \
+                             GFS2_DIF_INHERIT_JDATA)
+/**
+ * do_gfs2_set_flags - set flags on an inode
+ * @filp: file pointer identifying the inode
+ * @reqflags: The flags to set (GFS2_DIF_* values)
+ * @mask: Indicates which flags are valid
+ *
+ * Returns: 0 on success or when nothing changes, -EINVAL if a flag that
+ *          user space may not set would change, -EPERM or another errno
+ */
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct buffer_head *bh;
+        struct gfs2_holder gh;
+        int error;
+        u32 new_flags, flags;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (error)
+                return error;
+        flags = ip->i_di.di_flags;
+        /* Bits outside @mask keep their current value */
+        new_flags = (flags & ~mask) | (reqflags & mask);
+        if ((new_flags ^ flags) == 0)
+                goto out;
+        /*
+         * On a directory a requested change of JDATA or DIRECTIO is turned
+         * into a change of the matching INHERIT flag: the XOR puts the
+         * plain bit back to its current value and toggles the inherit bit.
+         */
+        if (S_ISDIR(inode->i_mode)) {
+                if ((new_flags ^ flags) & GFS2_DIF_JDATA)
+                        new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
+                if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
+                        new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
+        }
+        error = -EINVAL;
+        if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
+                goto out;
+        error = -EPERM;
+        if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
+                goto out;
+        if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
+                goto out;
+        /* Changing the immutable flag needs CAP_LINUX_IMMUTABLE */
+        if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
+            !capable(CAP_LINUX_IMMUTABLE))
+                goto out;
+        if (!IS_IMMUTABLE(inode)) {
+                error = permission(inode, MAY_WRITE, NULL);
+                if (error)
+                        goto out;
+        }
+        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (error)
+                goto out;
+        error = gfs2_meta_inode_buffer(ip, &bh);
+        if (error)
+                goto out_trans_end;
+        /* Update the in-core flags and write the dinode into its buffer */
+        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        ip->i_di.di_flags = new_flags;
+        gfs2_dinode_out(&ip->i_di, bh->b_data);
+        brelse(bh);
+out_trans_end:
+        gfs2_trans_end(sdp);
+out:
+        gfs2_glock_dq_uninit(&gh);
+        return error;
+}
+/*
+ * gfs2_set_flags - set an inode's flags from a user space flag word
+ * @filp: file pointer identifying the inode
+ * @ptr: user space location holding the requested FS_*_FL flags
+ *
+ * Returns: -EFAULT if @ptr cannot be read, otherwise the result of
+ *          do_gfs2_set_flags() with every flag treated as valid
+ */
+static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+{
+        u32 requested;
+        if (get_user(requested, ptr))
+                return -EFAULT;
+        return do_gfs2_set_flags(filp,
+                                 fsflags_cvt(fsflags_to_gfs2, requested), ~0);
+}
+/*
+ * gfs2_ioctl - ioctl method; handles the get/set flags requests
+ * @filp: the file
+ * @cmd: the request
+ * @arg: for both requests, a user space pointer to a u32
+ *
+ * Returns: the result of the request, or -ENOTTY for any other @cmd
+ */
+static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+        if (cmd == FS_IOC_GETFLAGS)
+                return gfs2_get_flags(filp, (u32 __user *)arg);
+        if (cmd == FS_IOC_SETFLAGS)
+                return gfs2_set_flags(filp, (u32 __user *)arg);
+        return -ENOTTY;
+}
+/**
+ * gfs2_mmap - choose the VM operations for a mapping of a file
+ * @file: The file to map
+ * @vma: The VMA which described the mapping
+ *
+ * The file's glock is taken (and dropped again) with GL_ATIME set.
+ *
+ * Returns: 0 or error code
+ */
+static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        const unsigned long sharewrite = VM_MAYSHARE | VM_MAYWRITE;
+        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+        struct gfs2_holder i_gh;
+        int error;
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+        error = gfs2_glock_nq_atime(&i_gh);
+        if (error) {
+                gfs2_holder_uninit(&i_gh);
+                return error;
+        }
+        /* VM_MAYWRITE rather than VM_WRITE is tested because a later
+           mprotect() call can still turn VM_WRITE on. */
+        if ((vma->vm_flags & sharewrite) != sharewrite)
+                vma->vm_ops = &gfs2_vm_ops_private;
+        else
+                vma->vm_ops = &gfs2_vm_ops_sharewrite;
+        gfs2_glock_dq_uninit(&i_gh);
+        return 0;
+}
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * Allocates the per-open struct gfs2_file and stores it in
+ * @file->private_data.  For regular files the inode glock is taken shared
+ * while the large-file limit is checked and the inode's direct I/O flag is
+ * copied into @file->f_flags.
+ *
+ * Returns: errno
+ */
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder i_gh;
+        struct gfs2_file *fp;
+        int error;
+        fp = kzalloc(sizeof(*fp), GFP_KERNEL);
+        if (fp == NULL)
+                return -ENOMEM;
+        mutex_init(&fp->f_fl_mutex);
+        gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+        file->private_data = fp;
+        /* Only regular files need the checks below. */
+        if (!S_ISREG(ip->i_di.di_mode))
+                return 0;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+                                   &i_gh);
+        if (error)
+                goto fail;
+        if (!(file->f_flags & O_LARGEFILE) &&
+            ip->i_di.di_size > MAX_NON_LFS)
+                error = -EFBIG;
+        else if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
+                file->f_flags |= O_DIRECT;      /* honour the Direct I/O flag */
+        gfs2_glock_dq_uninit(&i_gh);
+        if (!error)
+                return 0;
+fail:
+        file->private_data = NULL;
+        kfree(fp);
+        return error;
+}
+/**
+ * gfs2_close - called to close a struct file
+ * @inode: the inode the struct file belongs to
+ * @file: the struct file being closed
+ *
+ * Detaches and frees the struct gfs2_file stored in @file->private_data.
+ *
+ * Returns: 0, or -EIO if no private data was attached
+ */
+static int gfs2_close(struct inode *inode, struct file *file)
+{
+        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_file *fp = file->private_data;
+        int missing;
+        file->private_data = NULL;
+        missing = gfs2_assert_warn(sdp, fp);
+        if (!missing) {
+                kfree(fp);
+                return 0;
+        }
+        return -EIO;
+}
+/**
+ * gfs2_fsync - flush the log for a file's inode glock
+ * @file: the file that points to the dentry (not used)
+ * @dentry: the dentry that points to the inode to sync
+ * @datasync: not used; the same flush is done either way
+ *
+ * Returns: 0 (always)
+ */
+static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+        struct gfs2_glock *gl = GFS2_I(dentry->d_inode)->i_gl;
+        gfs2_log_flush(gl->gl_sbd, gl);
+        return 0;
+}
+/**
+ * gfs2_lock - acquire/release a posix lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * With the "localflocks" mount argument the request is served by the
+ * generic posix lock code; otherwise it is passed to the lock module.
+ *
+ * Returns: errno
+ */
+static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct inode *host = file->f_mapping->host;
+        struct gfs2_inode *ip = GFS2_I(host);
+        struct gfs2_sbd *sdp = GFS2_SB(host);
+        struct lm_lockname name = {
+                .ln_number = ip->i_num.no_addr,
+                .ln_type = LM_TYPE_PLOCK,
+        };
+        struct file_lock conflict;
+        if (!(fl->fl_flags & FL_POSIX))
+                return -ENOLCK;
+        /* Refuse files that are setgid without group-execute. */
+        if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+                return -ENOLCK;
+        if (!sdp->sd_args.ar_localflocks) {
+                if (IS_GETLK(cmd))
+                        return gfs2_lm_plock_get(sdp, &name, file, fl);
+                if (fl->fl_type == F_UNLCK)
+                        return gfs2_lm_punlock(sdp, &name, file, fl);
+                return gfs2_lm_plock(sdp, &name, file, cmd, fl);
+        }
+        /* Node-local locking from here on. */
+        if (!IS_GETLK(cmd))
+                return posix_lock_file_wait(file, fl);
+        /* Report the conflicting lock if there is one, else F_UNLCK. */
+        if (posix_test_lock(file, fl, &conflict))
+                memcpy(fl, &conflict, sizeof(struct file_lock));
+        else
+                fl->fl_type = F_UNLCK;
+        return 0;
+}
+/**
+ * do_flock - take, or change the mode of, the flock glock for an open file
+ * @file: the file being locked; its private data holds the glock holder
+ * @cmd: the lock command; IS_SETLKW(@cmd) selects a blocking request
+ * @fl: the requested lock (F_WRLCK maps to exclusive, anything else to shared)
+ *
+ * Serialised per open file by fp->f_fl_mutex.
+ *
+ * Returns: 0 on success, -EAGAIN if a non-blocking request could not be
+ *          granted, or another error code
+ */
+static int do_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct gfs2_file *fp = file->private_data;
+        struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+        struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
+        struct gfs2_glock *gl;
+        unsigned int state;
+        int flags;
+        int error = 0;
+        state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+        /* Only a SETLKW request may block; everything else is a try-lock. */
+        flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+        mutex_lock(&fp->f_fl_mutex);
+        gl = fl_gh->gh_gl;
+        if (gl) {
+                /* Already held in the requested state: nothing to do. */
+                if (fl_gh->gh_state == state)
+                        goto out;
+                /* Different state: drop the current lock and re-acquire it
+                   below.  The extra hold is paired with the gfs2_glock_put()
+                   after gfs2_holder_init() (presumably it keeps gl alive
+                   across the dequeue - confirm). */
+                gfs2_glock_hold(gl);
+                flock_lock_file_wait(file,
+                                     &(struct file_lock){.fl_type = F_UNLCK});
+                gfs2_glock_dq_uninit(fl_gh);
+        } else {
+                /* First flock on this open file: look up/create the glock. */
+                error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
+                                      ip->i_num.no_addr, &gfs2_flock_glops,
+                                      CREATE, &gl);
+                if (error)
+                        goto out;
+        }
+        gfs2_holder_init(gl, state, flags, fl_gh);
+        gfs2_glock_put(gl);
+        error = gfs2_glock_nq(fl_gh);
+        if (error) {
+                gfs2_holder_uninit(fl_gh);
+                /* A failed try-lock is reported to the caller as -EAGAIN. */
+                if (error == GLR_TRYFAILED)
+                        error = -EAGAIN;
+        } else {
+                /* Glock granted: record the lock with the VFS as well. */
+                error = flock_lock_file_wait(file, fl);
+                gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+        }
+out:
+        mutex_unlock(&fp->f_fl_mutex);
+        return error;
+}
+/**
+ * do_unflock - release the flock held through an open file
+ * @file: the file being unlocked
+ * @fl: the unlock request, passed on to flock_lock_file_wait()
+ *
+ * The glock holder is dequeued only if one is currently initialised.
+ */
+static void do_unflock(struct file *file, struct file_lock *fl)
+{
+        struct gfs2_file *fp = file->private_data;
+        mutex_lock(&fp->f_fl_mutex);
+        flock_lock_file_wait(file, fl);
+        if (fp->f_fl_gh.gh_gl != NULL)
+                gfs2_glock_dq_uninit(&fp->f_fl_gh);
+        mutex_unlock(&fp->f_fl_mutex);
+}
+/**
+ * gfs2_flock - acquire/release a flock lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * With the "localflocks" mount argument the request is served by
+ * flock_lock_file_wait() alone; otherwise do_flock()/do_unflock() are used.
+ *
+ * Returns: errno
+ */
+static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct inode *host = file->f_mapping->host;
+        struct gfs2_inode *ip = GFS2_I(host);
+        struct gfs2_sbd *sdp = GFS2_SB(host);
+        if (!(fl->fl_flags & FL_FLOCK))
+                return -ENOLCK;
+        /* Refuse files that are setgid without group-execute. */
+        if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+                return -ENOLCK;
+        if (sdp->sd_args.ar_localflocks)
+                return flock_lock_file_wait(file, fl);
+        if (fl->fl_type != F_UNLCK)
+                return do_flock(file, cmd, fl);
+        do_unflock(file, fl);
+        return 0;
+}
+/*
+ * File operations table with the full data path (read/write/mmap/splice);
+ * presumably the one installed on regular files - confirm at the user.
+ */
+const struct file_operations gfs2_file_fops = {
+        /* open/close */
+        .open           = gfs2_open,
+        .release        = gfs2_close,
+        /* data path: generic VFS helpers plus GFS2's llseek, mmap and fsync */
+        .llseek         = gfs2_llseek,
+        .read           = do_sync_read,
+        .write          = do_sync_write,
+        .aio_read       = generic_file_aio_read,
+        .aio_write      = generic_file_aio_write,
+        .sendfile       = generic_file_sendfile,
+        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
+        .mmap           = gfs2_mmap,
+        .fsync          = gfs2_fsync,
+        /* control and locking */
+        .unlocked_ioctl = gfs2_ioctl,
+        .lock           = gfs2_lock,
+        .flock          = gfs2_flock,
+};
+/*
+ * File operations table offering readdir instead of a data path; shares
+ * the open/close, fsync, ioctl and locking handlers with gfs2_file_fops.
+ */
+const struct file_operations gfs2_dir_fops = {
+        /* open/close */
+        .open           = gfs2_open,
+        .release        = gfs2_close,
+        /* directory contents */
+        .readdir        = gfs2_readdir,
+        .fsync          = gfs2_fsync,
+        /* control and locking */
+        .unlocked_ioctl = gfs2_ioctl,
+        .lock           = gfs2_lock,
+        .flock          = gfs2_flock,
+};
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_FILE_DOT_H__
+#define __OPS_FILE_DOT_H__
+#include <linux/fs.h>
+/* Only a pointer to struct gfs2_inode is used below, so a forward
+   declaration is enough. */
+struct gfs2_inode;
+/* NOTE(review): defined outside this header; its purpose is not visible
+   here - confirm at the definition. */
+extern struct file gfs2_internal_file_sentinel;
+/* NOTE(review): presumably reads up to @size bytes at *@pos from @ip into
+   @buf - confirm against the definition. */
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+                              struct file_ra_state *ra_state,
+                              char *buf, loff_t *pos, unsigned size);
+/* The file_operations tables defined in ops_file.c. */
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..178b33911843
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "mount.h"
+#include "ops_export.h"
+#include "ops_fstype.h"
+#include "ops_super.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+/*
+ * Values for the "undo" argument of the init_*() helpers below: DO runs
+ * the set-up, UNDO jumps straight into the helper's unwind labels to tear
+ * down what an earlier successful DO call set up.
+ */
+#define DO 0
+#define UNDO 1
+/* Installed on the root dentry by init_sb(); defined in another file. */
+extern struct dentry_operations gfs2_dops;
+/**
+ * init_sbd - allocate and initialise the in-core GFS2 superblock
+ * @sb: the VFS superblock to attach it to
+ *
+ * The structure is zero-filled and cross-linked with @sb, then the locks,
+ * list heads and wait queue listed below are initialised.
+ *
+ * Returns: the new struct gfs2_sbd, or NULL if the allocation failed
+ */
+static struct gfs2_sbd *init_sbd(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = kzalloc(sizeof(*sdp), GFP_KERNEL);
+        if (sdp == NULL)
+                return NULL;
+        sdp->sd_vfs = sb;
+        sb->s_fs_info = sdp;
+        gfs2_tune_init(&sdp->sd_tune);
+        /* reclaim list */
+        spin_lock_init(&sdp->sd_reclaim_lock);
+        INIT_LIST_HEAD(&sdp->sd_reclaim_list);
+        init_waitqueue_head(&sdp->sd_reclaim_wq);
+        /* inode numbers and statfs */
+        mutex_init(&sdp->sd_inum_mutex);
+        mutex_init(&sdp->sd_statfs_mutex);
+        spin_lock_init(&sdp->sd_statfs_spin);
+        /* resource index */
+        mutex_init(&sdp->sd_rindex_mutex);
+        spin_lock_init(&sdp->sd_rindex_spin);
+        INIT_LIST_HEAD(&sdp->sd_rindex_list);
+        INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+        INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
+        /* journal index */
+        mutex_init(&sdp->sd_jindex_mutex);
+        spin_lock_init(&sdp->sd_jindex_spin);
+        INIT_LIST_HEAD(&sdp->sd_jindex_list);
+        /* quota */
+        mutex_init(&sdp->sd_quota_mutex);
+        spin_lock_init(&sdp->sd_quota_spin);
+        INIT_LIST_HEAD(&sdp->sd_quota_list);
+        /* log */
+        spin_lock_init(&sdp->sd_log_lock);
+        mutex_init(&sdp->sd_log_reserve_mutex);
+        init_rwsem(&sdp->sd_log_flush_lock);
+        INIT_LIST_HEAD(&sdp->sd_log_le_gl);
+        INIT_LIST_HEAD(&sdp->sd_log_le_buf);
+        INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
+        INIT_LIST_HEAD(&sdp->sd_log_le_rg);
+        INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
+        INIT_LIST_HEAD(&sdp->sd_ail1_list);
+        INIT_LIST_HEAD(&sdp->sd_ail2_list);
+        INIT_LIST_HEAD(&sdp->sd_log_flush_list);
+        INIT_LIST_HEAD(&sdp->sd_revoke_list);
+        /* freeze */
+        mutex_init(&sdp->sd_freeze_lock);
+        return sdp;
+}
+/**
+ * init_vfs - fill in the GFS2-specific fields of the VFS superblock
+ * @sb: the VFS superblock
+ * @noatime: bit number to set in sd_flags when the mount asked for no
+ *           atime updates
+ */
+static void init_vfs(struct super_block *sb, unsigned noatime)
+{
+        const unsigned long atime_off = MS_NOATIME | MS_NODIRATIME;
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_export_op = &gfs2_export_ops;
+        sb->s_op = &gfs2_super_ops;
+        sb->s_magic = GFS2_MAGIC;
+        /* Remember what the mount requested before the flags are forced. */
+        if (sb->s_flags & atime_off)
+                set_bit(noatime, &sdp->sd_flags);
+        /* The VFS must not update atimes; GFS2 does that itself. */
+        sb->s_flags |= atime_off;
+}
+/**
+ * init_names - work out the lock protocol and lock table names
+ * @sdp: the filesystem
+ * @silent: passed to gfs2_check_sb()
+ *
+ * Names given as mount arguments win.  Any name left empty is taken from
+ * the on-disk superblock; a table name that is still empty falls back to
+ * the VFS superblock's s_id.
+ *
+ * Returns: 0 on success, -ENOBUFS if the superblock could not be read, or
+ *          the error from gfs2_check_sb()
+ */
+static int init_names(struct gfs2_sbd *sdp, int silent)
+{
+        char *proto = sdp->sd_args.ar_lockproto;
+        char *table = sdp->sd_args.ar_locktable;
+        /*  Try to autodetect  */
+        if (!proto[0] || !table[0]) {
+                struct gfs2_sb *sb;
+                struct page *page;
+                int error;
+                page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+                if (!page)
+                        return -ENOBUFS;
+                sb = kmap(page);
+                gfs2_sb_in(&sdp->sd_sb, sb);
+                kunmap(page);
+                __free_page(page);
+                error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+                if (error)
+                        return error;
+                if (!proto[0])
+                        proto = sdp->sd_sb.sb_lockproto;
+                if (!table[0])
+                        table = sdp->sd_sb.sb_locktable;
+        }
+        if (!table[0])
+                table = sdp->sd_vfs->s_id;
+        snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
+        snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
+        return 0;
+}
+/**
+ * init_locking - start the glock daemons and take the mount-time glocks
+ * @sdp: the filesystem
+ * @mount_gh: holder for the exclusive mount glock; on success it is left
+ *            held for the caller
+ * @undo: DO to set up, UNDO to tear down what a successful DO call set up
+ *
+ * Returns: 0 on success (and always for UNDO), or an error code
+ */
+static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
+                        int undo)
+{
+        struct task_struct *p;
+        int error = 0;
+        if (undo)
+                goto fail_trans;
+        p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
+        if (IS_ERR(p)) {
+                /* IS_ERR() is only a truth value; PTR_ERR() is the errno. */
+                error = PTR_ERR(p);
+                fs_err(sdp, "can't start scand thread: %d\n", error);
+                return error;
+        }
+        sdp->sd_scand_process = p;
+        /* sd_glockd_num counts the threads started so far, which is exactly
+           what the unwind loop at "fail" needs. */
+        for (sdp->sd_glockd_num = 0;
+             sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
+             sdp->sd_glockd_num++) {
+                p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
+                if (IS_ERR(p)) {
+                        error = PTR_ERR(p);
+                        fs_err(sdp, "can't start glockd thread: %d\n", error);
+                        goto fail;
+                }
+                sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
+        }
+        error = gfs2_glock_nq_num(sdp,
+                                  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
+                                  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+                                  mount_gh);
+        if (error) {
+                fs_err(sdp, "can't acquire mount glock: %d\n", error);
+                goto fail;
+        }
+        error = gfs2_glock_nq_num(sdp,
+                                  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
+                                  LM_ST_SHARED,
+                                  LM_FLAG_NOEXP | GL_EXACT,
+                                  &sdp->sd_live_gh);
+        if (error) {
+                fs_err(sdp, "can't acquire live glock: %d\n", error);
+                goto fail_mount;
+        }
+        error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
+                               CREATE, &sdp->sd_rename_gl);
+        if (error) {
+                fs_err(sdp, "can't create rename glock: %d\n", error);
+                goto fail_live;
+        }
+        error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
+                               CREATE, &sdp->sd_trans_gl);
+        if (error) {
+                fs_err(sdp, "can't create transaction glock: %d\n", error);
+                goto fail_rename;
+        }
+        set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
+        return 0;
+        /* Unwind in reverse order of acquisition; UNDO enters at the top. */
+fail_trans:
+        gfs2_glock_put(sdp->sd_trans_gl);
+fail_rename:
+        gfs2_glock_put(sdp->sd_rename_gl);
+fail_live:
+        gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+fail_mount:
+        gfs2_glock_dq_uninit(mount_gh);
+fail:
+        while (sdp->sd_glockd_num--)
+                kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+        kthread_stop(sdp->sd_scand_process);
+        return error;
+}
+/**
+ * gfs2_lookup_root - look up a directory inode by inode number
+ * @sb: the VFS superblock
+ * @inum: the inode number to look up
+ *
+ * Returns: whatever gfs2_inode_lookup() returns for a DT_DIR lookup
+ */
+static struct inode *gfs2_lookup_root(struct super_block *sb,
+                                      struct gfs2_inum *inum)
+{
+        struct inode *dir = gfs2_inode_lookup(sb, inum, DT_DIR);
+        return dir;
+}
+/**
+ * init_sb - read the on-disk superblock and set up the root dentry
+ * @sdp: the filesystem
+ * @silent: passed to gfs2_read_sb()
+ * @undo: DO to set up, UNDO to drop the root dentry again
+ *
+ * The superblock glock is held shared while the superblock is read, the
+ * block size is validated and applied, and the root inode is looked up.
+ * For the gfs2meta filesystem type the master directory is used as root.
+ *
+ * Returns: 0 on success (and always for UNDO), or an error code
+ */
+static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+{
+        struct super_block *sb = sdp->sd_vfs;
+        struct gfs2_holder sb_gh;
+        struct gfs2_inum *inum;
+        struct inode *inode;
+        int error = 0;
+        if (undo) {
+                if (sb->s_root) {
+                        dput(sb->s_root);
+                        sb->s_root = NULL;
+                }
+                return 0;
+        }
+        error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+                                 LM_ST_SHARED, 0, &sb_gh);
+        if (error) {
+                fs_err(sdp, "can't acquire superblock glock: %d\n", error);
+                return error;
+        }
+        error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+        if (error) {
+                fs_err(sdp, "can't read superblock: %d\n", error);
+                goto out;
+        }
+        /* Set up the buffer cache and SB for real */
+        if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+                error = -EINVAL;
+                fs_err(sdp, "FS block size (%u) is too small for device "
+                       "block size (%u)\n",
+                       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+                goto out;
+        }
+        if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
+                error = -EINVAL;
+                fs_err(sdp, "FS block size (%u) is too big for machine "
+                       "page size (%u)\n",
+                       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
+                goto out;
+        }
+        sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
+        /* Get the root inode */
+        inum = &sdp->sd_sb.sb_root_dir;
+        if (sb->s_type == &gfs2meta_fs_type)
+                inum = &sdp->sd_sb.sb_master_dir;
+        inode = gfs2_lookup_root(sb, inum);
+        if (IS_ERR(inode)) {
+                error = PTR_ERR(inode);
+                fs_err(sdp, "can't read in root inode: %d\n", error);
+                goto out;
+        }
+        sb->s_root = d_alloc_root(inode);
+        if (!sb->s_root) {
+                fs_err(sdp, "can't get root dentry\n");
+                error = -ENOMEM;
+                iput(inode);
+                /* s_root is NULL here: it must not be dereferenced below. */
+                goto out;
+        }
+        sb->s_root->d_op = &gfs2_dops;
+out:
+        gfs2_glock_dq_uninit(&sb_gh);
+        return error;
+}
+/**
+ * init_journal - set up (or tear down) this node's journal
+ * @sdp: the filesystem
+ * @undo: DO to set up, UNDO to release what a successful DO call acquired
+ *
+ * Looks up the journal index, selects this node's journal (journal 0 for
+ * a spectator mount), takes the journal glocks for a non-spectator mount,
+ * recovers journals as required and starts the recoverd thread.
+ *
+ * Returns: 0 on success (and always for UNDO), or an error code
+ */
+static int init_journal(struct gfs2_sbd *sdp, int undo)
+{
+        struct gfs2_holder ji_gh;
+        struct task_struct *p;
+        struct gfs2_inode *ip;
+        /* Non-zero while ji_gh is held and must be dropped when unwinding. */
+        int jindex = 1;
+        int error = 0;
+        if (undo) {
+                jindex = 0;
+                goto fail_recoverd;
+        }
+        sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+        if (IS_ERR(sdp->sd_jindex)) {
+                /* Fetch the errno first so the message reports it, not 0. */
+                error = PTR_ERR(sdp->sd_jindex);
+                fs_err(sdp, "can't lookup journal index: %d\n", error);
+                return error;
+        }
+        ip = GFS2_I(sdp->sd_jindex);
+        set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+        /* Load in the journal index special file */
+        error = gfs2_jindex_hold(sdp, &ji_gh);
+        if (error) {
+                fs_err(sdp, "can't read journal index: %d\n", error);
+                goto fail;
+        }
+        /* Default error for the "no journal" / "bad journal id" exits. */
+        error = -EINVAL;
+        if (!gfs2_jindex_size(sdp)) {
+                fs_err(sdp, "no journals!\n");
+                goto fail_jindex;
+        }
+        if (sdp->sd_args.ar_spectator) {
+                sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
+                sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+        } else {
+                if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
+                        fs_err(sdp, "can't mount journal #%u\n",
+                               sdp->sd_lockstruct.ls_jid);
+                        fs_err(sdp, "there are only %u journals (0 - %u)\n",
+                               gfs2_jindex_size(sdp),
+                               gfs2_jindex_size(sdp) - 1);
+                        goto fail_jindex;
+                }
+                sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
+                error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
+                                          &gfs2_journal_glops,
+                                          LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+                                          &sdp->sd_journal_gh);
+                if (error) {
+                        fs_err(sdp, "can't acquire journal glock: %d\n", error);
+                        goto fail_jindex;
+                }
+                ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+                                           LM_FLAG_NOEXP | GL_EXACT,
+                                           &sdp->sd_jinode_gh);
+                if (error) {
+                        fs_err(sdp, "can't acquire journal inode glock: %d\n",
+                               error);
+                        goto fail_journal_gh;
+                }
+                error = gfs2_jdesc_check(sdp->sd_jdesc);
+                if (error) {
+                        fs_err(sdp, "my journal (%u) is bad: %d\n",
+                               sdp->sd_jdesc->jd_jid, error);
+                        goto fail_jinode_gh;
+                }
+                sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+        }
+        if (sdp->sd_lockstruct.ls_first) {
+                /* First mounter: recover every journal, then let the
+                   other nodes mount. */
+                unsigned int x;
+                for (x = 0; x < sdp->sd_journals; x++) {
+                        error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+                        if (error) {
+                                fs_err(sdp, "error recovering journal %u: %d\n",
+                                       x, error);
+                                goto fail_jinode_gh;
+                        }
+                }
+                gfs2_lm_others_may_mount(sdp);
+        } else if (!sdp->sd_args.ar_spectator) {
+                error = gfs2_recover_journal(sdp->sd_jdesc);
+                if (error) {
+                        fs_err(sdp, "error recovering my journal: %d\n", error);
+                        goto fail_jinode_gh;
+                }
+        }
+        set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
+        gfs2_glock_dq_uninit(&ji_gh);
+        jindex = 0;
+        p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
+        if (IS_ERR(p)) {
+                /* IS_ERR() is only a truth value; PTR_ERR() is the errno. */
+                error = PTR_ERR(p);
+                fs_err(sdp, "can't start recoverd thread: %d\n", error);
+                goto fail_jinode_gh;
+        }
+        sdp->sd_recoverd_process = p;
+        return 0;
+        /* Unwind in reverse order of acquisition; UNDO enters at the top.
+           The journal glocks are only held for non-spectator mounts. */
+fail_recoverd:
+        kthread_stop(sdp->sd_recoverd_process);
+fail_jinode_gh:
+        if (!sdp->sd_args.ar_spectator)
+                gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+fail_journal_gh:
+        if (!sdp->sd_args.ar_spectator)
+                gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+fail_jindex:
+        gfs2_jindex_free(sdp);
+        if (jindex)
+                gfs2_glock_dq_uninit(&ji_gh);
+fail:
+        iput(sdp->sd_jindex);
+        return error;
+}
+/**
+ * init_inodes - look up the system inodes kept in the master directory
+ * @sdp: the filesystem
+ * @undo: DO to set up, UNDO to release what a successful DO call acquired
+ *
+ * In order: the master directory itself, the journal (via init_journal()),
+ * then the "inum", "statfs", "rindex" and "quota" inodes.
+ *
+ * Returns: 0 on success (and always for UNDO), or an error code
+ */
+static int init_inodes(struct gfs2_sbd *sdp, int undo)
+{
+        struct gfs2_glock *rindex_gl;
+        struct inode *master;
+        int error = 0;
+        if (undo)
+                goto fail_qinode;
+        master = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
+        if (IS_ERR(master)) {
+                error = PTR_ERR(master);
+                fs_err(sdp, "can't read in master directory: %d\n", error);
+                goto fail;
+        }
+        sdp->sd_master_dir = master;
+        error = init_journal(sdp, DO);
+        if (error)
+                goto fail_master;
+        /* Master inode number inode */
+        sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+        if (IS_ERR(sdp->sd_inum_inode)) {
+                error = PTR_ERR(sdp->sd_inum_inode);
+                fs_err(sdp, "can't read in inum inode: %d\n", error);
+                goto fail_journal;
+        }
+        /* Master statfs inode */
+        sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+        if (IS_ERR(sdp->sd_statfs_inode)) {
+                error = PTR_ERR(sdp->sd_statfs_inode);
+                fs_err(sdp, "can't read in statfs inode: %d\n", error);
+                goto fail_inum;
+        }
+        /* Resource index inode */
+        sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+        if (IS_ERR(sdp->sd_rindex)) {
+                error = PTR_ERR(sdp->sd_rindex);
+                fs_err(sdp, "can't get resource index inode: %d\n", error);
+                goto fail_statfs;
+        }
+        rindex_gl = GFS2_I(sdp->sd_rindex)->i_gl;
+        set_bit(GLF_STICKY, &rindex_gl->gl_flags);
+        /* Start one behind the glock's version number. */
+        sdp->sd_rindex_vn = rindex_gl->gl_vn - 1;
+        /* Quota inode */
+        sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+        if (IS_ERR(sdp->sd_quota_inode)) {
+                error = PTR_ERR(sdp->sd_quota_inode);
+                fs_err(sdp, "can't get quota file inode: %d\n", error);
+                goto fail_rindex;
+        }
+        return 0;
+        /* Unwind in reverse order of acquisition; UNDO enters at the top. */
+fail_qinode:
+        iput(sdp->sd_quota_inode);
+fail_rindex:
+        gfs2_clear_rgrpd(sdp);
+        iput(sdp->sd_rindex);
+fail_statfs:
+        iput(sdp->sd_statfs_inode);
+fail_inum:
+        iput(sdp->sd_inum_inode);
+fail_journal:
+        init_journal(sdp, UNDO);
+fail_master:
+        iput(sdp->sd_master_dir);
+fail:
+        return error;
+}
+/**
+ * init_per_node - look up and lock this node's private system files
+ * @sdp: the filesystem
+ * @undo: DO to set up, UNDO to release what a successful DO call acquired
+ *
+ * The files live in the "per_node" directory and are named after this
+ * node's journal id: inum_range<jid>, statfs_change<jid> and
+ * quota_change<jid>.  Each one is then locked exclusively.  Nothing is
+ * done for a spectator mount.
+ *
+ * Returns: 0 on success (and always for UNDO or spectator), or an error code
+ */
+static int init_per_node(struct gfs2_sbd *sdp, int undo)
+{
+        struct inode *pn = NULL;
+        unsigned int jid;
+        char buf[30];
+        int error = 0;
+        if (sdp->sd_args.ar_spectator)
+                return 0;
+        if (undo)
+                goto fail_qc_gh;
+        pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+        if (IS_ERR(pn)) {
+                error = PTR_ERR(pn);
+                fs_err(sdp, "can't find per_node directory: %d\n", error);
+                return error;
+        }
+        jid = sdp->sd_jdesc->jd_jid;
+        snprintf(buf, sizeof(buf), "inum_range%u", jid);
+        sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
+        if (IS_ERR(sdp->sd_ir_inode)) {
+                error = PTR_ERR(sdp->sd_ir_inode);
+                fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
+                goto fail;
+        }
+        snprintf(buf, sizeof(buf), "statfs_change%u", jid);
+        sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
+        if (IS_ERR(sdp->sd_sc_inode)) {
+                error = PTR_ERR(sdp->sd_sc_inode);
+                fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
+                goto fail_ir_i;
+        }
+        snprintf(buf, sizeof(buf), "quota_change%u", jid);
+        sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+        if (IS_ERR(sdp->sd_qc_inode)) {
+                error = PTR_ERR(sdp->sd_qc_inode);
+                fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
+                goto fail_sc_i;
+        }
+        /* The directory itself is no longer needed once the lookups are done. */
+        iput(pn);
+        pn = NULL;
+        error = gfs2_glock_nq_init(GFS2_I(sdp->sd_ir_inode)->i_gl,
+                                   LM_ST_EXCLUSIVE, 0,
+                                   &sdp->sd_ir_gh);
+        if (error) {
+                fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
+                goto fail_qc_i;
+        }
+        error = gfs2_glock_nq_init(GFS2_I(sdp->sd_sc_inode)->i_gl,
+                                   LM_ST_EXCLUSIVE, 0,
+                                   &sdp->sd_sc_gh);
+        if (error) {
+                fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
+                goto fail_ir_gh;
+        }
+        error = gfs2_glock_nq_init(GFS2_I(sdp->sd_qc_inode)->i_gl,
+                                   LM_ST_EXCLUSIVE, 0,
+                                   &sdp->sd_qc_gh);
+        if (error) {
+                fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
+                goto fail_sc_gh;
+        }
+        return 0;
+        /* Unwind in reverse order of acquisition; UNDO enters at the top. */
+fail_qc_gh:
+        gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+fail_sc_gh:
+        gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+fail_ir_gh:
+        gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+fail_qc_i:
+        iput(sdp->sd_qc_inode);
+fail_sc_i:
+        iput(sdp->sd_sc_inode);
+fail_ir_i:
+        iput(sdp->sd_ir_inode);
+fail:
+        if (pn)
+                iput(pn);
+        return error;
+}
+/**
+ * init_threads - start (or stop) the logd and quotad kernel threads
+ * @sdp: the filesystem
+ * @undo: DO to start the threads, UNDO to stop both of them
+ *
+ * The timestamps the threads are paired with here are reset to the
+ * current jiffies value before each thread is started.
+ *
+ * Returns: 0 on success (and always for UNDO), or the errno from
+ *          kthread_run()
+ */
+static int init_threads(struct gfs2_sbd *sdp, int undo)
+{
+        struct task_struct *p;
+        int error = 0;
+        if (undo)
+                goto fail_quotad;
+        sdp->sd_log_flush_time = jiffies;
+        sdp->sd_jindex_refresh_time = jiffies;
+        p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+        if (IS_ERR(p)) {
+                /* IS_ERR() is only a truth value; PTR_ERR() is the errno. */
+                error = PTR_ERR(p);
+                fs_err(sdp, "can't start logd thread: %d\n", error);
+                return error;
+        }
+        sdp->sd_logd_process = p;
+        sdp->sd_statfs_sync_time = jiffies;
+        sdp->sd_quota_sync_time = jiffies;
+        p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+        if (IS_ERR(p)) {
+                error = PTR_ERR(p);
+                fs_err(sdp, "can't start quotad thread: %d\n", error);
+                goto fail;
+        }
+        sdp->sd_quotad_process = p;
+        return 0;
+        /* Stop in reverse order of creation; UNDO enters at the top. */
+fail_quotad:
+        kthread_stop(sdp->sd_quotad_process);
+fail:
+        kthread_stop(sdp->sd_logd_process);
+        return error;
+}
+/**
+ * fill_super - Read in superblock
+ * @sb: The VFS superblock
+ * @data: Mount options
+ * @silent: Don't complain if it's not a GFS2 filesystem
+ *
+ * Runs the mount-time set-up steps in order.  Each init_*() helper is
+ * called with DO here; if a later step fails, the helpers for the steps
+ * already completed are called again with UNDO, in reverse order.
+ *
+ * Returns: errno
+ */
+static int fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct gfs2_sbd *sdp;
+        struct gfs2_holder mount_gh;
+        int error;
+        sdp = init_sbd(sb);
+        if (!sdp) {
+                printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
+                return -ENOMEM;
+        }
+        error = gfs2_mount_args(sdp, (char *)data, 0);
+        if (error) {
+                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+                goto fail;
+        }
+        init_vfs(sb, SDF_NOATIME);
+        /* Set up the buffer cache and fill in some fake block size values
+           to allow us to read-in the on-disk superblock. */
+        sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+        sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
+        sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+                               GFS2_BASIC_BLOCK_SHIFT;
+        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+        /* Lock protocol/table names are needed before the lock module
+           is mounted. */
+        error = init_names(sdp, silent);
+        if (error)
+                goto fail;
+        error = gfs2_sys_fs_add(sdp);
+        if (error)
+                goto fail;
+        error = gfs2_lm_mount(sdp, silent);
+        if (error)
+                goto fail_sys;
+        /* On success this leaves mount_gh held until the end of the mount. */
+        error = init_locking(sdp, &mount_gh, DO);
+        if (error)
+                goto fail_lm;
+        error = init_sb(sdp, silent, DO);
+        if (error)
+                goto fail_locking;
+        error = init_inodes(sdp, DO);
+        if (error)
+                goto fail_sb;
+        error = init_per_node(sdp, DO);
+        if (error)
+                goto fail_inodes;
+        error = gfs2_statfs_init(sdp);
+        if (error) {
+                fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
+                goto fail_per_node;
+        }
+        error = init_threads(sdp, DO);
+        if (error)
+                goto fail_per_node;
+        /* Read-only mounts skip the switch to read-write. */
+        if (!(sb->s_flags & MS_RDONLY)) {
+                error = gfs2_make_fs_rw(sdp);
+                if (error) {
+                        fs_err(sdp, "can't make FS RW: %d\n", error);
+                        goto fail_threads;
+                }
+        }
+        /* Mount complete: release the mount glock taken by init_locking(). */
+        gfs2_glock_dq_uninit(&mount_gh);
+        return 0;
+        /* Unwind ladder: each label undoes one completed step and falls
+           through to the labels for the steps before it. */
+fail_threads:
+        init_threads(sdp, UNDO);
+fail_per_node:
+        init_per_node(sdp, UNDO);
+fail_inodes:
+        init_inodes(sdp, UNDO);
+fail_sb:
+        init_sb(sdp, 0, UNDO);
+fail_locking:
+        init_locking(sdp, &mount_gh, UNDO);
+fail_lm:
+        gfs2_gl_hash_clear(sdp, WAIT);
+        gfs2_lm_unmount(sdp);
+        /* Repeat until invalidate_inodes() returns zero. */
+        while (invalidate_inodes(sb))
+                yield();
+fail_sys:
+        gfs2_sys_fs_del(sdp);
+fail:
+        kfree(sdp);
+        sb->s_fs_info = NULL;
+        return error;
+}
+/*
+ * gfs2_get_sb - get_sb method for the "gfs2" filesystem type
+ *
+ * Delegates to get_sb_bdev() with fill_super() as the callback and, when
+ * that succeeds, records the vfsmount in the superblock's gfs2_sbd.
+ *
+ * Returns: the result of get_sb_bdev()
+ */
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+                const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        int ret;
+        ret = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+        if (!ret) {
+                struct gfs2_sbd *sbd = mnt->mnt_sb->s_fs_info;
+                /* Remember the mount so it can be looked up from sdp later */
+                sbd->sd_gfs2mnt = mnt;
+        }
+        return ret;
+}
+/*
+ * fill_super_meta - set up the superblock of a "gfs2meta" mount
+ * @sb: superblock of the existing gfs2 mount
+ * @new: the new superblock being filled in for the meta mount
+ * @data: mount options (unused here)
+ * @silent: unused here
+ *
+ * Shares the gfs2_sbd of @sb with @new and roots @new at the master
+ * directory inode.
+ *
+ * Returns: 0 on success, -ENOMEM if the root dentry cannot be allocated
+ */
+static int fill_super_meta(struct super_block *sb, struct super_block *new,
+                           void *data, int silent)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct inode *inode;
+        new->s_fs_info = sdp;
+        sdp->sd_vfs_meta = sb;
+        init_vfs(new, SDF_NOATIME);
+        /* Get the master inode */
+        inode = igrab(sdp->sd_master_dir);
+        new->s_root = d_alloc_root(inode);
+        if (!new->s_root) {
+                fs_err(sdp, "can't get root dentry\n");
+                /* Drop the reference taken by igrab() above */
+                iput(inode);
+                /* Bail out now: s_root is NULL and must not be touched */
+                return -ENOMEM;
+        }
+        new->s_root->d_op = &gfs2_dops;
+        return 0;
+}
+/*
+ * set_bdev_super - "set" callback handed to sget()
+ *
+ * Binds superblock @s to the block device passed in @data and copies the
+ * device number over.  Always returns 0.
+ */
+static int set_bdev_super(struct super_block *s, void *data)
+{
+        struct block_device *bdev = data;
+        s->s_bdev = bdev;
+        s->s_dev = bdev->bd_dev;
+        return 0;
+}
+/*
+ * test_bdev_super - "test" callback handed to sget()
+ *
+ * Returns 1 when superblock @s sits on the block device passed in @data,
+ * 0 otherwise.
+ */
+static int test_bdev_super(struct super_block *s, void *data)
+{
+        if (s->s_bdev != data)
+                return 0;
+        return 1;
+}
+/*
+ * get_gfs2_sb - find the superblock of a mounted gfs2 filesystem
+ * @dev_name: path of either the block device or the mount point
+ *
+ * Resolves @dev_name and walks the superblocks registered for the "gfs2"
+ * filesystem type.  A block-device path matches on device number, a
+ * directory path matches on the superblock of the directory's inode.
+ *
+ * NOTE(review): the filesystem type obtained from get_fs_type() is not
+ * released in this function -- confirm where that reference is dropped.
+ *
+ * Returns: the matching superblock, or NULL if none was found
+ */
+static struct super_block* get_gfs2_sb(const char *dev_name)
+{
+        struct kstat stat;
+        struct nameidata nd;
+        struct file_system_type *fstype;
+        struct super_block *sb = NULL, *s;
+        struct list_head *l;
+        int error;
+        error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+        if (error) {
+                printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
+                       dev_name);
+                goto out;
+        }
+        error = vfs_getattr(nd.mnt, nd.dentry, &stat);
+        if (error) {
+                /* stat was not filled in; do not match against garbage */
+                printk(KERN_WARNING "GFS2: vfs_getattr on %s returned error\n",
+                       dev_name);
+                goto free_nd;
+        }
+        fstype = get_fs_type("gfs2");
+        if (!fstype)
+                goto not_found;
+        list_for_each(l, &fstype->fs_supers) {
+                s = list_entry(l, struct super_block, s_instances);
+                if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
+                    (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
+                        sb = s;
+                        goto free_nd;
+                }
+        }
+not_found:
+        printk(KERN_WARNING "GFS2: Unrecognized block device or "
+               "mount point %s\n", dev_name);
+free_nd:
+        path_release(&nd);
+out:
+        return sb;
+}
+/*
+ * gfs2_get_sb_meta - get_sb method for the "gfs2meta" filesystem type
+ * @fs_type: the gfs2meta filesystem type
+ * @flags: mount flags
+ * @dev_name: block device or mount point of an existing gfs2 mount
+ * @data: mount options, passed through to fill_super_meta()
+ * @mnt: the vfsmount being set up
+ *
+ * Looks up the already-mounted gfs2 superblock named by @dev_name and
+ * creates a second superblock on the same block device that exposes the
+ * master directory.  Only one meta mount per gfs2 mount is allowed.
+ *
+ * Returns: 0 or the result of simple_set_mnt() on success, -ENOENT if no
+ * gfs2 mount matches, -EBUSY if a meta mount already exists, or the
+ * error from sget() / fill_super_meta()
+ */
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+                            const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        int error = 0;
+        struct super_block *sb = NULL, *new;
+        struct gfs2_sbd *sdp;
+        sb = get_gfs2_sb(dev_name);
+        if (!sb) {
+                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
+                error = -ENOENT;
+                goto error;
+        }
+        sdp = sb->s_fs_info;
+        if (sdp->sd_vfs_meta) {
+                printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
+                error = -EBUSY;
+                goto error;
+        }
+        /* sget() is called with the block device's mount mutex held */
+        mutex_lock(&sb->s_bdev->bd_mount_mutex);
+        new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
+        mutex_unlock(&sb->s_bdev->bd_mount_mutex);
+        if (IS_ERR(new)) {
+                error = PTR_ERR(new);
+                goto error;
+        }
+        module_put(fs_type->owner);
+        new->s_flags = flags;
+        strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
+        sb_set_blocksize(new, sb->s_blocksize);
+        error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
+        if (error) {
+                /*
+                 * NOTE(review): if deactivate_super() ends up calling
+                 * gfs2_kill_sb_meta(), the mnt_count decrement there runs
+                 * without the matching atomic_inc() below -- confirm.
+                 */
+                up_write(&new->s_umount);
+                deactivate_super(new);
+                goto error;
+        }
+        new->s_flags |= MS_ACTIVE;
+        /* Grab a reference to the gfs2 mount point */
+        atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
+        return simple_set_mnt(mnt, new);
+error:
+        return error;
+}
+/*
+ * gfs2_kill_sb - kill_sb method for the "gfs2" filesystem type
+ * @sb: the superblock being torn down
+ *
+ * Simply hands the superblock to kill_block_super().
+ */
+static void gfs2_kill_sb(struct super_block *sb)
+{
+        kill_block_super(sb);
+}
+/*
+ * gfs2_kill_sb_meta - kill_sb method for the "gfs2meta" filesystem type
+ * @sb: the meta superblock being torn down
+ *
+ * Shuts the superblock down, marks the gfs2_sbd as no longer having a
+ * meta mount and drops the mnt_count reference on the gfs2 mount point
+ * (the counterpart of the atomic_inc() in gfs2_get_sb_meta()).
+ */
+static void gfs2_kill_sb_meta(struct super_block *sb)
+{
+        /* Fetch sdp first; it is still needed after the shutdown call */
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        generic_shutdown_super(sb);
+        sdp->sd_vfs_meta = NULL;
+        atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+}
+/* Filesystem type for ordinary "gfs2" mounts; requires a block device. */
+struct file_system_type gfs2_fs_type = {
+        .owner = THIS_MODULE,
+        .name = "gfs2",
+        .fs_flags = FS_REQUIRES_DEV,
+        .kill_sb = gfs2_kill_sb,
+        .get_sb = gfs2_get_sb,
+};
+/* Filesystem type for "gfs2meta" mounts layered on an existing gfs2 mount. */
+struct file_system_type gfs2meta_fs_type = {
+        .owner = THIS_MODULE,
+        .name = "gfs2meta",
+        .fs_flags = FS_REQUIRES_DEV,
+        .kill_sb = gfs2_kill_sb_meta,
+        .get_sb = gfs2_get_sb_meta,
+};
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_FSTYPE_DOT_H__
+#define __OPS_FSTYPE_DOT_H__
+#include <linux/fs.h>
+/* Filesystem type for regular "gfs2" mounts (defined in ops_fstype.c) */
+extern struct file_system_type gfs2_fs_type;
+/* Filesystem type for "gfs2meta" mounts (defined in ops_fstype.c) */
+extern struct file_system_type gfs2meta_fs_type;
+#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "ops_dentry.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ * @nd: Lookup intent from the VFS; may be NULL, in which case the
+ *      create is treated as non-exclusive
+ *
+ * Tries to create the inode.  If the name already exists and the caller
+ * did not ask for O_EXCL, falls back to looking the existing inode up;
+ * if that lookup finds nothing (the entry vanished in between) the
+ * create is retried.
+ *
+ * Returns: errno
+ */
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+                       int mode, struct nameidata *nd)
+{
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_holder ghs[2];
+        struct inode *inode;
+        gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+        for (;;) {
+                inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
+                if (!IS_ERR(inode)) {
+                        /* Created: end the transaction and release what
+                           gfs2_createi() left held */
+                        gfs2_trans_end(sdp);
+                        if (dip->i_alloc.al_rgd)
+                                gfs2_inplace_release(dip);
+                        gfs2_quota_unlock(dip);
+                        gfs2_alloc_put(dip);
+                        gfs2_glock_dq_uninit_m(2, ghs);
+                        mark_inode_dirty(inode);
+                        break;
+                } else if (PTR_ERR(inode) != -EEXIST ||
+                           (nd && (nd->intent.open.flags & O_EXCL))) {
+                        /* Real failure, or exclusive create of an
+                           existing name.  nd is checked first so a NULL
+                           nameidata is never dereferenced. */
+                        gfs2_holder_uninit(ghs);
+                        return PTR_ERR(inode);
+                }
+                /* -EEXIST without O_EXCL: use the existing inode */
+                inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+                if (inode) {
+                        if (!IS_ERR(inode)) {
+                                gfs2_holder_uninit(ghs);
+                                break;
+                        } else {
+                                gfs2_holder_uninit(ghs);
+                                return PTR_ERR(inode);
+                        }
+                }
+                /* NULL: the name disappeared again, retry the create */
+        }
+        d_instantiate(dentry, inode);
+        return 0;
+}
+/**
+ * gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Called by the VFS layer.  Installs the GFS2 dentry operations on
+ * @dentry and calls gfs2_lookupi().
+ *
+ * Returns: an ERR_PTR if the lookup failed, the result of
+ * d_splice_alias() if an inode was found, or NULL after adding a
+ * negative dentry when the name does not exist
+ */
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+                                  struct nameidata *nd)
+{
+        struct inode *found;
+        dentry->d_op = &gfs2_dops;
+        found = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+        if (!found) {
+                /* No such entry: hash a negative dentry */
+                d_add(dentry, NULL);
+                return NULL;
+        }
+        if (IS_ERR(found))
+                return ERR_PTR(PTR_ERR(found));
+        return d_splice_alias(found, dentry);
+}
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Both the directory and the inode glocks are taken exclusively.  When
+ * adding the entry requires new blocks, quota and a block reservation
+ * are acquired before the transaction is started; the exit labels
+ * release them again in reverse order.
+ *
+ * Returns: errno
+ */
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+                     struct dentry *dentry)
+{
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct inode *inode = old_dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder ghs[2];
+        int alloc_required;
+        int error;
+        /* Hard links to directories are refused */
+        if (S_ISDIR(ip->i_di.di_mode))
+                return -EPERM;
+        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+        error = gfs2_glock_nq_m(2, ghs);
+        if (error)
+                goto out;
+        error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+        if (error)
+                goto out_gunlock;
+        /* The new name must not exist yet: only -ENOENT lets us go on */
+        error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
+        switch (error) {
+        case -ENOENT:
+                break;
+        case 0:
+                error = -EEXIST;
+                /* fall through */
+        default:
+                goto out_gunlock;
+        }
+        /* Sanity checks on the directory ... */
+        error = -EINVAL;
+        if (!dip->i_di.di_nlink)
+                goto out_gunlock;
+        error = -EFBIG;
+        if (dip->i_di.di_entries == (u32)-1)
+                goto out_gunlock;
+        /* ... and on the inode being linked */
+        error = -EPERM;
+        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+                goto out_gunlock;
+        error = -EINVAL;
+        if (!ip->i_di.di_nlink)
+                goto out_gunlock;
+        error = -EMLINK;
+        if (ip->i_di.di_nlink == (u32)-1)
+                goto out_gunlock;
+        /* Negative result is an error, otherwise a "needs allocation" flag */
+        alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+        if (error < 0)
+                goto out_gunlock;
+        error = 0;
+        if (alloc_required) {
+                struct gfs2_alloc *al = gfs2_alloc_get(dip);
+                error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                if (error)
+                        goto out_alloc;
+                error = gfs2_quota_check(dip, dip->i_di.di_uid,
+                                         dip->i_di.di_gid);
+                if (error)
+                        goto out_gunlock_q;
+                al->al_requested = sdp->sd_max_dirres;
+                error = gfs2_inplace_reserve(dip);
+                if (error)
+                        goto out_gunlock_q;
+                error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+                                         al->al_rgd->rd_ri.ri_length +
+                                         2 * RES_DINODE + RES_STATFS +
+                                         RES_QUOTA, 0);
+                if (error)
+                        goto out_ipres;
+        } else {
+                error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+                /* alloc_required is 0 here, so the labels below only
+                   drop the glocks */
+                if (error)
+                        goto out_ipres;
+        }
+        error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
+                             IF2DT(ip->i_di.di_mode));
+        if (error)
+                goto out_end_trans;
+        error = gfs2_change_nlink(ip, +1);
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_ipres:
+        if (alloc_required)
+                gfs2_inplace_release(dip);
+out_gunlock_q:
+        if (alloc_required)
+                gfs2_quota_unlock(dip);
+out_alloc:
+        if (alloc_required)
+                gfs2_alloc_put(dip);
+out_gunlock:
+        gfs2_glock_dq_m(2, ghs);
+out:
+        gfs2_holder_uninit(ghs);
+        gfs2_holder_uninit(ghs + 1);
+        if (!error) {
+                /* The new dentry holds its own reference on the inode */
+                atomic_inc(&inode->i_count);
+                d_instantiate(dentry, inode);
+                mark_inode_dirty(inode);
+        }
+        return error;
+}
+/**
+ * gfs2_unlink - Unlink a file
+ * @dir: The inode of the directory containing the file to unlink
+ * @dentry: The file itself
+ *
+ * Takes the directory and inode glocks exclusively, checks that the
+ * unlink is allowed and then, inside one transaction, removes the
+ * directory entry and decrements the inode's link count.
+ *
+ * Returns: errno
+ */
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+        struct gfs2_holder ghs[2];
+        int error;
+        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+        error = gfs2_glock_nq_m(2, ghs);
+        if (error)
+                goto out;
+        error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+        if (error)
+                goto out_gunlock;
+        error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
+        if (error)
+                goto out_gunlock;
+        error = gfs2_dir_del(dip, &dentry->d_name);
+        if (error)
+                goto out_end_trans;
+        error = gfs2_change_nlink(ip, -1);
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_gunlock:
+        gfs2_glock_dq_m(2, ghs);
+out:
+        /* Holders were initialised unconditionally, so always uninit both */
+        gfs2_holder_uninit(ghs);
+        gfs2_holder_uninit(ghs + 1);
+        return error;
+}
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * Creates the inode via gfs2_createi() and copies the target string
+ * into the dinode block directly after the on-disk dinode header.
+ *
+ * Returns: errno
+ */
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+                        const char *symname)
+{
+        struct gfs2_inode *dip = GFS2_I(dir), *ip;
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_holder ghs[2];
+        struct inode *inode;
+        struct buffer_head *dibh;
+        int size;
+        int error;
+        /* Must be stuffed with a null terminator for gfs2_follow_link() */
+        size = strlen(symname);
+        if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+                return -ENAMETOOLONG;
+        gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+        inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
+        if (IS_ERR(inode)) {
+                gfs2_holder_uninit(ghs);
+                return PTR_ERR(inode);
+        }
+        /* The second holder's glock refers to the newly created inode */
+        ip = ghs[1].gh_gl->gl_object;
+        ip->i_di.di_size = size;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!gfs2_assert_withdraw(sdp, !error)) {
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
+                       size);
+                brelse(dibh);
+        }
+        /* Same release sequence as the other gfs2_createi() callers */
+        gfs2_trans_end(sdp);
+        if (dip->i_alloc.al_rgd)
+                gfs2_inplace_release(dip);
+        gfs2_quota_unlock(dip);
+        gfs2_alloc_put(dip);
+        gfs2_glock_dq_uninit_m(2, ghs);
+        d_instantiate(dentry, inode);
+        mark_inode_dirty(inode);
+        return 0;
+}
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory
+ *
+ * Creates the inode via gfs2_createi(), writes the initial "." and ".."
+ * entries into the dinode block right after the dinode header and bumps
+ * the parent's link count.
+ *
+ * Returns: errno
+ */
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+        struct gfs2_inode *dip = GFS2_I(dir), *ip;
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_holder ghs[2];
+        struct inode *inode;
+        struct buffer_head *dibh;
+        int error;
+        gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+        inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
+        if (IS_ERR(inode)) {
+                gfs2_holder_uninit(ghs);
+                return PTR_ERR(inode);
+        }
+        /* The second holder's glock refers to the newly created inode */
+        ip = ghs[1].gh_gl->gl_object;
+        ip->i_di.di_nlink = 2;
+        ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+        ip->i_di.di_flags |= GFS2_DIF_JDATA;
+        ip->i_di.di_payload_format = GFS2_FORMAT_DE;
+        ip->i_di.di_entries = 2;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!gfs2_assert_withdraw(sdp, !error)) {
+                struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+                /* First dirent starts immediately after the dinode */
+                struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+                struct qstr str;
+                gfs2_str2qstr(&str, ".");
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+                dent->de_inum = di->di_num; /* already GFS2 endian */
+                dent->de_type = cpu_to_be16(DT_DIR);
+                di->di_entries = cpu_to_be32(1);
+                gfs2_str2qstr(&str, "..");
+                /* ".." follows the one-character "." entry and is given
+                   the rest of the block as its record length */
+                dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+                gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+                gfs2_inum_out(&dip->i_num, &dent->de_inum);
+                dent->de_type = cpu_to_be16(DT_DIR);
+                gfs2_dinode_out(&ip->i_di, di);
+                brelse(dibh);
+        }
+        error = gfs2_change_nlink(dip, +1);
+        gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
+        gfs2_trans_end(sdp);
+        if (dip->i_alloc.al_rgd)
+                gfs2_inplace_release(dip);
+        gfs2_quota_unlock(dip);
+        gfs2_alloc_put(dip);
+        gfs2_glock_dq_uninit_m(2, ghs);
+        d_instantiate(dentry, inode);
+        mark_inode_dirty(inode);
+        return 0;
+}
+/**
+ * gfs2_rmdir - Remove a directory
+ * @dir: The parent directory of the directory to be removed
+ * @dentry: The dentry of the directory to remove
+ *
+ * Remove a directory. Call gfs2_rmdiri()
+ *
+ * The directory must contain exactly two entries: fewer is reported as
+ * an inconsistency (-EIO), more as -ENOTEMPTY.
+ *
+ * Returns: errno
+ */
+static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
+{
+        struct gfs2_inode *dip = GFS2_I(dir);
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+        struct gfs2_holder ghs[2];
+        int error;
+        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+        error = gfs2_glock_nq_m(2, ghs);
+        if (error)
+                goto out;
+        error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+        if (error)
+                goto out_gunlock;
+        /* gfs2_mkdir() creates directories with two entries, so fewer
+           than that is flagged as an inconsistency */
+        if (ip->i_di.di_entries < 2) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(&ip->i_di);
+                error = -EIO;
+                goto out_gunlock;
+        }
+        if (ip->i_di.di_entries > 2) {
+                error = -ENOTEMPTY;
+                goto out_gunlock;
+        }
+        error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
+        if (error)
+                goto out_gunlock;
+        error = gfs2_rmdiri(dip, &dentry->d_name, ip);
+        gfs2_trans_end(sdp);
+out_gunlock:
+        gfs2_glock_dq_m(2, ghs);
+out:
+        gfs2_holder_uninit(ghs);
+        gfs2_holder_uninit(ghs + 1);
+        return error;
+}
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @dev: The device specification of the special file
+ *
+ * Only block devices, character devices, FIFOs and sockets are
+ * supported; the major/minor numbers are stored for the two device
+ * types and left at zero otherwise.
+ *
+ * Returns: errno (-EOPNOTSUPP for any other file type)
+ */
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+                      dev_t dev)
+{
+        struct gfs2_inode *dip = GFS2_I(dir), *ip;
+        struct gfs2_sbd *sdp = GFS2_SB(dir);
+        struct gfs2_holder ghs[2];
+        struct inode *inode;
+        struct buffer_head *dibh;
+        u32 major = 0, minor = 0;
+        int error;
+        switch (mode & S_IFMT) {
+        case S_IFBLK:
+        case S_IFCHR:
+                major = MAJOR(dev);
+                minor = MINOR(dev);
+                break;
+        case S_IFIFO:
+        case S_IFSOCK:
+                break;
+        default:
+                return -EOPNOTSUPP;
+        };
+        gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+        inode = gfs2_createi(ghs, &dentry->d_name, mode);
+        if (IS_ERR(inode)) {
+                gfs2_holder_uninit(ghs);
+                return PTR_ERR(inode);
+        }
+        /* The second holder's glock refers to the newly created inode */
+        ip = ghs[1].gh_gl->gl_object;
+        ip->i_di.di_major = major;
+        ip->i_di.di_minor = minor;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (!gfs2_assert_withdraw(sdp, !error)) {
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        /* Same release sequence as the other gfs2_createi() callers */
+        gfs2_trans_end(sdp);
+        if (dip->i_alloc.al_rgd)
+                gfs2_inplace_release(dip);
+        gfs2_quota_unlock(dip);
+        gfs2_alloc_put(dip);
+        gfs2_glock_dq_uninit_m(2, ghs);
+        d_instantiate(dentry, inode);
+        mark_inode_dirty(inode);
+        return 0;
+}
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Between two and four glocks are taken exclusively (old directory, new
+ * directory if different, the inode being moved, and the target inode if
+ * one exists).  Moving a directory to a different parent additionally
+ * takes the filesystem-wide rename glock.  The exit labels release
+ * everything in reverse order of acquisition.
+ *
+ * Returns: errno
+ */
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+                       struct inode *ndir, struct dentry *ndentry)
+{
+        struct gfs2_inode *odip = GFS2_I(odir);
+        struct gfs2_inode *ndip = GFS2_I(ndir);
+        struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+        struct gfs2_inode *nip = NULL;
+        struct gfs2_sbd *sdp = GFS2_SB(odir);
+        struct gfs2_holder ghs[4], r_gh;
+        unsigned int num_gh;
+        int dir_rename = 0;
+        int alloc_required;
+        unsigned int x;
+        int error;
+        if (ndentry->d_inode) {
+                nip = GFS2_I(ndentry->d_inode);
+                /* Source and target are the same inode: nothing to do */
+                if (ip == nip)
+                        return 0;
+        }
+        /* Make sure we aren't trying to move a directory into its subdir */
+        if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
+                dir_rename = 1;
+                error = gfs2_glock_nq_init(sdp->sd_rename_gl,
+                                           LM_ST_EXCLUSIVE, 0,
+                                           &r_gh);
+                if (error)
+                        goto out;
+                error = gfs2_ok_to_move(ip, ndip);
+                if (error)
+                        goto out_gunlock_r;
+        }
+        /* Build the holder array; num_gh counts the entries in use */
+        num_gh = 1;
+        gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+        if (odip != ndip) {
+                gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+                num_gh++;
+        }
+        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+        num_gh++;
+        if (nip) {
+                gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+                num_gh++;
+        }
+        error = gfs2_glock_nq_m(num_gh, ghs);
+        if (error)
+                goto out_uninit;
+        /* Check out the old directory */
+        error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+        if (error)
+                goto out_gunlock;
+        /* Check out the new directory */
+        if (nip) {
+                error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+                if (error)
+                        goto out_gunlock;
+                /* A directory target must be empty (exactly 2 entries) */
+                if (S_ISDIR(nip->i_di.di_mode)) {
+                        if (nip->i_di.di_entries < 2) {
+                                if (gfs2_consist_inode(nip))
+                                        gfs2_dinode_print(&nip->i_di);
+                                error = -EIO;
+                                goto out_gunlock;
+                        }
+                        if (nip->i_di.di_entries > 2) {
+                                error = -ENOTEMPTY;
+                                goto out_gunlock;
+                        }
+                }
+        } else {
+                error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+                if (error)
+                        goto out_gunlock;
+                /* No target inode: the new name must not exist */
+                error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
+                switch (error) {
+                case -ENOENT:
+                        error = 0;
+                        break;
+                case 0:
+                        error = -EEXIST;
+                        /* fall through */
+                default:
+                        goto out_gunlock;
+                };
+                if (odip != ndip) {
+                        if (!ndip->i_di.di_nlink) {
+                                error = -EINVAL;
+                                goto out_gunlock;
+                        }
+                        if (ndip->i_di.di_entries == (u32)-1) {
+                                error = -EFBIG;
+                                goto out_gunlock;
+                        }
+                        if (S_ISDIR(ip->i_di.di_mode) &&
+                            ndip->i_di.di_nlink == (u32)-1) {
+                                error = -EMLINK;
+                                goto out_gunlock;
+                        }
+                }
+        }
+        /* Check out the dir to be renamed */
+        if (dir_rename) {
+                error = permission(odentry->d_inode, MAY_WRITE, NULL);
+                if (error)
+                        goto out_gunlock;
+        }
+        /* Negative result is an error, otherwise a "needs allocation" flag */
+        alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
+        if (error < 0)
+                goto out_gunlock;
+        error = 0;
+        if (alloc_required) {
+                struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+                error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                if (error)
+                        goto out_alloc;
+                error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
+                                         ndip->i_di.di_gid);
+                if (error)
+                        goto out_gunlock_q;
+                al->al_requested = sdp->sd_max_dirres;
+                error = gfs2_inplace_reserve(ndip);
+                if (error)
+                        goto out_gunlock_q;
+                error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+                                         al->al_rgd->rd_ri.ri_length +
+                                         4 * RES_DINODE + 4 * RES_LEAF +
+                                         RES_STATFS + RES_QUOTA, 0);
+                if (error)
+                        goto out_ipreserv;
+        } else {
+                error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+                                         5 * RES_LEAF, 0);
+                if (error)
+                        goto out_gunlock;
+        }
+        /* Remove the target file, if it exists */
+        if (nip) {
+                if (S_ISDIR(nip->i_di.di_mode))
+                        error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
+                else {
+                        error = gfs2_dir_del(ndip, &ndentry->d_name);
+                        if (error)
+                                goto out_end_trans;
+                        error = gfs2_change_nlink(nip, -1);
+                }
+                if (error)
+                        goto out_end_trans;
+        }
+        if (dir_rename) {
+                /* Directory changes parent: move one link from the old
+                   parent to the new one and repoint ".." */
+                struct qstr name;
+                gfs2_str2qstr(&name, "..");
+                error = gfs2_change_nlink(ndip, +1);
+                if (error)
+                        goto out_end_trans;
+                error = gfs2_change_nlink(odip, -1);
+                if (error)
+                        goto out_end_trans;
+                error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
+                if (error)
+                        goto out_end_trans;
+        } else {
+                /* Otherwise only the ctime of the moved inode is updated */
+                struct buffer_head *dibh;
+                error = gfs2_meta_inode_buffer(ip, &dibh);
+                if (error)
+                        goto out_end_trans;
+                ip->i_di.di_ctime = get_seconds();
+                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+                gfs2_dinode_out(&ip->i_di, dibh->b_data);
+                brelse(dibh);
+        }
+        error = gfs2_dir_del(odip, &odentry->d_name);
+        if (error)
+                goto out_end_trans;
+        error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
+                             IF2DT(ip->i_di.di_mode));
+        if (error)
+                goto out_end_trans;
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_ipreserv:
+        if (alloc_required)
+                gfs2_inplace_release(ndip);
+out_gunlock_q:
+        if (alloc_required)
+                gfs2_quota_unlock(ndip);
+out_alloc:
+        if (alloc_required)
+                gfs2_alloc_put(ndip);
+out_gunlock:
+        gfs2_glock_dq_m(num_gh, ghs);
+out_uninit:
+        for (x = 0; x < num_gh; x++)
+                gfs2_holder_uninit(ghs + x);
+out_gunlock_r:
+        if (dir_rename)
+                gfs2_glock_dq_uninit(&r_gh);
+out:
+        return error;
+}
+/**
+ * gfs2_readlink - Read the value of a symlink
+ * @dentry: the symlink
+ * @user_buf: the user-space buffer to read the symlink data into
+ * @user_size: the size of the buffer
+ *
+ * Fetches the link target through gfs2_readlinki(), starting with an
+ * on-stack buffer of GFS2_FAST_NAME_SIZE bytes, and copies at most
+ * len - 1 bytes of it out to user space.
+ *
+ * Returns: the number of bytes copied, or a negative errno
+ */
+static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
+                         int user_size)
+{
+        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+        char fast[GFS2_FAST_NAME_SIZE];
+        char *link = fast;
+        unsigned int len = GFS2_FAST_NAME_SIZE;
+        int ret;
+        ret = gfs2_readlinki(ip, &link, &len);
+        if (ret)
+                return ret;
+        /* Clamp the copy length to one less than the reported length */
+        if (user_size > len - 1)
+                user_size = len - 1;
+        ret = copy_to_user(user_buf, link, user_size) ? -EFAULT : user_size;
+        /* gfs2_readlinki() may have swapped in a buffer we must free */
+        if (link != fast)
+                kfree(link);
+        return ret;
+}
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * This can handle symlinks of any size. It is optimised for symlinks
+ * under GFS2_FAST_NAME_SIZE, which fit in the on-stack buffer.
+ *
+ * Returns: ERR_PTR() of the error code (ERR_PTR(0) on success)
+ */
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+        char fast[GFS2_FAST_NAME_SIZE];
+        char *target = fast;
+        unsigned int len = GFS2_FAST_NAME_SIZE;
+        int ret;
+        ret = gfs2_readlinki(ip, &target, &len);
+        if (ret)
+                return ERR_PTR(ret);
+        ret = vfs_follow_link(nd, target);
+        /* Release the buffer if gfs2_readlinki() swapped in its own */
+        if (target != fast)
+                kfree(target);
+        return ERR_PTR(ret);
+}
+/**
+ * gfs2_permission - Check access permissions on an inode
+ * @inode: the inode being checked
+ * @mask: the access mask handed to generic_permission()
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * When the inode's version number matches its glock's version number the
+ * check runs without taking the glock (presumably because the in-core
+ * attributes are already current - TODO confirm).  Otherwise a shared
+ * glock is held around the check.
+ *
+ * Returns: errno
+ */
+static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder i_gh;
+        int ret;
+        /* Fast path: no glock needed */
+        if (ip->i_vn == ip->i_gl->gl_vn)
+                return generic_permission(inode, mask, gfs2_check_acl);
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+        if (ret)
+                return ret;
+        ret = generic_permission(inode, mask, gfs2_check_acl_locked);
+        gfs2_glock_dq_uninit(&i_gh);
+        return ret;
+}
+/**
+ * setattr_size - Change the size of an inode
+ * @inode: the inode being resized
+ * @attr: the requested attributes; only ia_size is used here
+ *
+ * Calls vmtruncate() when the requested size differs from the dinode's
+ * current size, then always calls gfs2_truncatei().
+ *
+ * Returns: errno
+ */
+static int setattr_size(struct inode *inode, struct iattr *attr)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        if (attr->ia_size != ip->i_di.di_size) {
+                int ret = vmtruncate(inode, attr->ia_size);
+                if (ret)
+                        return ret;
+        }
+        return gfs2_truncatei(ip, attr->ia_size);
+}
+/**
+ * setattr_chown - Change the owner and/or group of an inode
+ * @inode: the inode being changed
+ * @attr: the requested attribute changes
+ *
+ * Applies @attr to the inode inside a transaction, writes the dinode out
+ * to its buffer and, for each ID that really changes, moves the inode's
+ * block count from the old ID's quota to the new one.
+ *
+ * Returns: errno
+ */
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct buffer_head *dibh;
+        u32 ouid, ogid, nuid, ngid;
+        int error;
+        ouid = ip->i_di.di_uid;
+        ogid = ip->i_di.di_gid;
+        nuid = attr->ia_uid;
+        ngid = attr->ia_gid;
+        /* An ID that is not being set, or is unchanged, needs no quota work */
+        if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
+                ouid = nuid = NO_QUOTA_CHANGE;
+        if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
+                ogid = ngid = NO_QUOTA_CHANGE;
+        gfs2_alloc_get(ip);
+        error = gfs2_quota_lock(ip, nuid, ngid);
+        if (error)
+                goto out_alloc;
+        /* Only check quota when at least one ID really changes */
+        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+                error = gfs2_quota_check(ip, nuid, ngid);
+                if (error)
+                        goto out_gunlock_q;
+        }
+        /* Reserve room for the dinode plus two quota changes */
+        error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+        if (error)
+                goto out_gunlock_q;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out_end_trans;
+        /* A failure here is only warned about; it is still returned below */
+        error = inode_setattr(inode, attr);
+        gfs2_assert_warn(sdp, !error);
+        gfs2_inode_attr_out(ip);
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(&ip->i_di, dibh->b_data);
+        brelse(dibh);
+        /* Transfer the inode's block count from the old IDs to the new */
+        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+                gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
+                gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+        }
+        /* Unwind in the reverse order of acquisition */
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_gunlock_q:
+        gfs2_quota_unlock(ip);
+out_alloc:
+        gfs2_alloc_put(ip);
+        return error;
+}
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inode's attributes.  Write
+ * that change out to disk.  The inode's glock is held exclusively for the
+ * duration, and exactly one of the size, ownership, ACL-aware mode or
+ * simple handlers is run, chosen in that order of priority.
+ *
+ * Returns: errno
+ */
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder i_gh;
+        int error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                return error;
+        /* Immutable and append-only inodes may not have attributes changed */
+        error = -EPERM;
+        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+                goto out;
+        error = inode_change_ok(inode, attr);
+        if (error)
+                goto out;
+        /* Dispatch to a single handler; the first matching case wins */
+        if (attr->ia_valid & ATTR_SIZE)
+                error = setattr_size(inode, attr);
+        else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+                error = setattr_chown(inode, attr);
+        else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
+                error = gfs2_acl_chmod(ip, attr);
+        else
+                error = gfs2_setattr_simple(ip, attr);
+out:
+        gfs2_glock_dq_uninit(&i_gh);
+        /* Only a successful change marks the VFS inode dirty */
+        if (!error)
+                mark_inode_dirty(inode);
+        return error;
+}
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * Fills @stat from the VFS inode while holding the inode's glock in the
+ * shared state.
+ *
+ * Returns: errno
+ */
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                        struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int ret;
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+        if (ret)
+                return ret;
+        generic_fillattr(inode, stat);
+        gfs2_glock_dq_uninit(&gh);
+        return 0;
+}
+/**
+ * gfs2_setxattr - Set an extended attribute on an inode
+ * @dentry: the dentry whose inode is being changed
+ * @name: the full attribute name, including its type prefix
+ * @data: the attribute value
+ * @size: the length of @data
+ * @flags: flags stored in the request's er_flags
+ *
+ * Returns: errno (-EOPNOTSUPP if @name maps to no known attribute type)
+ */
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+                         const void *data, size_t size, int flags)
+{
+        struct inode *inode = dentry->d_inode;
+        struct gfs2_ea_request req;
+        memset(&req, 0, sizeof(req));
+        /* Split the name into an attribute type and the remaining name */
+        req.er_type = gfs2_ea_name2type(name, &req.er_name);
+        if (req.er_type == GFS2_EATYPE_UNUSED)
+                return -EOPNOTSUPP;
+        req.er_name_len = strlen(req.er_name);
+        req.er_data = (char *)data;
+        req.er_data_len = size;
+        req.er_flags = flags;
+        gfs2_assert_warn(GFS2_SB(inode), !(req.er_flags & GFS2_ERF_MODE));
+        return gfs2_ea_set(GFS2_I(inode), &req);
+}
+/**
+ * gfs2_getxattr - Read an extended attribute of an inode
+ * @dentry: the dentry whose inode is being queried
+ * @name: the full attribute name, including its type prefix
+ * @data: the buffer that receives the value
+ * @size: the size of @data
+ *
+ * Returns: the result of gfs2_ea_get(), or -EOPNOTSUPP if @name maps to
+ *          no known attribute type
+ */
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+                             void *data, size_t size)
+{
+        struct gfs2_ea_request req;
+        memset(&req, 0, sizeof(req));
+        req.er_type = gfs2_ea_name2type(name, &req.er_name);
+        if (req.er_type == GFS2_EATYPE_UNUSED)
+                return -EOPNOTSUPP;
+        req.er_name_len = strlen(req.er_name);
+        req.er_data = data;
+        req.er_data_len = size;
+        return gfs2_ea_get(GFS2_I(dentry->d_inode), &req);
+}
+/**
+ * gfs2_listxattr - List the extended attributes of an inode
+ * @dentry: the dentry whose inode is being queried
+ * @buffer: the buffer that receives the list
+ * @size: the size of @buffer; when zero no buffer is passed on
+ *
+ * Returns: the result of gfs2_ea_list()
+ */
+static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+        struct gfs2_ea_request req;
+        memset(&req, 0, sizeof(req));
+        /* A zero size means the caller supplied no usable buffer */
+        if (size)
+                req.er_data = buffer;
+        else
+                req.er_data = NULL;
+        req.er_data_len = size;
+        return gfs2_ea_list(GFS2_I(dentry->d_inode), &req);
+}
+/**
+ * gfs2_removexattr - Remove an extended attribute from an inode
+ * @dentry: the dentry whose inode is being changed
+ * @name: the full attribute name, including its type prefix
+ *
+ * Returns: errno (-EOPNOTSUPP if @name maps to no known attribute type)
+ */
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+        struct gfs2_ea_request req;
+        memset(&req, 0, sizeof(req));
+        req.er_type = gfs2_ea_name2type(name, &req.er_name);
+        if (req.er_type == GFS2_EATYPE_UNUSED)
+                return -EOPNOTSUPP;
+        req.er_name_len = strlen(req.er_name);
+        return gfs2_ea_remove(GFS2_I(dentry->d_inode), &req);
+}
+/*
+ * Inode operations table named for regular files: permission checking,
+ * attribute get/set and the four extended-attribute handlers.
+ */
+struct inode_operations gfs2_file_iops = {
+        .permission = gfs2_permission,
+        .setattr = gfs2_setattr,
+        .getattr = gfs2_getattr,
+        .setxattr = gfs2_setxattr,
+        .getxattr = gfs2_getxattr,
+        .listxattr = gfs2_listxattr,
+        .removexattr = gfs2_removexattr,
+};
+/*
+ * Inode operations table named for device nodes.  It installs the same
+ * set of handlers as gfs2_file_iops.
+ */
+struct inode_operations gfs2_dev_iops = {
+        .permission = gfs2_permission,
+        .setattr = gfs2_setattr,
+        .getattr = gfs2_getattr,
+        .setxattr = gfs2_setxattr,
+        .getxattr = gfs2_getxattr,
+        .listxattr = gfs2_listxattr,
+        .removexattr = gfs2_removexattr,
+};
+/*
+ * Inode operations table named for directories: the namespace operations
+ * (create, lookup, link, unlink, symlink, mkdir, rmdir, mknod, rename)
+ * plus the common permission, attribute and extended-attribute handlers.
+ */
+struct inode_operations gfs2_dir_iops = {
+        .create = gfs2_create,
+        .lookup = gfs2_lookup,
+        .link = gfs2_link,
+        .unlink = gfs2_unlink,
+        .symlink = gfs2_symlink,
+        .mkdir = gfs2_mkdir,
+        .rmdir = gfs2_rmdir,
+        .mknod = gfs2_mknod,
+        .rename = gfs2_rename,
+        .permission = gfs2_permission,
+        .setattr = gfs2_setattr,
+        .getattr = gfs2_getattr,
+        .setxattr = gfs2_setxattr,
+        .getxattr = gfs2_getxattr,
+        .listxattr = gfs2_listxattr,
+        .removexattr = gfs2_removexattr,
+};
+/*
+ * Inode operations table named for symbolic links: readlink and
+ * follow_link plus the common permission, attribute and
+ * extended-attribute handlers.
+ */
+struct inode_operations gfs2_symlink_iops = {
+        .readlink = gfs2_readlink,
+        .follow_link = gfs2_follow_link,
+        .permission = gfs2_permission,
+        .setattr = gfs2_setattr,
+        .getattr = gfs2_getattr,
+        .setxattr = gfs2_setxattr,
+        .getxattr = gfs2_getxattr,
+        .listxattr = gfs2_listxattr,
+        .removexattr = gfs2_removexattr,
+};
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_INODE_DOT_H__
+#define __OPS_INODE_DOT_H__
+#include <linux/fs.h>
+/* Inode operations tables, one per kind of inode named in the identifier */
+extern struct inode_operations gfs2_file_iops;
+extern struct inode_operations gfs2_dir_iops;
+extern struct inode_operations gfs2_symlink_iops;
+extern struct inode_operations gfs2_dev_iops;
+#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "mount.h"
+#include "ops_super.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+#include "trans.h"
+#include "dir.h"
+#include "eattr.h"
+#include "bmap.h"
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag
+ *
+ * Flushes the log for the inode's glock when a synchronous write is
+ * requested on a "normal" inode, unless the caller is running with
+ * PF_MEMALLOC set.
+ *
+ * Returns: errno (always 0)
+ */
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        /* Only "normal" inodes carry i_private */
+        if (!inode->i_private)
+                return 0;
+        if (current->flags & PF_MEMALLOC)
+                return 0;
+        if (sync)
+                gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+        return 0;
+}
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ * Stops the filesystem's kernel threads, makes the filesystem read-only
+ * if it was not already, releases the system inodes and glock holders,
+ * clears the glock hash, unmounts the lock protocol and finally frees the
+ * per-filesystem structure.  Does nothing when there is no GFS2 superblock
+ * data or when the superblock belongs to the "gfs2meta" filesystem type.
+ */
+static void gfs2_put_super(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        int error;
+        if (!sdp)
+                return;
+        if (!strncmp(sb->s_type->name, "gfs2meta", 8))
+                return; /* Nothing to do */
+        /*  Unfreeze the filesystem, if we need to  */
+        mutex_lock(&sdp->sd_freeze_lock);
+        if (sdp->sd_freeze_count)
+                gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+        mutex_unlock(&sdp->sd_freeze_lock);
+        /*  Stop the daemons before tearing down what they use  */
+        kthread_stop(sdp->sd_quotad_process);
+        kthread_stop(sdp->sd_logd_process);
+        kthread_stop(sdp->sd_recoverd_process);
+        while (sdp->sd_glockd_num--)
+                kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+        kthread_stop(sdp->sd_scand_process);
+        /*  A failure to go read-only is reported as an I/O error  */
+        if (!(sb->s_flags & MS_RDONLY)) {
+                error = gfs2_make_fs_ro(sdp);
+                if (error)
+                        gfs2_io_error(sdp);
+        }
+        /*  At this point, we're through modifying the disk  */
+        /*  Release stuff  */
+        iput(sdp->sd_master_dir);
+        iput(sdp->sd_jindex);
+        iput(sdp->sd_inum_inode);
+        iput(sdp->sd_statfs_inode);
+        iput(sdp->sd_rindex);
+        iput(sdp->sd_quota_inode);
+        gfs2_glock_put(sdp->sd_rename_gl);
+        gfs2_glock_put(sdp->sd_trans_gl);
+        /*  The per-node holders and inodes are only released for non-spectators  */
+        if (!sdp->sd_args.ar_spectator) {
+                gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+                iput(sdp->sd_ir_inode);
+                iput(sdp->sd_sc_inode);
+                iput(sdp->sd_qc_inode);
+        }
+        gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+        gfs2_clear_rgrpd(sdp);
+        gfs2_jindex_free(sdp);
+        /*  Take apart glock structures and buffer lists  */
+        gfs2_gl_hash_clear(sdp, WAIT);
+        /*  Unmount the locking protocol  */
+        gfs2_lm_unmount(sdp);
+        /*  At this point, we're through participating in the lockspace  */
+        gfs2_sys_fs_del(sdp);
+        kfree(sdp);
+}
+/**
+ * gfs2_write_super - disk commit all incore transactions
+ * @sb: the filesystem
+ *
+ * This function is called every time sync(2) is called.
+ * After this exits, all dirty buffers are synced.
+ * The log is flushed without naming any particular glock.
+ */
+static void gfs2_write_super(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        gfs2_log_flush(sdp, NULL);
+}
+/**
+ * gfs2_write_super_lockfs - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ * Keeps retrying the freeze once a second until it succeeds, logging the
+ * reason for every failed attempt.
+ */
+static void gfs2_write_super_lockfs(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        int ret;
+        while ((ret = gfs2_freeze_fs(sdp)) != 0) {
+                if (ret == -EBUSY)
+                        fs_err(sdp, "waiting for recovery before freeze\n");
+                else
+                        fs_err(sdp, "error freezing FS: %d\n", ret);
+                fs_err(sdp, "retrying...\n");
+                msleep(1000);
+        }
+}
+/**
+ * gfs2_unlockfs - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ * Undoes a freeze by calling gfs2_unfreeze_fs().
+ */
+static void gfs2_unlockfs(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        gfs2_unfreeze_fs(sdp);
+}
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @dentry: A dentry on the filesystem being queried
+ * @buf: The buffer to fill in
+ *
+ * The statistics come from gfs2_statfs_slow() when the statfs_slow
+ * tunable is set, and from gfs2_statfs_i() otherwise.
+ *
+ * Returns: 0 on success or error code
+ */
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_inode->i_sb;
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_statfs_change sc;
+        int ret;
+        ret = gfs2_tune_get(sdp, gt_statfs_slow) ?
+                gfs2_statfs_slow(sdp, &sc) : gfs2_statfs_i(sdp, &sc);
+        if (ret)
+                return ret;
+        /* Identification and geometry */
+        buf->f_type = GFS2_MAGIC;
+        buf->f_bsize = sdp->sd_sb.sb_bsize;
+        buf->f_namelen = GFS2_FNAMESIZE;
+        /* Block counts */
+        buf->f_blocks = sc.sc_total;
+        buf->f_bfree = sc.sc_free;
+        buf->f_bavail = sc.sc_free;
+        /* Inode counts: every free block is reported as a free inode */
+        buf->f_files = sc.sc_dinodes + sc.sc_free;
+        buf->f_ffree = sc.sc_free;
+        return 0;
+}
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags; updated in place
+ * @data:  the mount option string, parsed by gfs2_mount_args()
+ *
+ * Spectator mounts are forced read-only.  Otherwise the filesystem is
+ * switched between read-only and read-write when the requested state
+ * differs from the current one.  MS_NOATIME and MS_NODIRATIME are always
+ * set in @flags on return.
+ *
+ * Returns: errno
+ */
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        int ret;
+        ret = gfs2_mount_args(sdp, data, 1);
+        if (ret)
+                return ret;
+        if (sdp->sd_args.ar_spectator) {
+                *flags |= MS_RDONLY;
+        } else {
+                int want_ro = (*flags & MS_RDONLY) != 0;
+                int is_ro = (sb->s_flags & MS_RDONLY) != 0;
+                if (want_ro && !is_ro)
+                        ret = gfs2_make_fs_ro(sdp);
+                else if (!want_ro && is_ro)
+                        ret = gfs2_make_fs_rw(sdp);
+        }
+        /* Remember whether the user asked for atime updates to be skipped */
+        if (*flags & (MS_NOATIME | MS_NODIRATIME))
+                set_bit(SDF_NOATIME, &sdp->sd_flags);
+        else
+                clear_bit(SDF_NOATIME, &sdp->sd_flags);
+        /* Don't let the VFS update atimes.  GFS2 handles this itself. */
+        *flags |= MS_NOATIME | MS_NODIRATIME;
+        return ret;
+}
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ * Detaches the inode from its glock, schedules that glock for reclaim and
+ * drops the reference to it, then releases the iopen holder if one is
+ * still attached.
+ */
+static void gfs2_clear_inode(struct inode *inode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        /* i_private tells us this is a "real" inode and not one which
+         * only serves to contain an address space (see rgrp.c, meta_io.c)
+         * and which therefore doesn't have its own glocks.
+         */
+        if (!inode->i_private)
+                return;
+        gfs2_glock_inode_squish(inode);
+        gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
+        ip->i_gl->gl_object = NULL;
+        gfs2_glock_schedule_for_reclaim(ip->i_gl);
+        gfs2_glock_put(ip->i_gl);
+        ip->i_gl = NULL;
+        if (ip->i_iopen_gh.gh_gl)
+                gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+}
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Emits one ",option" or ",option=value" entry for each mount argument
+ * that is set or that differs from its default.
+ *
+ * Returns: 0 on success or error code
+ */
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+        struct gfs2_args *args = &sdp->sd_args;
+        char *quota_state = "unknown";
+        char *data_state = "unknown";
+        /* String-valued options are shown only when non-empty */
+        if (args->ar_lockproto[0])
+                seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+        if (args->ar_locktable[0])
+                seq_printf(s, ",locktable=%s", args->ar_locktable);
+        if (args->ar_hostdata[0])
+                seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+        /* Boolean options */
+        if (args->ar_spectator)
+                seq_printf(s, ",spectator");
+        if (args->ar_ignore_local_fs)
+                seq_printf(s, ",ignore_local_fs");
+        if (args->ar_localflocks)
+                seq_printf(s, ",localflocks");
+        if (args->ar_localcaching)
+                seq_printf(s, ",localcaching");
+        if (args->ar_debug)
+                seq_printf(s, ",debug");
+        if (args->ar_upgrade)
+                seq_printf(s, ",upgrade");
+        if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
+                seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
+        if (args->ar_posix_acl)
+                seq_printf(s, ",acl");
+        /* Quota mode, shown only when it is not the default */
+        if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+                if (args->ar_quota == GFS2_QUOTA_OFF)
+                        quota_state = "off";
+                else if (args->ar_quota == GFS2_QUOTA_ACCOUNT)
+                        quota_state = "account";
+                else if (args->ar_quota == GFS2_QUOTA_ON)
+                        quota_state = "on";
+                seq_printf(s, ",quota=%s", quota_state);
+        }
+        if (args->ar_suiddir)
+                seq_printf(s, ",suiddir");
+        /* Data journaling mode, shown only when it is not the default */
+        if (args->ar_data != GFS2_DATA_DEFAULT) {
+                if (args->ar_data == GFS2_DATA_WRITEBACK)
+                        data_state = "writeback";
+                else if (args->ar_data == GFS2_DATA_ORDERED)
+                        data_state = "ordered";
+                seq_printf(s, ",data=%s", data_state);
+        }
+        return 0;
+}
+/*
+ * gfs2_delete_inode - Deallocate an inode's on-disk resources
+ * @inode: The VFS inode being deleted
+ *
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ *
+ * Whatever happens with the locks and the deallocation, the inode's
+ * pages are truncated and clear_inode() is called before returning.
+ */
+static void gfs2_delete_inode(struct inode *inode)
+{
+        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int error;
+        /* Inodes without i_private only get the VFS-level teardown */
+        if (!inode->i_private)
+                goto out;
+        /* Take the inode's main glock exclusively; give up if that fails */
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
+        if (unlikely(error)) {
+                gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+                goto out;
+        }
+        /* Drop the iopen holder and re-acquire it in the exclusive state */
+        gfs2_glock_dq(&ip->i_iopen_gh);
+        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+        error = gfs2_glock_nq(&ip->i_iopen_gh);
+        if (error)
+                goto out_uninit;
+        /* Deallocate in stages: hash table, extended attributes, data, dinode */
+        if (S_ISDIR(ip->i_di.di_mode) &&
+            (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+                error = gfs2_dir_exhash_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        if (ip->i_di.di_eattr) {
+                error = gfs2_ea_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        if (!gfs2_is_stuffed(ip)) {
+                error = gfs2_file_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        error = gfs2_dinode_dealloc(ip);
+out_unlock:
+        gfs2_glock_dq(&ip->i_iopen_gh);
+out_uninit:
+        gfs2_holder_uninit(&ip->i_iopen_gh);
+        gfs2_glock_dq_uninit(&gh);
+        /* Errors can only be logged here; the function returns void */
+        if (error)
+                fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+        truncate_inode_pages(&inode->i_data, 0);
+        clear_inode(inode);
+}
+/**
+ * gfs2_alloc_inode - Allocate an in-core GFS2 inode for the VFS
+ * @sb: The VFS superblock the inode will belong to
+ *
+ * Allocates a struct gfs2_inode from the GFS2 inode cache and initialises
+ * the fields that must be valid before the inode is set up any further.
+ *
+ * Returns: the embedded VFS inode, or NULL if the allocation failed
+ */
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_inode *ip;
+        ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+        /*
+         * Report failure explicitly rather than forming &ip->i_inode from
+         * a NULL ip, which only yields NULL if i_inode is the first member.
+         */
+        if (!ip)
+                return NULL;
+        ip->i_flags = 0;
+        ip->i_gl = NULL;
+        ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
+        ip->i_last_pfault = jiffies;
+        return &ip->i_inode;
+}
+/**
+ * gfs2_destroy_inode - Free an in-core GFS2 inode
+ * @inode: The VFS inode embedded in the struct gfs2_inode to free
+ *
+ * The cache holds struct gfs2_inode objects, so the containing structure
+ * is handed back to it rather than the embedded VFS inode pointer.
+ */
+static void gfs2_destroy_inode(struct inode *inode)
+{
+        kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
+}
+/*
+ * Superblock operations table for GFS2: inode allocation and teardown,
+ * write-out, freeze/unfreeze, statfs, remount and mount option display.
+ */
+struct super_operations gfs2_super_ops = {
+        .alloc_inode = gfs2_alloc_inode,
+        .destroy_inode = gfs2_destroy_inode,
+        .write_inode = gfs2_write_inode,
+        .delete_inode = gfs2_delete_inode,
+        .put_super = gfs2_put_super,
+        .write_super = gfs2_write_super,
+        .write_super_lockfs = gfs2_write_super_lockfs,
+        .unlockfs = gfs2_unlockfs,
+        .statfs = gfs2_statfs,
+        .remount_fs = gfs2_remount_fs,
+        .clear_inode = gfs2_clear_inode,
+        .show_options = gfs2_show_options,
+};
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_SUPER_DOT_H__
+#define __OPS_SUPER_DOT_H__
+#include <linux/fs.h>
+/* Superblock operations table for GFS2 */
+extern struct super_operations gfs2_super_ops;
+#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+/*
+ * pfault_be_greedy - Ask for the inode's glock to be held greedily
+ * @ip: the inode that has just taken a page fault
+ *
+ * Records the time of this fault and passes the inode's current greedy
+ * time to gfs2_glock_be_greedy().
+ */
+static void pfault_be_greedy(struct gfs2_inode *ip)
+{
+        unsigned int time;
+        /* i_greedy and i_last_pfault are accessed under i_spin */
+        spin_lock(&ip->i_spin);
+        time = ip->i_greedy;
+        ip->i_last_pfault = jiffies;
+        spin_unlock(&ip->i_spin);
+        /*
+         * Take an inode reference for the greedy request; it is dropped
+         * here when gfs2_glock_be_greedy() returns non-zero (presumably
+         * meaning the request was not taken up - TODO confirm).
+         */
+        igrab(&ip->i_inode);
+        if (gfs2_glock_be_greedy(ip->i_gl, time))
+                iput(&ip->i_inode);
+}
+/*
+ * gfs2_private_nopage - nopage handler for the "private" vm operations
+ * @area: the faulting VMA
+ * @address: the faulting address
+ * @type: passed through to filemap_nopage()
+ *
+ * Marks the inode as paged, lets filemap_nopage() service the fault and,
+ * when a real page comes back, asks for the glock to be held greedily.
+ *
+ * Returns: whatever filemap_nopage() returned
+ */
+static struct page *gfs2_private_nopage(struct vm_area_struct *area,
+                                        unsigned long address, int *type)
+{
+        struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
+        struct page *page;
+        set_bit(GIF_PAGED, &ip->i_flags);
+        page = filemap_nopage(area, address, type);
+        /* Nothing more to do when the fault did not produce a page */
+        if (!page || page == NOPAGE_OOM)
+                return page;
+        pfault_be_greedy(ip);
+        return page;
+}
+/*
+ * alloc_page_backing - Allocate on-disk blocks to back one page
+ * @ip: the inode the page belongs to
+ * @page: the page that needs backing blocks
+ *
+ * Takes the quota lock, reserves space, and inside a transaction unstuffs
+ * the dinode if necessary and maps every block covered by the page with
+ * allocation enabled.
+ *
+ * Returns: errno
+ */
+static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned long index = page->index;
+        /*
+         * Widen to 64 bits before shifting: index is an unsigned long, so
+         * on 32-bit the shift could otherwise overflow when the block size
+         * is smaller than the page size.
+         */
+        u64 lblock = (u64)index << (PAGE_CACHE_SHIFT -
+                                    sdp->sd_sb.sb_bsize_shift);
+        unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
+        struct gfs2_alloc *al;
+        unsigned int data_blocks, ind_blocks;
+        unsigned int x;
+        int error;
+        al = gfs2_alloc_get(ip);
+        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (error)
+                goto out;
+        error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+        if (error)
+                goto out_gunlock_q;
+        /* Reserve enough data and indirect blocks for a page-sized write */
+        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+        al->al_requested = data_blocks + ind_blocks;
+        error = gfs2_inplace_reserve(ip);
+        if (error)
+                goto out_gunlock_q;
+        error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
+                                 ind_blocks + RES_DINODE +
+                                 RES_STATFS + RES_QUOTA, 0);
+        if (error)
+                goto out_ipres;
+        if (gfs2_is_stuffed(ip)) {
+                error = gfs2_unstuff_dinode(ip, NULL);
+                if (error)
+                        goto out_trans;
+        }
+        /* Map (allocating as needed) extent by extent across the page */
+        for (x = 0; x < blocks; ) {
+                u64 dblock;
+                unsigned int extlen;
+                int new = 1;
+                error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+                if (error)
+                        goto out_trans;
+                lblock += extlen;
+                x += extlen;
+        }
+        /* The caller only gets here when allocation was required */
+        gfs2_assert_warn(sdp, al->al_alloced);
+out_trans:
+        gfs2_trans_end(sdp);
+out_ipres:
+        gfs2_inplace_release(ip);
+out_gunlock_q:
+        gfs2_quota_unlock(ip);
+out:
+        gfs2_alloc_put(ip);
+        return error;
+}
+/*
+ * gfs2_sharewrite_nopage - nopage handler for the "sharewrite" vm operations
+ * @area: the faulting VMA
+ * @address: the faulting address
+ * @type: passed through to filemap_nopage()
+ *
+ * Holds the inode's glock exclusively while the fault is serviced and,
+ * when the faulting page needs new on-disk blocks, allocates them and
+ * marks the page dirty.
+ *
+ * Returns: the page from filemap_nopage(), or NULL if the glock could not
+ *          be acquired, the allocation check failed or the backing blocks
+ *          could not be allocated
+ */
+static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
+                                           unsigned long address, int *type)
+{
+        struct file *file = area->vm_file;
+        struct gfs2_file *gf = file->private_data;
+        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+        struct gfs2_holder i_gh;
+        struct page *result = NULL;
+        /* Page index within the file of the faulting address */
+        unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
+                              area->vm_pgoff;
+        int alloc_required;
+        int error;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                return NULL;
+        set_bit(GIF_PAGED, &ip->i_flags);
+        set_bit(GIF_SW_PAGED, &ip->i_flags);
+        error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
+                                          PAGE_CACHE_SIZE, &alloc_required);
+        if (error)
+                goto out;
+        /*
+         * GFF_EXLOCK is set only around filemap_nopage(); presumably it
+         * tells the read path that the exclusive glock is already held -
+         * TODO confirm against the readpage code.
+         */
+        set_bit(GFF_EXLOCK, &gf->f_flags);
+        result = filemap_nopage(area, address, type);
+        clear_bit(GFF_EXLOCK, &gf->f_flags);
+        if (!result || result == NOPAGE_OOM)
+                goto out;
+        if (alloc_required) {
+                error = alloc_page_backing(ip, result);
+                if (error) {
+                        /* Drop the page reference and fail the fault */
+                        page_cache_release(result);
+                        result = NULL;
+                        goto out;
+                }
+                set_page_dirty(result);
+        }
+        pfault_be_greedy(ip);
+out:
+        gfs2_glock_dq_uninit(&i_gh);
+        return result;
+}
+/* VM operations whose nopage handler is gfs2_private_nopage() */
+struct vm_operations_struct gfs2_vm_ops_private = {
+        .nopage = gfs2_private_nopage,
+};
+/* VM operations whose nopage handler is gfs2_sharewrite_nopage() */
+struct vm_operations_struct gfs2_vm_ops_sharewrite = {
+        .nopage = gfs2_sharewrite_nopage,
+};
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __OPS_VM_DOT_H__
+#define __OPS_VM_DOT_H__
+#include <linux/mm.h>
+/* VM operations tables defined in ops_vm.c */
+extern struct vm_operations_struct gfs2_vm_ops_private;
+extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
+#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..c69b94a55588
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1227 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+/*
+ * Quota change tags are associated with each transaction that allocates or
+ * deallocates space.  Those changes are accumulated locally to each node (in a
+ * per-node file) and then are periodically synced to the quota file.  This
+ * avoids the bottleneck of constantly touching the quota file, but introduces
+ * fuzziness in the current usage value of IDs that are being used on different
+ * nodes in the cluster simultaneously.  So, it is possible for a user on
+ * multiple nodes to overrun their quota, but that overrun is controllable.
+ * Since quota tags are part of transactions, there is no need for a quota check
+ * program to be run on node crashes or anything like that.
+ *
+ * There are a couple of knobs that let the administrator manage the quota
+ * fuzziness.  "quota_quantum" sets the maximum time a quota change can be
+ * sitting on one node before being synced to the quota file.  (The default is
+ * 60 seconds.)  Another knob, "quota_scale" controls how quickly the frequency
+ * of quota file syncs increases as the user moves closer to their limit.  The
+ * more frequent the syncs, the more accurate the quota enforcement, but that
+ * means that there is more contention between the nodes for the quota file.
+ * The default value is one.  This sets the maximum theoretical quota overrun
+ * (with infinite nodes with infinite bandwidth) to twice the user's limit.  (In
+ * practice, the maximum overrun you see should be much less.)  A "quota_scale"
+ * number greater than one makes quota syncs more frequent and reduces the
+ * maximum overrun.  Numbers less than one (but greater than zero) make quota
+ * syncs less frequent.
+ *
+ * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
+ * the quota file, so it is not being constantly read.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "inode.h"
+#include "ops_file.h"
+#include "ops_address.h"
+#include "util.h"
+#define QUOTA_USER 1
+#define QUOTA_GROUP 0
+/*
+ * qd2offset - byte offset of this ID's record
+ * @qd: the quota data
+ *
+ * Records are interleaved by ID: index 2*id when QDF_USER is set, 2*id + 1
+ * otherwise.  Each record is sizeof(struct gfs2_quota) bytes.
+ *
+ * Returns: the byte offset of the record
+ */
+static u64 qd2offset(struct gfs2_quota_data *qd)
+{
+        u64 index = 2 * (u64)qd->qd_id;
+        if (!test_bit(QDF_USER, &qd->qd_flags))
+                index++;
+        return index * sizeof(struct gfs2_quota);
+}
+/*
+ * qd_alloc - allocate and initialise a quota data structure
+ * @sdp: the filesystem
+ * @user: non-zero for a user ID, zero for a group ID
+ * @id: the ID
+ * @qdp: set to the new structure on success
+ *
+ * The new structure starts with a reference count of one and no slot (-1).
+ * Its glock number is 2*id for users and 2*id + 1 for groups.  After the LVB
+ * hold is attempted, the reference obtained from gfs2_glock_get() is dropped
+ * whether or not the hold succeeded.
+ *
+ * Returns: 0 on success, -ENOMEM, or the error from the glock/LVB calls
+ */
+static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+                    struct gfs2_quota_data **qdp)
+{
+        struct gfs2_quota_data *new_qd;
+        int ret;
+        new_qd = kzalloc(sizeof(*new_qd), GFP_KERNEL);
+        if (new_qd == NULL)
+                return -ENOMEM;
+        new_qd->qd_count = 1;
+        new_qd->qd_id = id;
+        if (user)
+                set_bit(QDF_USER, &new_qd->qd_flags);
+        new_qd->qd_slot = -1;
+        ret = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+                             &gfs2_quota_glops, CREATE, &new_qd->qd_gl);
+        if (ret == 0) {
+                ret = gfs2_lvb_hold(new_qd->qd_gl);
+                gfs2_glock_put(new_qd->qd_gl);
+        }
+        if (ret != 0) {
+                kfree(new_qd);
+                return ret;
+        }
+        *qdp = new_qd;
+        return 0;
+}
+/*
+ * qd_get - find (and optionally create) the quota data for an ID
+ * @sdp: the filesystem
+ * @user: non-zero for a user ID, zero for a group ID
+ * @id: the ID
+ * @create: if non-zero, allocate an entry when none is on the list
+ * @qdp: set to the entry found/created, or NULL if none and !@create
+ *
+ * An existing entry on sd_quota_list gets its qd_count bumped.  Allocation
+ * happens outside sd_quota_spin, so after allocating the loop re-scans the
+ * list: if another entry for the same ID appeared meanwhile, that one is
+ * used and the spare allocation is released.
+ *
+ * Returns: 0 on success (including "not found" when !@create), or the error
+ * from qd_alloc()
+ */
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+                  struct gfs2_quota_data **qdp)
+{
+        struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
+        int error, found;
+        *qdp = NULL;
+        for (;;) {
+                found = 0;
+                spin_lock(&sdp->sd_quota_spin);
+                list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+                        /* Match on both the ID and the user/group kind. */
+                        if (qd->qd_id == id &&
+                            !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+                                qd->qd_count++;
+                                found = 1;
+                                break;
+                        }
+                }
+                /* After a full traversal qd is not a valid entry pointer. */
+                if (!found)
+                        qd = NULL;
+                /* Second pass: publish the entry allocated on the first. */
+                if (!qd && new_qd) {
+                        qd = new_qd;
+                        list_add(&qd->qd_list, &sdp->sd_quota_list);
+                        atomic_inc(&sdp->sd_quota_count);
+                        new_qd = NULL;
+                }
+                spin_unlock(&sdp->sd_quota_spin);
+                if (qd || !create) {
+                        /* An existing entry won; drop the unused allocation. */
+                        if (new_qd) {
+                                gfs2_lvb_unhold(new_qd->qd_gl);
+                                kfree(new_qd);
+                        }
+                        *qdp = qd;
+                        return 0;
+                }
+                error = qd_alloc(sdp, user, id, &new_qd);
+                if (error)
+                        return error;
+        }
+}
+/*
+ * qd_hold - take an additional reference on a quota data structure
+ * @qd: the quota data; must already have a non-zero qd_count (asserted)
+ */
+static void qd_hold(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        spinlock_t *lock = &sdp->sd_quota_spin;
+        spin_lock(lock);
+        gfs2_assert(sdp, qd->qd_count);
+        qd->qd_count += 1;
+        spin_unlock(lock);
+}
+/*
+ * qd_put - drop a reference on a quota data structure
+ * @qd: the quota data; must have a non-zero qd_count (asserted)
+ *
+ * The structure is not freed here; when the count reaches zero only the
+ * qd_last_touched timestamp is recorded.
+ */
+static void qd_put(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        spinlock_t *lock = &sdp->sd_quota_spin;
+        spin_lock(lock);
+        gfs2_assert(sdp, qd->qd_count);
+        qd->qd_count--;
+        if (qd->qd_count == 0)
+                qd->qd_last_touched = jiffies;
+        spin_unlock(lock);
+}
+/*
+ * slot_get - take a reference on qd's slot, assigning one if needed
+ * @qd: the quota data
+ *
+ * The first holder (qd_slot_count going 0 -> 1) scans sd_quota_bitmap for a
+ * clear bit, records its index in qd->qd_slot and sets it.  Later holders
+ * only bump the count.  Runs under sd_quota_spin.
+ *
+ * Returns: 0 on success, -ENOSPC if no slot below sd_quota_slots is free
+ */
+static int slot_get(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        unsigned int c, o = 0, b;
+        unsigned char byte = 0;
+        spin_lock(&sdp->sd_quota_spin);
+        if (qd->qd_slot_count++) {
+                spin_unlock(&sdp->sd_quota_spin);
+                return 0;
+        }
+        /* Find the first bitmap byte with at least one clear bit. */
+        for (c = 0; c < sdp->sd_quota_chunks; c++)
+                for (o = 0; o < PAGE_SIZE; o++) {
+                        byte = sdp->sd_quota_bitmap[c][o];
+                        if (byte != 0xFF)
+                                goto found;
+                }
+        goto fail;
+found:
+        /* Lowest clear bit within that byte. */
+        for (b = 0; b < 8; b++)
+                if (!(byte & (1 << b)))
+                        break;
+        qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
+        /* The last chunk may extend past the number of real slots. */
+        if (qd->qd_slot >= sdp->sd_quota_slots)
+                goto fail;
+        sdp->sd_quota_bitmap[c][o] |= 1 << b;
+        spin_unlock(&sdp->sd_quota_spin);
+        return 0;
+fail:
+        /* Undo the count taken at the top. */
+        qd->qd_slot_count--;
+        spin_unlock(&sdp->sd_quota_spin);
+        return -ENOSPC;
+}
+/*
+ * slot_hold - take an additional reference on an already-assigned slot
+ * @qd: the quota data; must have a non-zero qd_slot_count (asserted)
+ */
+static void slot_hold(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        spinlock_t *lock = &sdp->sd_quota_spin;
+        spin_lock(lock);
+        gfs2_assert(sdp, qd->qd_slot_count);
+        qd->qd_slot_count += 1;
+        spin_unlock(lock);
+}
+/*
+ * slot_put - drop a reference on qd's slot
+ * @qd: the quota data; must have a non-zero qd_slot_count (asserted)
+ *
+ * When the last reference goes away, the slot's bit in sd_quota_bitmap is
+ * cleared and qd->qd_slot is reset to -1.
+ */
+static void slot_put(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        spinlock_t *lock = &sdp->sd_quota_spin;
+        spin_lock(lock);
+        gfs2_assert(sdp, qd->qd_slot_count);
+        qd->qd_slot_count--;
+        if (qd->qd_slot_count == 0) {
+                gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
+                qd->qd_slot = -1;
+        }
+        spin_unlock(lock);
+}
+/*
+ * bh_get - pin the quota-change buffer that backs qd's slot
+ * @qd: the quota data; qd->qd_slot must already be assigned
+ *
+ * The first holder (qd_bh_count going 0 -> 1) maps and reads the block of the
+ * quota-change inode (sd_qc_inode) that contains slot qd->qd_slot, verifies
+ * it is a GFS2_METATYPE_QC block, and caches both the buffer (qd_bh) and a
+ * pointer to the slot's entry inside it (qd_bh_qc).  Later holders only bump
+ * the count.  Serialised by sd_quota_mutex.
+ *
+ * Returns: 0 on success; otherwise the error from gfs2_block_map() or
+ * gfs2_meta_read(), or -EIO on a metatype mismatch.  On failure the count
+ * taken here is rolled back.
+ */
+static int bh_get(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+        unsigned int block, offset;
+        struct buffer_head *bh;
+        int error;
+        struct buffer_head bh_map;
+        mutex_lock(&sdp->sd_quota_mutex);
+        if (qd->qd_bh_count++) {
+                mutex_unlock(&sdp->sd_quota_mutex);
+                return 0;
+        }
+        /* Which block of the file, and which entry within that block. */
+        block = qd->qd_slot / sdp->sd_qc_per_block;
+        offset = qd->qd_slot % sdp->sd_qc_per_block;
+        error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map, 1);
+        if (error)
+                goto fail;
+        error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+        if (error)
+                goto fail;
+        error = -EIO;
+        if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
+                goto fail_brelse;
+        qd->qd_bh = bh;
+        /* Entries follow the metadata header at the start of the block. */
+        qd->qd_bh_qc = (struct gfs2_quota_change *)
+                (bh->b_data + sizeof(struct gfs2_meta_header) +
+                 offset * sizeof(struct gfs2_quota_change));
+        /*
+         * Release the mutex taken at the top.  This used to be a second
+         * mutex_lock(), which left the mutex held on the success path.
+         */
+        mutex_unlock(&sdp->sd_quota_mutex);
+        return 0;
+fail_brelse:
+        brelse(bh);
+fail:
+        qd->qd_bh_count--;
+        mutex_unlock(&sdp->sd_quota_mutex);
+        return error;
+}
+/*
+ * bh_put - drop a reference on qd's quota-change buffer
+ * @qd: the quota data; must have a non-zero qd_bh_count (asserted)
+ *
+ * When the last reference goes away the buffer is released and the cached
+ * qd_bh / qd_bh_qc pointers are cleared.  Serialised by sd_quota_mutex.
+ */
+static void bh_put(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        struct mutex *lock = &sdp->sd_quota_mutex;
+        mutex_lock(lock);
+        gfs2_assert(sdp, qd->qd_bh_count);
+        qd->qd_bh_count--;
+        if (qd->qd_bh_count == 0) {
+                brelse(qd->qd_bh);
+                qd->qd_bh = NULL;
+                qd->qd_bh_qc = NULL;
+        }
+        mutex_unlock(lock);
+}
+/*
+ * qd_fish - pick one quota data structure that needs syncing and lock it
+ * @sdp: the filesystem
+ * @qdp: set to the chosen entry, or NULL if there is none
+ *
+ * Chooses the first list entry that is not QDF_LOCKED, has QDF_CHANGE set and
+ * whose qd_sync_gen is older than sd_quota_sync_gen.  The entry is marked
+ * QDF_LOCKED, moved to the list tail, given an extra qd_count and
+ * qd_slot_count reference, and its pending change is snapshotted into
+ * qd_change_sync.  A buffer reference is then taken with bh_get().  Does
+ * nothing on a read-only mount.
+ *
+ * Returns: 0 on success (with *@qdp possibly NULL), or the bh_get() error
+ */
+static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
+{
+        struct gfs2_quota_data *qd = NULL;
+        int error;
+        int found = 0;
+        *qdp = NULL;
+        if (sdp->sd_vfs->s_flags & MS_RDONLY)
+                return 0;
+        spin_lock(&sdp->sd_quota_spin);
+        list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+                if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+                    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+                    qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
+                        continue;
+                list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+                set_bit(QDF_LOCKED, &qd->qd_flags);
+                gfs2_assert_warn(sdp, qd->qd_count);
+                qd->qd_count++;
+                qd->qd_change_sync = qd->qd_change;
+                gfs2_assert_warn(sdp, qd->qd_slot_count);
+                qd->qd_slot_count++;
+                found = 1;
+                break;
+        }
+        /* After a full traversal qd is not a valid entry pointer. */
+        if (!found)
+                qd = NULL;
+        spin_unlock(&sdp->sd_quota_spin);
+        if (qd) {
+                gfs2_assert_warn(sdp, qd->qd_change_sync);
+                error = bh_get(qd);
+                if (error) {
+                        /* Undo the lock bit and both references taken above. */
+                        clear_bit(QDF_LOCKED, &qd->qd_flags);
+                        slot_put(qd);
+                        qd_put(qd);
+                        return error;
+                }
+        }
+        *qdp = qd;
+        return 0;
+}
+/*
+ * qd_trylock - try to lock a specific quota data structure for syncing
+ * @qd: the quota data
+ *
+ * Succeeds only if the mount is not read-only, @qd is not already QDF_LOCKED
+ * and it has QDF_CHANGE set.  On success @qd is marked QDF_LOCKED, moved to
+ * the list tail, holds extra qd_count / qd_slot_count / buffer references,
+ * and qd_change_sync holds a snapshot of the pending change.  Release with
+ * qd_unlock().
+ *
+ * Returns: 1 if the lock was taken, 0 otherwise (a bh_get() failure is also
+ * reported as 0)
+ */
+static int qd_trylock(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        if (sdp->sd_vfs->s_flags & MS_RDONLY)
+                return 0;
+        spin_lock(&sdp->sd_quota_spin);
+        if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+            !test_bit(QDF_CHANGE, &qd->qd_flags)) {
+                spin_unlock(&sdp->sd_quota_spin);
+                return 0;
+        }
+        list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+        set_bit(QDF_LOCKED, &qd->qd_flags);
+        gfs2_assert_warn(sdp, qd->qd_count);
+        qd->qd_count++;
+        qd->qd_change_sync = qd->qd_change;
+        gfs2_assert_warn(sdp, qd->qd_slot_count);
+        qd->qd_slot_count++;
+        spin_unlock(&sdp->sd_quota_spin);
+        gfs2_assert_warn(sdp, qd->qd_change_sync);
+        if (bh_get(qd)) {
+                /* Undo the lock bit and both references taken above. */
+                clear_bit(QDF_LOCKED, &qd->qd_flags);
+                slot_put(qd);
+                qd_put(qd);
+                return 0;
+        }
+        return 1;
+}
+/*
+ * qd_unlock - release a quota data structure locked for syncing
+ * @qd: the quota data; expected to have QDF_LOCKED set (warns otherwise)
+ *
+ * Clears QDF_LOCKED and drops the buffer, slot and structure references.
+ */
+static void qd_unlock(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        gfs2_assert_warn(sdp, test_bit(QDF_LOCKED, &qd->qd_flags));
+        clear_bit(QDF_LOCKED, &qd->qd_flags);
+        bh_put(qd);
+        slot_put(qd);
+        qd_put(qd);
+}
+/*
+ * qdsb_get - get a quota data structure together with its slot and buffer
+ * @sdp: the filesystem
+ * @user: non-zero for a user ID, zero for a group ID
+ * @id: the ID
+ * @create: passed through to qd_get()
+ * @qdp: set by qd_get() to the quota data
+ *
+ * Takes the three references in order (structure, slot, buffer) and unwinds
+ * the ones already taken if a later step fails.
+ *
+ * Returns: 0 on success, or the error from the failing step
+ */
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+                    struct gfs2_quota_data **qdp)
+{
+        int ret;
+        ret = qd_get(sdp, user, id, create, qdp);
+        if (ret)
+                return ret;
+        ret = slot_get(*qdp);
+        if (!ret) {
+                ret = bh_get(*qdp);
+                if (!ret)
+                        return 0;
+                slot_put(*qdp);
+        }
+        qd_put(*qdp);
+        return ret;
+}
+/*
+ * qdsb_put - drop the buffer, slot and structure references on @qd
+ * @qd: the quota data
+ *
+ * Releases in the reverse of the order qdsb_get() acquires them.
+ */
+static void qdsb_put(struct gfs2_quota_data *qd)
+{
+        bh_put(qd);
+        slot_put(qd);
+        qd_put(qd);
+}
+/*
+ * gfs2_quota_hold - take references on the quota data an inode operation needs
+ * @ip: the inode
+ * @uid: a user ID the operation may change to, or NO_QUOTA_CHANGE
+ * @gid: a group ID the operation may change to, or NO_QUOTA_CHANGE
+ *
+ * Fills ip->i_alloc.al_qd[] with the entries for the inode's current owner
+ * and group, plus @uid and/or @gid when they are given and differ from the
+ * current ones (so two to four entries).  Does nothing when quotas are off.
+ * On failure every entry already taken is released.
+ *
+ * Returns: 0 on success, -EIO if entries are already held or locked, or the
+ * error from qdsb_get()
+ */
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_quota_data **qd = al->al_qd;
+        int error;
+        if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
+            gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
+                return -EIO;
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return 0;
+        /* al_qd_num only counts entries that were successfully taken. */
+        error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
+        if (error)
+                goto out;
+        al->al_qd_num++;
+        qd++;
+        error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
+        if (error)
+                goto out;
+        al->al_qd_num++;
+        qd++;
+        if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
+                error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+                if (error)
+                        goto out;
+                al->al_qd_num++;
+                qd++;
+        }
+        if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
+                error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+                if (error)
+                        goto out;
+                al->al_qd_num++;
+                qd++;
+        }
+out:
+        if (error)
+                gfs2_quota_unhold(ip);
+        return error;
+}
+/*
+ * gfs2_quota_unhold - release the quota data held for an inode
+ * @ip: the inode
+ *
+ * Drops every entry recorded in ip->i_alloc.al_qd[] and resets the count.
+ * Warns if the entries are still marked as locked (GIF_QD_LOCKED).
+ */
+void gfs2_quota_unhold(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        unsigned int i = 0;
+        gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
+        while (i < al->al_qd_num) {
+                qdsb_put(al->al_qd[i]);
+                al->al_qd[i] = NULL;
+                i++;
+        }
+        al->al_qd_num = 0;
+}
+/*
+ * sort_qd - comparison callback for sort() over an array of quota data pointers
+ * @a: pointer to the first array element
+ * @b: pointer to the second array element
+ *
+ * User entries order before group entries; within a kind, lower IDs first.
+ *
+ * Returns: -1, 0 or 1
+ */
+static int sort_qd(const void *a, const void *b)
+{
+        const struct gfs2_quota_data *lhs = *(const struct gfs2_quota_data **)a;
+        const struct gfs2_quota_data *rhs = *(const struct gfs2_quota_data **)b;
+        int lhs_user = !!test_bit(QDF_USER, &lhs->qd_flags);
+        int rhs_user = !!test_bit(QDF_USER, &rhs->qd_flags);
+        if (lhs_user != rhs_user)
+                return lhs_user ? -1 : 1;
+        if (lhs->qd_id == rhs->qd_id)
+                return 0;
+        return (lhs->qd_id < rhs->qd_id) ? -1 : 1;
+}
+/*
+ * do_qc - apply a change to an ID's entry in the local quota-change file
+ * @qd: the quota data; its buffer (qd_bh / qd_bh_qc) must be held
+ * @change: signed amount to add to the pending change
+ *
+ * Updates the on-disk entry within the current transaction and mirrors the
+ * new total into qd->qd_change.  The QDF_CHANGE flag tracks whether the entry
+ * is in use: while set, it pins one structure and one slot reference, which
+ * are taken when the total first becomes non-zero and dropped when it
+ * returns to zero.
+ */
+static void do_qc(struct gfs2_quota_data *qd, s64 change)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+        struct gfs2_quota_change *qc = qd->qd_bh_qc;
+        s64 x;
+        mutex_lock(&sdp->sd_quota_mutex);
+        gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+        /* Entry not in use yet: initialise it for this ID. */
+        if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
+                qc->qc_change = 0;
+                qc->qc_flags = 0;
+                if (test_bit(QDF_USER, &qd->qd_flags))
+                        qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
+                qc->qc_id = cpu_to_be32(qd->qd_id);
+        }
+        /* The on-disk field is big-endian; do the arithmetic in CPU order. */
+        x = qc->qc_change;
+        x = be64_to_cpu(x) + change;
+        qc->qc_change = cpu_to_be64(x);
+        spin_lock(&sdp->sd_quota_spin);
+        qd->qd_change = x;
+        spin_unlock(&sdp->sd_quota_spin);
+        if (!x) {
+                /* Nothing pending any more: free the entry and its references. */
+                gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
+                clear_bit(QDF_CHANGE, &qd->qd_flags);
+                qc->qc_flags = 0;
+                qc->qc_id = 0;
+                slot_put(qd);
+                qd_put(qd);
+        } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
+                /* First pending change: pin the structure and its slot. */
+                qd_hold(qd);
+                slot_hold(qd);
+        }
+        mutex_unlock(&sdp->sd_quota_mutex);
+}
+/**
+ * gfs2_adjust_quota - add a delta to a big-endian 64-bit value in an inode
+ * @ip: the inode whose page cache holds the value
+ * @loc: byte offset of the value within the inode
+ * @change: signed amount to add
+ * @qd: object whose qd_qb copy is refreshed (magic and new value)
+ *
+ * Locates the page and the buffer within it that contain @loc, maps and
+ * reads the buffer if necessary, adds it to the current transaction and
+ * rewrites the value in place.
+ *
+ * This function was mostly borrowed from gfs2_block_truncate_page which was
+ * in turn mostly borrowed from ext3
+ *
+ * Returns: 0 on success, -ENOMEM if the page cannot be obtained, -EIO if the
+ * buffer cannot be mapped or read
+ */
+static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
+                             s64 change, struct gfs2_quota_data *qd)
+{
+        struct inode *inode = &ip->i_inode;
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long index = loc >> PAGE_CACHE_SHIFT;
+        /*
+         * Offset within the page: mask with the page size, not the shift
+         * count (masking with PAGE_CACHE_SHIFT - 1 yields a bogus offset).
+         */
+        unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
+        unsigned blocksize, iblock, pos;
+        struct buffer_head *bh;
+        struct page *page;
+        void *kaddr;
+        __be64 *ptr;
+        s64 value;
+        int err = -EIO;
+        page = grab_cache_page(mapping, index);
+        if (!page)
+                return -ENOMEM;
+        blocksize = inode->i_sb->s_blocksize;
+        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, blocksize, 0);
+        /* Walk the page's buffers to the one that covers the offset. */
+        bh = page_buffers(page);
+        pos = blocksize;
+        while (offset >= pos) {
+                bh = bh->b_this_page;
+                iblock++;
+                pos += blocksize;
+        }
+        if (!buffer_mapped(bh)) {
+                gfs2_get_block(inode, iblock, bh, 1);
+                if (!buffer_mapped(bh))
+                        goto unlock;
+        }
+        if (PageUptodate(page))
+                set_buffer_uptodate(bh);
+        if (!buffer_uptodate(bh)) {
+                ll_rw_block(READ_META, 1, &bh);
+                wait_on_buffer(bh);
+                if (!buffer_uptodate(bh))
+                        goto unlock;
+        }
+        gfs2_trans_add_bh(ip->i_gl, bh, 0);
+        kaddr = kmap_atomic(page, KM_USER0);
+        ptr = kaddr + offset;
+        value = (s64)be64_to_cpu(*ptr) + change;
+        *ptr = cpu_to_be64(value);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr, KM_USER0);
+        err = 0;
+        qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
+        qd->qd_qb.qb_value = cpu_to_be64(value);
+unlock:
+        unlock_page(page);
+        page_cache_release(page);
+        return err;
+}
+/*
+ * do_sync - write pending local quota changes into the quota inode
+ * @num_qd: number of entries in @qda (must be at least one)
+ * @qda: quota data to sync; each must have qd_change_sync set and its buffer
+ *       held.  The array is sorted in place.
+ *
+ * For each entry, qd_change_sync is added at the entry's offset in
+ * sd_quota_inode and then subtracted from the local quota-change entry via
+ * do_qc(), all inside one transaction.  Exclusive glocks are taken on every
+ * entry (in sort_qd order) and then on the quota inode.  Blocks are reserved
+ * first if any of the writes needs allocation.  The log is flushed before
+ * returning, on both success and failure.
+ *
+ * Returns: 0 on success, -ENOMEM, or the error from the failing step
+ */
+static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
+{
+        struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+        unsigned int data_blocks, ind_blocks;
+        struct gfs2_holder *ghs, i_gh;
+        unsigned int qx, x;
+        struct gfs2_quota_data *qd;
+        loff_t offset;
+        unsigned int nalloc = 0;
+        struct gfs2_alloc *al = NULL;
+        int error;
+        gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+                              &data_blocks, &ind_blocks);
+        ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+        if (!ghs)
+                return -ENOMEM;
+        sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+        /* qx counts the holders acquired so far; the out: path relies on it. */
+        for (qx = 0; qx < num_qd; qx++) {
+                error = gfs2_glock_nq_init(qda[qx]->qd_gl,
+                                           LM_ST_EXCLUSIVE,
+                                           GL_NOCACHE, &ghs[qx]);
+                if (error)
+                        goto out;
+        }
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                goto out;
+        /* Count how many of the writes will need new blocks. */
+        for (x = 0; x < num_qd; x++) {
+                int alloc_required;
+                offset = qd2offset(qda[x]);
+                error = gfs2_write_alloc_required(ip, offset,
+                                                  sizeof(struct gfs2_quota),
+                                                  &alloc_required);
+                if (error)
+                        goto out_gunlock;
+                if (alloc_required)
+                        nalloc++;
+        }
+        if (nalloc) {
+                al = gfs2_alloc_get(ip);
+                al->al_requested = nalloc * (data_blocks + ind_blocks);
+                error = gfs2_inplace_reserve(ip);
+                if (error)
+                        goto out_alloc;
+                error = gfs2_trans_begin(sdp,
+                                         al->al_rgd->rd_ri.ri_length +
+                                         num_qd * data_blocks +
+                                         nalloc * ind_blocks +
+                                         RES_DINODE + num_qd +
+                                         RES_STATFS, 0);
+                if (error)
+                        goto out_ipres;
+        } else {
+                error = gfs2_trans_begin(sdp,
+                                         num_qd * data_blocks +
+                                         RES_DINODE + num_qd, 0);
+                if (error)
+                        goto out_gunlock;
+        }
+        for (x = 0; x < num_qd; x++) {
+                qd = qda[x];
+                offset = qd2offset(qd);
+                /*
+                 * NOTE(review): the last argument is the glock's LVB cast to
+                 * struct gfs2_quota_data *, and gfs2_adjust_quota() writes
+                 * ->qd_qb.qb_magic / ->qd_qb.qb_value through it.  Confirm
+                 * those writes land on the intended LVB fields and stay
+                 * inside the LVB.
+                 */
+                error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
+                                          (struct gfs2_quota_data *)
+                                          qd->qd_gl->gl_lvb);
+                if (error)
+                        goto out_end_trans;
+                do_qc(qd, -qd->qd_change_sync);
+        }
+        error = 0;
+out_end_trans:
+        gfs2_trans_end(sdp);
+out_ipres:
+        if (nalloc)
+                gfs2_inplace_release(ip);
+out_alloc:
+        if (nalloc)
+                gfs2_alloc_put(ip);
+out_gunlock:
+        gfs2_glock_dq_uninit(&i_gh);
+out:
+        /* Release only the holders that were actually acquired. */
+        while (qx--)
+                gfs2_glock_dq_uninit(&ghs[qx]);
+        kfree(ghs);
+        gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+        return error;
+}
+/*
+ * do_glock - acquire an ID's quota glock and load its cached values
+ * @qd: the quota data
+ * @force_refresh: if non-zero, always re-read the values from the quota inode
+ * @q_gh: holder to use for the quota glock; held on successful return
+ *
+ * Takes the glock shared and copies the lock value block (LVB) into
+ * qd->qd_qb.  If the LVB does not carry GFS2_MAGIC, or a refresh is forced,
+ * the glock is retaken exclusively, the record is read from sd_quota_inode
+ * and the LVB is rewritten from it.  If the glock is then reported as
+ * blocking, it is dropped and the shared acquisition is retried.
+ *
+ * Returns: 0 with @q_gh held, or an error with nothing held
+ */
+static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
+                    struct gfs2_holder *q_gh)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+        struct gfs2_holder i_gh;
+        struct gfs2_quota q;
+        char buf[sizeof(struct gfs2_quota)];
+        struct file_ra_state ra_state;
+        int error;
+        struct gfs2_quota_lvb *qlvb;
+        file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
+restart:
+        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
+        if (error)
+                return error;
+        qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+        if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
+                loff_t pos;
+                /* Trade the shared hold for an exclusive one before writing the LVB. */
+                gfs2_glock_dq_uninit(q_gh);
+                error = gfs2_glock_nq_init(qd->qd_gl,
+                                          LM_ST_EXCLUSIVE, GL_NOCACHE,
+                                          q_gh);
+                if (error)
+                        return error;
+                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+                if (error)
+                        goto fail;
+                /* Zero first so a short read leaves the remainder as zeroes. */
+                memset(buf, 0, sizeof(struct gfs2_quota));
+                pos = qd2offset(qd);
+                error = gfs2_internal_read(ip, &ra_state, buf,
+                                           &pos, sizeof(struct gfs2_quota));
+                if (error < 0)
+                        goto fail_gunlock;
+                gfs2_glock_dq_uninit(&i_gh);
+                gfs2_quota_in(&q, buf);
+                qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+                qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+                qlvb->__pad = 0;
+                qlvb->qb_limit = cpu_to_be64(q.qu_limit);
+                qlvb->qb_warn = cpu_to_be64(q.qu_warn);
+                qlvb->qb_value = cpu_to_be64(q.qu_value);
+                qd->qd_qb = *qlvb;
+                /* Drop the exclusive hold and retry shared; no second forced refresh. */
+                if (gfs2_glock_is_blocking(qd->qd_gl)) {
+                        gfs2_glock_dq_uninit(q_gh);
+                        force_refresh = 0;
+                        goto restart;
+                }
+        }
+        return 0;
+fail_gunlock:
+        gfs2_glock_dq_uninit(&i_gh);
+fail:
+        gfs2_glock_dq_uninit(q_gh);
+        return error;
+}
+/*
+ * gfs2_quota_lock - hold and lock the quota data an inode operation needs
+ * @ip: the inode
+ * @uid: a user ID the operation may change to, or NO_QUOTA_CHANGE
+ * @gid: a group ID the operation may change to, or NO_QUOTA_CHANGE
+ *
+ * Holds the entries via gfs2_quota_hold().  Unless the caller has
+ * CAP_SYS_RESOURCE or quota enforcement is not switched on, it then takes
+ * each entry's glock in sort_qd order and sets GIF_QD_LOCKED.  On failure,
+ * the glocks already taken are dropped and the entries are released.
+ *
+ * Returns: 0 on success, or the error from gfs2_quota_hold() / do_glock()
+ */
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        unsigned int x;
+        int error = 0;
+        /*
+         * Propagate a failed hold instead of ignoring it.
+         * gfs2_quota_hold() has already released whatever it took.
+         */
+        error = gfs2_quota_hold(ip, uid, gid);
+        if (error)
+                return error;
+        if (capable(CAP_SYS_RESOURCE) ||
+            sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+                return 0;
+        sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+             sort_qd, NULL);
+        for (x = 0; x < al->al_qd_num; x++) {
+                error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
+                if (error)
+                        break;
+        }
+        if (!error)
+                set_bit(GIF_QD_LOCKED, &ip->i_flags);
+        else {
+                /* x is the index that failed; undo the ones before it. */
+                while (x--)
+                        gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+                gfs2_quota_unhold(ip);
+        }
+        return error;
+}
+/*
+ * need_sync - decide whether an ID's local change should be synced now
+ * @qd: the quota data, with qd_qb already loaded
+ *
+ * No sync is wanted when there is no limit, when the local change is
+ * negative, or when the cached usage is already at or over the limit.
+ * Otherwise the local change is scaled by (journal count * quota_scale
+ * numerator / denominator) and added to the cached usage; a sync is wanted
+ * when that projection reaches the limit.
+ *
+ * Returns: 1 if a sync is wanted, 0 otherwise
+ */
+static int need_sync(struct gfs2_quota_data *qd)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        struct gfs2_tune *gt = &sdp->sd_tune;
+        unsigned int scale_num, scale_den;
+        s64 projected;
+        if (!qd->qd_qb.qb_limit)
+                return 0;
+        spin_lock(&sdp->sd_quota_spin);
+        projected = qd->qd_change;
+        spin_unlock(&sdp->sd_quota_spin);
+        spin_lock(&gt->gt_spin);
+        scale_num = gt->gt_quota_scale_num;
+        scale_den = gt->gt_quota_scale_den;
+        spin_unlock(&gt->gt_spin);
+        if (projected < 0)
+                return 0;
+        if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+            (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+                return 0;
+        projected *= gfs2_jindex_size(sdp) * scale_num;
+        do_div(projected, scale_den);
+        projected += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+        if (projected < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+                return 0;
+        return 1;
+}
+/*
+ * gfs2_quota_unlock - unlock and release the quota data held for an inode
+ * @ip: the inode
+ *
+ * If the entries were locked (GIF_QD_LOCKED), each glock is dropped.  Entries
+ * for which need_sync() says so, and which can be sync-locked, are collected
+ * and written out with do_sync() (its result is not checked).  The held
+ * entries are released in every case.
+ */
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_quota_data *pending[4];
+        unsigned int npending = 0;
+        unsigned int i;
+        if (test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) {
+                for (i = 0; i < al->al_qd_num; i++) {
+                        struct gfs2_quota_data *qd = al->al_qd[i];
+                        /* Evaluate before the glock is dropped. */
+                        int wants_sync = need_sync(qd);
+                        gfs2_glock_dq_uninit(&al->al_qd_ghs[i]);
+                        if (!wants_sync)
+                                continue;
+                        if (qd_trylock(qd))
+                                pending[npending++] = qd;
+                }
+                if (npending) {
+                        do_sync(npending, pending);
+                        for (i = 0; i < npending; i++)
+                                qd_unlock(pending[i]);
+                }
+        }
+        gfs2_quota_unhold(ip);
+}
+#define MAX_LINE 256
+/*
+ * print_message - log a quota event for an ID
+ * @qd: the quota data the event is about
+ * @type: short description of the event, inserted into the message
+ *
+ * Returns: always 0
+ */
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+        const char *kind;
+        if (test_bit(QDF_USER, &qd->qd_flags))
+                kind = "user";
+        else
+                kind = "group";
+        printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+               sdp->sd_fsname, type, kind, qd->qd_id);
+        return 0;
+}
+/*
+ * gfs2_quota_check - check the held quota entries against their limits
+ * @ip: the inode whose quota data is held and locked
+ * @uid: user ID to check
+ * @gid: group ID to check
+ *
+ * Only entries matching @uid (user) or @gid (group) are examined.  Usage is
+ * the cached value plus the local pending change.  Exceeding a non-zero
+ * limit logs a message and fails; exceeding a non-zero warning level logs a
+ * message at most once per gt_quota_warn_period seconds.  Nothing is checked
+ * unless the entries are locked and quota enforcement is on.
+ *
+ * Returns: 0 if within limits (or not checked), -EDQUOT if a limit is exceeded
+ */
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_quota_data *qd;
+        s64 value;
+        unsigned int x;
+        int error = 0;
+        if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+                return 0;
+        if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+                return 0;
+        for (x = 0; x < al->al_qd_num; x++) {
+                qd = al->al_qd[x];
+                if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+                      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+                        continue;
+                value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+                spin_lock(&sdp->sd_quota_spin);
+                value += qd->qd_change;
+                spin_unlock(&sdp->sd_quota_spin);
+                if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+                        print_message(qd, "exceeded");
+                        error = -EDQUOT;
+                        break;
+                } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
+                           (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
+                           time_after_eq(jiffies, qd->qd_last_warn +
+                                         gfs2_tune_get(sdp,
+                                                gt_quota_warn_period) * HZ)) {
+                        /* print_message() returns 0, so a warning is not an error. */
+                        error = print_message(qd, "warning");
+                        qd->qd_last_warn = jiffies;
+                }
+        }
+        return error;
+}
+/*
+ * gfs2_quota_change - record a usage change against the held quota entries
+ * @ip: the inode whose quota data is held
+ * @change: signed amount to record; must be non-zero (warns and returns if 0)
+ * @uid: user ID to charge
+ * @gid: group ID to charge
+ *
+ * Applies @change via do_qc() to every held entry matching @uid (user) or
+ * @gid (group).  Inodes flagged GFS2_DIF_SYSTEM are not accounted.
+ */
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+                       u32 uid, u32 gid)
+{
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_quota_data *qd;
+        unsigned int x;
+        if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+                return;
+        if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+                return;
+        for (x = 0; x < al->al_qd_num; x++) {
+                qd = al->al_qd[x];
+                if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+                    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+                        do_qc(qd, change);
+                }
+        }
+}
+/*
+ * gfs2_quota_sync - sync all pending local quota changes to the quota inode
+ * @sdp: the filesystem
+ *
+ * Bumps sd_quota_sync_gen, then repeatedly collects up to
+ * gt_quota_simul_sync entries with qd_fish() and writes each batch with
+ * do_sync().  Entries in a successfully synced batch are stamped with the
+ * new generation so qd_fish() does not pick them again.  Stops when a batch
+ * comes back short or an error occurs.
+ *
+ * Returns: 0 on success, -ENOMEM, or the error from qd_fish() / do_sync()
+ */
+int gfs2_quota_sync(struct gfs2_sbd *sdp)
+{
+        struct gfs2_quota_data **qda;
+        unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+        unsigned int num_qd;
+        unsigned int x;
+        int error = 0;
+        sdp->sd_quota_sync_gen++;
+        qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
+        if (!qda)
+                return -ENOMEM;
+        do {
+                num_qd = 0;
+                for (;;) {
+                        error = qd_fish(sdp, qda + num_qd);
+                        if (error || !qda[num_qd])
+                                break;
+                        if (++num_qd == max_qd)
+                                break;
+                }
+                if (num_qd) {
+                        if (!error)
+                                error = do_sync(num_qd, qda);
+                        if (!error)
+                                for (x = 0; x < num_qd; x++)
+                                        qda[x]->qd_sync_gen =
+                                                sdp->sd_quota_sync_gen;
+                        /* Entries fished before an error still need unlocking. */
+                        for (x = 0; x < num_qd; x++)
+                                qd_unlock(qda[x]);
+                }
+        } while (!error && num_qd == max_qd);
+        kfree(qda);
+        return error;
+}
+/*
+ * gfs2_quota_refresh - force a re-read of an ID's quota values
+ * @sdp: the filesystem
+ * @user: non-zero for a user ID, zero for a group ID
+ * @id: the ID
+ *
+ * Gets (creating if needed) the quota data for the ID and runs do_glock()
+ * with a forced refresh, then drops the glock and the reference again.
+ *
+ * Returns: 0 on success, or the error from qd_get() / do_glock()
+ */
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+{
+        struct gfs2_holder q_gh;
+        struct gfs2_quota_data *qd;
+        int ret;
+        ret = qd_get(sdp, user, id, CREATE, &qd);
+        if (ret)
+                return ret;
+        ret = do_glock(qd, FORCE, &q_gh);
+        if (ret == 0)
+                gfs2_glock_dq_uninit(&q_gh);
+        qd_put(qd);
+        return ret;
+}
+/**
+ * gfs2_quota_init - build the in-core quota state from the quota change file
+ * @sdp: the filesystem
+ *
+ * Validates the size of the quota change inode (non-zero, at most 64MB and
+ * a multiple of the block size), allocates the slot bitmap in PAGE_SIZE
+ * chunks and walks every block of the file.  Each record with a non-zero
+ * change gets a gfs2_quota_data entry that is flagged QDF_CHANGE, bound to
+ * its slot and added to sd_quota_list.
+ *
+ * Failures after the bitmap array has been allocated are unwound through
+ * gfs2_quota_cleanup().
+ *
+ * Returns: 0 on success, -EIO, -ENOMEM, or the error from
+ *          gfs2_extent_map()/qd_alloc()
+ */
+int gfs2_quota_init(struct gfs2_sbd *sdp)
+{
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+        unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+        unsigned int x, slot = 0;
+        unsigned int found = 0;
+        u64 dblock;
+        u32 extlen = 0;
+        int error;
+        /* Reject an empty, oversized (> 64MB) or non block-aligned file */
+        if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
+            ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
+        /* Each chunk is one page of bitmap, i.e. 8 * PAGE_SIZE slots */
+        sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
+        error = -ENOMEM;
+        sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
+                                       sizeof(unsigned char *), GFP_KERNEL);
+        if (!sdp->sd_quota_bitmap)
+                return error;
+        for (x = 0; x < sdp->sd_quota_chunks; x++) {
+                sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+                if (!sdp->sd_quota_bitmap[x])
+                        goto fail;
+        }
+        for (x = 0; x < blocks; x++) {
+                struct buffer_head *bh;
+                unsigned int y;
+                /* Map a new extent only once the previous one is used up */
+                if (!extlen) {
+                        int new = 0;
+                        error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
+                        if (error)
+                                goto fail;
+                }
+                error = -EIO;
+                bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+                if (!bh)
+                        goto fail;
+                if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
+                        brelse(bh);
+                        goto fail;
+                }
+                /* Records follow the meta header; slot numbers run across blocks */
+                for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
+                     y++, slot++) {
+                        struct gfs2_quota_change qc;
+                        struct gfs2_quota_data *qd;
+                        gfs2_quota_change_in(&qc, bh->b_data +
+                                          sizeof(struct gfs2_meta_header) +
+                                          y * sizeof(struct gfs2_quota_change));
+                        /* A zero change means there is nothing recorded in this slot */
+                        if (!qc.qc_change)
+                                continue;
+                        error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
+                                         qc.qc_id, &qd);
+                        if (error) {
+                                brelse(bh);
+                                goto fail;
+                        }
+                        set_bit(QDF_CHANGE, &qd->qd_flags);
+                        qd->qd_change = qc.qc_change;
+                        qd->qd_slot = slot;
+                        qd->qd_slot_count = 1;
+                        qd->qd_last_touched = jiffies;
+                        /* Slot bitmap, list and count are all updated under sd_quota_spin */
+                        spin_lock(&sdp->sd_quota_spin);
+                        gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
+                        list_add(&qd->qd_list, &sdp->sd_quota_list);
+                        atomic_inc(&sdp->sd_quota_count);
+                        spin_unlock(&sdp->sd_quota_spin);
+                        found++;
+                }
+                brelse(bh);
+                dblock++;
+                extlen--;
+        }
+        if (found)
+                fs_info(sdp, "found %u quota changes\n", found);
+        return 0;
+fail:
+        gfs2_quota_cleanup(sdp);
+        return error;
+}
+/**
+ * gfs2_quota_scan - throw away unused quota entries that have aged out
+ * @sdp: the filesystem
+ *
+ * Entries with a zero qd_count that have not been touched for
+ * gt_quota_cache_secs seconds are unhooked from sd_quota_list under
+ * sd_quota_spin and then freed with the lock dropped.
+ */
+void gfs2_quota_scan(struct gfs2_sbd *sdp)
+{
+        struct gfs2_quota_data *qd, *next;
+        LIST_HEAD(expired);
+        spin_lock(&sdp->sd_quota_spin);
+        list_for_each_entry_safe(qd, next, &sdp->sd_quota_list, qd_list) {
+                /* Still referenced: leave it alone */
+                if (qd->qd_count)
+                        continue;
+                /* Not old enough yet */
+                if (!time_after_eq(jiffies, qd->qd_last_touched +
+                                gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ))
+                        continue;
+                list_move(&qd->qd_list, &expired);
+                gfs2_assert_warn(sdp,
+                                 atomic_read(&sdp->sd_quota_count) > 0);
+                atomic_dec(&sdp->sd_quota_count);
+        }
+        spin_unlock(&sdp->sd_quota_spin);
+        /* The expired list is private to this call, so no locking is needed here */
+        list_for_each_entry_safe(qd, next, &expired, qd_list) {
+                list_del(&qd->qd_list);
+                gfs2_assert_warn(sdp, !qd->qd_change);
+                gfs2_assert_warn(sdp, !qd->qd_slot_count);
+                gfs2_assert_warn(sdp, !qd->qd_bh_count);
+                gfs2_lvb_unhold(qd->qd_gl);
+                kfree(qd);
+        }
+}
+/**
+ * gfs2_quota_cleanup - free every in-core quota entry and the slot bitmap
+ * @sdp: the filesystem
+ *
+ * Drains sd_quota_list from the tail.  An entry that is still in use
+ * (qd_count > 1, or qd_count != 0 without QDF_CHANGE set) is rotated to
+ * the front of the list and retried after a schedule(); everything else is
+ * unhooked and freed with sd_quota_spin dropped.  Finally the slot bitmap
+ * is released and its pointer cleared, so calling this function again is
+ * harmless.
+ */
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
+{
+        struct list_head *head = &sdp->sd_quota_list;
+        struct gfs2_quota_data *qd;
+        unsigned int x;
+        spin_lock(&sdp->sd_quota_spin);
+        while (!list_empty(head)) {
+                qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+                if (qd->qd_count > 1 ||
+                    (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+                        /* Busy: put it at the front and give its user a chance to run */
+                        list_move(&qd->qd_list, head);
+                        spin_unlock(&sdp->sd_quota_spin);
+                        schedule();
+                        spin_lock(&sdp->sd_quota_spin);
+                        continue;
+                }
+                list_del(&qd->qd_list);
+                atomic_dec(&sdp->sd_quota_count);
+                spin_unlock(&sdp->sd_quota_spin);
+                if (!qd->qd_count) {
+                        gfs2_assert_warn(sdp, !qd->qd_change);
+                        gfs2_assert_warn(sdp, !qd->qd_slot_count);
+                } else
+                        gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
+                gfs2_assert_warn(sdp, !qd->qd_bh_count);
+                gfs2_lvb_unhold(qd->qd_gl);
+                kfree(qd);
+                spin_lock(&sdp->sd_quota_spin);
+        }
+        spin_unlock(&sdp->sd_quota_spin);
+        gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
+        if (sdp->sd_quota_bitmap) {
+                for (x = 0; x < sdp->sd_quota_chunks; x++)
+                        kfree(sdp->sd_quota_bitmap[x]);
+                kfree(sdp->sd_quota_bitmap);
+                /* Do not leave a dangling pointer behind: a repeated cleanup must not free twice */
+                sdp->sd_quota_bitmap = NULL;
+        }
+}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __QUOTA_DOT_H__
+#define __QUOTA_DOT_H__
+struct gfs2_inode;
+struct gfs2_sbd;
+#define NO_QUOTA_CHANGE ((u32)-1)
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+                       u32 uid, u32 gid);
+int gfs2_quota_sync(struct gfs2_sbd *sdp);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_scan(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..0a8a4b87dcc6
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+#include "dir.h"
+/**
+ * gfs2_replay_read_block - read one block of a journal
+ * @jd: the journal
+ * @blk: the block number within the journal
+ * @bh: on success, the buffer holding the block; the caller must brelse() it
+ *
+ * Maps @blk of the journal inode to a disk block and reads it through
+ * gfs2_meta_ra().  An unmapped block marks the inode inconsistent.
+ *
+ * Returns: 0 on success, -EIO if the block is unmapped or no buffer could
+ *          be obtained, or the error from gfs2_extent_map()
+ */
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+                           struct buffer_head **bh)
+{
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        struct gfs2_glock *gl = ip->i_gl;
+        int new = 0;
+        u64 dblock;
+        u32 extlen;
+        int error;
+        error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
+        if (error)
+                return error;
+        if (!dblock) {
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        *bh = gfs2_meta_ra(gl, dblock, extlen);
+        /* Callers dereference the buffer right away, so never report success without one */
+        if (!*bh)
+                return -EIO;
+        return 0;
+}
+/**
+ * gfs2_revoke_add - record a revoke for a block during replay
+ * @sdp: the filesystem
+ * @blkno: the revoked block
+ * @where: the journal position of the revoke
+ *
+ * If @blkno already has an entry on sd_revoke_list its position is
+ * updated, otherwise a new entry is allocated and added.
+ *
+ * Returns: 0 if an existing entry was updated, 1 if a new entry was added,
+ *          -ENOMEM on allocation failure
+ */
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+        struct list_head *revokes = &sdp->sd_revoke_list;
+        struct gfs2_revoke_replay *entry;
+        list_for_each_entry(entry, revokes, rr_list) {
+                if (entry->rr_blkno != blkno)
+                        continue;
+                /* Already known: just remember the latest position */
+                entry->rr_where = where;
+                return 0;
+        }
+        entry = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+        if (entry == NULL)
+                return -ENOMEM;
+        entry->rr_blkno = blkno;
+        entry->rr_where = where;
+        list_add(&entry->rr_list, revokes);
+        return 1;
+}
+/**
+ * gfs2_revoke_check - decide whether a journaled block has been revoked
+ * @sdp: the filesystem
+ * @blkno: the block in question
+ * @where: the journal position at which the block was found
+ *
+ * A block counts as revoked when @where lies strictly between
+ * sd_replay_tail and the position of its revoke, taking into account that
+ * the revoke may sit before the tail because the journal wrapped.
+ *
+ * Returns: 1 if the block is revoked, 0 if it is not or has no revoke entry
+ */
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+        struct gfs2_revoke_replay *entry;
+        int after_tail, before_revoke;
+        list_for_each_entry(entry, &sdp->sd_revoke_list, rr_list) {
+                if (entry->rr_blkno != blkno)
+                        continue;
+                after_tail = (sdp->sd_replay_tail < where);
+                before_revoke = (where < entry->rr_where);
+                /* Revoke is before the tail: the active region wraps around the end */
+                if (entry->rr_where < sdp->sd_replay_tail)
+                        return after_tail || before_revoke;
+                return after_tail && before_revoke;
+        }
+        return 0;
+}
+/**
+ * gfs2_revoke_clean - free every entry on the revoke replay list
+ * @sdp: the filesystem
+ */
+void gfs2_revoke_clean(struct gfs2_sbd *sdp)
+{
+        struct gfs2_revoke_replay *entry, *next;
+        list_for_each_entry_safe(entry, next, &sdp->sd_revoke_list, rr_list) {
+                list_del(&entry->rr_list);
+                kfree(entry);
+        }
+}
+/**
+ * get_log_header - read the log header for a given segment
+ * @jd: the journal
+ * @blk: the block to look at
+ * @head: the log header to return; only written when 0 is returned
+ *
+ * Read the log header for a given segment in a given journal.  Do a few
+ * sanity checks on it: magic number, metadata type, that the header
+ * records @blk as its own block number, and that its stored hash matches
+ * the hash recomputed here.
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+                          struct gfs2_log_header *head)
+{
+        struct buffer_head *bh;
+        struct gfs2_log_header lh;
+        u32 hash;
+        int error;
+        error = gfs2_replay_read_block(jd, blk, &bh);
+        if (error)
+                return error;
+        /* Hash the raw on-disk bytes with the hash field itself zeroed */
+        memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
+        lh.lh_hash = 0;
+        hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
+        /* Now decode the header proper (presumably into CPU byte order) */
+        gfs2_log_header_in(&lh, bh->b_data);
+        brelse(bh);
+        if (lh.lh_header.mh_magic != GFS2_MAGIC ||
+            lh.lh_header.mh_type != GFS2_METATYPE_LH ||
+            lh.lh_blkno != blk || lh.lh_hash != hash)
+                return 1;
+        *head = lh;
+        return 0;
+}
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: in: the block to start searching from;
+ *       out: the block at which the search stopped
+ * @head: the log header to fill in
+ *
+ * Call get_log_header() on *@blk and, for as long as the header there is
+ * bad, step forward one block at a time (wrapping at the end of the
+ * journal).  Coming back round to the starting block without finding a
+ * good header marks the journal inode inconsistent.
+ *
+ * Returns: 0 on success, -EIO if no block holds a valid header, or the
+ *          error from get_log_header()
+ */
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+                        struct gfs2_log_header *head)
+{
+        const unsigned int first_blk = *blk;
+        int ret;
+        /* A positive return means "bad header here": try the next block */
+        while ((ret = get_log_header(jd, *blk, head)) > 0) {
+                *blk += 1;
+                if (*blk == jd->jd_blocks)
+                        *blk = 0;
+                if (*blk == first_blk) {
+                        gfs2_consist_inode(GFS2_I(jd->jd_inode));
+                        return -EIO;
+                }
+        }
+        return ret;
+}
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: in: a valid log header at, or just before, the head of the log;
+ *        out: the log header of the head
+ *
+ * Scan forward from @head, skipping blocks without a valid log header and
+ * adopting every header with a higher sequence number, until a header with
+ * a lower sequence number shows up.  Meeting the same sequence number again
+ * marks the journal inode inconsistent.
+ *
+ * Returns: errno
+ */
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+        unsigned int blk = head->lh_blkno;
+        struct gfs2_log_header lh;
+        int error;
+        for (;;) {
+                if (++blk == jd->jd_blocks)
+                        blk = 0;
+                error = get_log_header(jd, blk, &lh);
+                if (error < 0)
+                        return error;
+                /* Not a valid log header: keep looking */
+                if (error == 1)
+                        continue;
+                if (lh.lh_sequence == head->lh_sequence) {
+                        gfs2_consist_inode(GFS2_I(jd->jd_inode));
+                        return -EIO;
+                }
+                /* The sequence went backwards, so the previous header was the head */
+                if (lh.lh_sequence < head->lh_sequence)
+                        break;
+                *head = lh;
+        }
+        return 0;
+}
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number.  (i.e. the log head)
+ *
+ * The binary search only narrows the head down; jhead_scan() then walks
+ * forward from the result to settle on the exact head.
+ *
+ * Returns: errno
+ */
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+        struct gfs2_log_header lh_1, lh_m;
+        u32 blk_1, blk_2, blk_m;
+        int error;
+        blk_1 = 0;
+        blk_2 = jd->jd_blocks - 1;
+        for (;;) {
+                blk_m = (blk_1 + blk_2) / 2;
+                /* find_good_lh() may move blk_1/blk_m forward past bad headers */
+                error = find_good_lh(jd, &blk_1, &lh_1);
+                if (error)
+                        return error;
+                error = find_good_lh(jd, &blk_m, &lh_m);
+                if (error)
+                        return error;
+                if (blk_1 == blk_m || blk_m == blk_2)
+                        break;
+                /* Sequence still rising up to the midpoint: the head is in the upper half */
+                if (lh_1.lh_sequence <= lh_m.lh_sequence)
+                        blk_1 = blk_m;
+                else
+                        blk_2 = blk_m;
+        }
+        error = jhead_scan(jd, &lh_1);
+        if (error)
+                return error;
+        *head = lh_1;
+        return error;
+}
+/**
+ * foreach_descriptor - go through the active part of the log
+ * @jd: the journal
+ * @start: the first log header in the active region
+ * @end: the last log header (don't process the contents of this entry)
+ * @pass: the replay pass number, handed on to lops_scan_elements()
+ *
+ * Call lops_scan_elements() once for every log descriptor in the active
+ * portion of the log.  Valid log headers met on the way are stepped over.
+ *
+ * The block index wraps at the size of @jd itself.  The shared helper
+ * gfs2_replay_incr_blk() wraps at sdp->sd_jdesc->jd_blocks instead, which
+ * is the size of this node's own journal and not necessarily that of the
+ * journal being replayed.
+ *
+ * Returns: errno
+ */
+static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
+                              unsigned int end, int pass)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct buffer_head *bh;
+        struct gfs2_log_descriptor *ld;
+        int error = 0;
+        u32 length;
+        __be64 *ptr;
+        unsigned int offset = sizeof(struct gfs2_log_descriptor);
+        /* Round the descriptor size up to the next __be64 boundary */
+        offset += sizeof(__be64) - 1;
+        offset &= ~(sizeof(__be64) - 1);
+        while (start != end) {
+                error = gfs2_replay_read_block(jd, start, &bh);
+                if (error)
+                        return error;
+                if (gfs2_meta_check(sdp, bh)) {
+                        brelse(bh);
+                        return -EIO;
+                }
+                ld = (struct gfs2_log_descriptor *)bh->b_data;
+                length = be32_to_cpu(ld->ld_length);
+                if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+                        struct gfs2_log_header lh;
+                        error = get_log_header(jd, start, &lh);
+                        if (!error) {
+                                /* A valid log header occupies one block: step over it */
+                                if (++start == jd->jd_blocks)
+                                        start = 0;
+                                brelse(bh);
+                                continue;
+                        }
+                        /* Typed as a log header but failing validation is corruption */
+                        if (error == 1) {
+                                gfs2_consist_inode(GFS2_I(jd->jd_inode));
+                                error = -EIO;
+                        }
+                        brelse(bh);
+                        return error;
+                } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
+                        brelse(bh);
+                        return -EIO;
+                }
+                ptr = (__be64 *)(bh->b_data + offset);
+                error = lops_scan_elements(jd, start, ld, ptr, pass);
+                if (error) {
+                        brelse(bh);
+                        return error;
+                }
+                /* Advance by the ld_length blocks recorded in the descriptor */
+                while (length--) {
+                        if (++start == jd->jd_blocks)
+                                start = 0;
+                }
+                brelse(bh);
+        }
+        return 0;
+}
+/**
+ * clean_journal - mark a dirty journal as being clean
+ * @jd: the journal
+ * @head: the current head of the journal
+ *
+ * Writes a fresh log header flagged GFS2_LOG_HEAD_UNMOUNT, with the next
+ * sequence number, into the block following @head (wrapping at the size of
+ * @jd) and syncs it to disk.  A failed write is reported through
+ * gfs2_io_error_bh() but does not change the return value.
+ *
+ * Returns: 0 on success, -EIO if the target block is not mapped, or the
+ *          error from gfs2_block_map()
+ */
+static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        unsigned int lblock;
+        struct gfs2_log_header *lh;
+        u32 hash;
+        struct buffer_head *bh;
+        int error;
+        /* Start from a zeroed map so an unmapped block reliably reads back as b_blocknr == 0 */
+        struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+        /* Next block after the head, wrapping at the size of the journal being cleaned */
+        lblock = head->lh_blkno;
+        if (++lblock == jd->jd_blocks)
+                lblock = 0;
+        error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map, 1);
+        if (error)
+                return error;
+        if (!bh_map.b_blocknr) {
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
+        lock_buffer(bh);
+        memset(bh->b_data, 0, bh->b_size);
+        set_buffer_uptodate(bh);
+        clear_buffer_dirty(bh);
+        unlock_buffer(bh);
+        lh = (struct gfs2_log_header *)bh->b_data;
+        memset(lh, 0, sizeof(struct gfs2_log_header));
+        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
+        lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
+        lh->lh_blkno = cpu_to_be32(lblock);
+        /* lh_hash is still zero here, matching how get_log_header() recomputes it */
+        hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
+        lh->lh_hash = cpu_to_be32(hash);
+        set_buffer_dirty(bh);
+        if (sync_dirty_buffer(bh))
+                gfs2_io_error_bh(sdp, bh);
+        brelse(bh);
+        return error;
+}
+/**
+ * gfs2_recover_journal - recover a given journal
+ * @jd: the struct gfs2_jdesc describing the journal
+ *
+ * Acquire the journal's lock, check to see if the journal is clean, and
+ * do recovery if necessary.
+ *
+ * For a journal other than this node's own (jd_jid != ls_jid) the journal
+ * glock is tried exclusively; if it is busy the function gives up and
+ * returns 0.  A journal whose head lacks GFS2_LOG_HEAD_UNMOUNT is replayed
+ * in two passes under a shared transaction lock and then marked clean.
+ * The lock module is told the outcome via gfs2_lm_recovery_done().
+ *
+ * Returns: errno
+ */
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
+{
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        struct gfs2_log_header head;
+        struct gfs2_holder j_gh, ji_gh, t_gh;
+        unsigned long t;
+        int ro = 0;
+        unsigned int pass;
+        int error;
+        if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+                fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
+                        jd->jd_jid);
+                /* Acquire the journal lock so we can do recovery */
+                error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
+                                          LM_ST_EXCLUSIVE,
+                                          LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
+                                          &j_gh);
+                switch (error) {
+                case 0:
+                        break;
+                case GLR_TRYFAILED:
+                        fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
+                        error = 0;
+                        /* fall through: a busy journal is not an error, but we still give up */
+                default:
+                        goto fail;
+                };
+                /* Shared hold on the journal inode's glock */
+                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+                                           LM_FLAG_NOEXP, &ji_gh);
+                if (error)
+                        goto fail_gunlock_j;
+        } else {
+                fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
+        }
+        fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
+        error = gfs2_jdesc_check(jd);
+        if (error)
+                goto fail_gunlock_ji;
+        error = gfs2_find_jhead(jd, &head);
+        if (error)
+                goto fail_gunlock_ji;
+        /* Replay is only needed when the head is not flagged as an unmount header */
+        if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+                fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
+                        jd->jd_jid);
+                t = jiffies;
+                /* Acquire a shared hold on the transaction lock */
+                error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+                                           LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
+                                           GL_NOCANCEL | GL_NOCACHE, &t_gh);
+                if (error)
+                        goto fail_gunlock_ji;
+                /*
+                 * Once the journals have been checked, SDF_JOURNAL_LIVE decides
+                 * whether we may write; before that, the mount flags do.
+                 */
+                if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+                        if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+                                ro = 1;
+                } else {
+                        if (sdp->sd_vfs->s_flags & MS_RDONLY)
+                                ro = 1;
+                }
+                if (ro) {
+                        fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
+                                jd->jd_jid);
+                        error = -EROFS;
+                        goto fail_gunlock_tr;
+                }
+                fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
+                /* Two passes over the active region, from the tail up to the head */
+                for (pass = 0; pass < 2; pass++) {
+                        lops_before_scan(jd, &head, pass);
+                        error = foreach_descriptor(jd, head.lh_tail,
+                                                   head.lh_blkno, pass);
+                        lops_after_scan(jd, error, pass);
+                        if (error)
+                                goto fail_gunlock_tr;
+                }
+                error = clean_journal(jd, &head);
+                if (error)
+                        goto fail_gunlock_tr;
+                gfs2_glock_dq_uninit(&t_gh);
+                /* Elapsed time in whole seconds, rounded up */
+                t = DIV_ROUND_UP(jiffies - t, HZ);
+                fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
+                        jd->jd_jid, t);
+        }
+        if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+                gfs2_glock_dq_uninit(&ji_gh);
+        gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+        if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+                gfs2_glock_dq_uninit(&j_gh);
+        fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
+        return 0;
+fail_gunlock_tr:
+        gfs2_glock_dq_uninit(&t_gh);
+fail_gunlock_ji:
+        /* j_gh and ji_gh were only taken for a journal that is not our own */
+        if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+                gfs2_glock_dq_uninit(&ji_gh);
+fail_gunlock_j:
+                gfs2_glock_dq_uninit(&j_gh);
+        }
+        fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
+fail:
+        gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+        return error;
+}
+/**
+ * gfs2_check_journals - Recover any dirty journals
+ * @sdp: the filesystem
+ *
+ * Keeps asking gfs2_jdesc_find_dirty() for the next dirty journal and
+ * recovers each one, except for the journal this node itself is using.
+ * The result of the individual recoveries is not reported.
+ */
+void gfs2_check_journals(struct gfs2_sbd *sdp)
+{
+        struct gfs2_jdesc *dirty;
+        while ((dirty = gfs2_jdesc_find_dirty(sdp)) != NULL) {
+                /* Our own journal is not recovered from here */
+                if (dirty == sdp->sd_jdesc)
+                        continue;
+                gfs2_recover_journal(dirty);
+        }
+}
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __RECOVERY_DOT_H__
+#define __RECOVERY_DOT_H__
+#include "incore.h"
+/*
+ * Advance a journal block index by one, wrapping back to block 0.
+ *
+ * The wrap point is the size of sdp->sd_jdesc, the journal attached to the
+ * superblock, and not of any particular journal being replayed.
+ * NOTE(review): callers replaying another journal get the right result
+ * only if both journals have the same number of blocks - confirm.
+ */
+static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
+{
+        if (++*blk == sdp->sd_jdesc->jd_blocks)
+                *blk = 0;
+}
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+                           struct buffer_head **bh);
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+                    struct gfs2_log_header *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+void gfs2_check_journals(struct gfs2_sbd *sdp);
+#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "ops_file.h"
+#include "util.h"
+#define BFITNOENT ((u32)~0)
+/*
+ * The bitmap helpers below keep track of block allocation for the resource
+ * group code in this file.  Each block is represented by two bits, so each
+ * byte represents GFS2_NBBY (i.e. 4) blocks.
+ *
+ * 0 = Free
+ * 1 = Used (not metadata)
+ * 2 = Unlinked (still in use) inode
+ * 3 = Used (metadata)
+ *
+ * valid_change[] is indexed as new_state * 4 + cur_state (see
+ * gfs2_setbit()); a nonzero entry marks a permitted transition.  As encoded
+ * below: any of the states 1, 2 and 3 may become 0, state 0 may become 1 or
+ * 3, and only state 3 may become state 2.  Rows are the new state, columns
+ * the current state.
+ */
+static const char valid_change[16] = {
+                /* current */
+        /* n */ 0, 1, 1, 1,
+        /* e */ 1, 0, 0, 0,
+        /* w */ 0, 0, 0, 1,
+                1, 0, 0, 0
+};
+/**
+ * gfs2_setbit - Set a bit in the bitmaps
+ * @rgd: the resource group the bitmap belongs to (used for the bounds
+ *       assertion and for reporting an invalid transition)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to set (index of the bit-pair within @buffer)
+ * @new_state: the new state of the block
+ *
+ * The two-bit state of @block is replaced by @new_state only when
+ * valid_change[] permits the transition from the current state.  Otherwise
+ * the bitmap is left untouched and gfs2_consist_rgrpd() is called on @rgd.
+ */
+static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                        unsigned int buflen, u32 block,
+                        unsigned char new_state)
+{
+        unsigned char *byte, *end, cur_state;
+        unsigned int bit;
+        byte = buffer + (block / GFS2_NBBY);
+        bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+        end = buffer + buflen;
+        gfs2_assert(rgd->rd_sbd, byte < end);
+        cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+        if (valid_change[new_state * 4 + cur_state]) {
+                *byte ^= cur_state << bit; /* XOR with itself zeroes the pair */
+                *byte |= new_state << bit;
+        } else
+                gfs2_consist_rgrpd(rgd);
+}
+/**
+ * gfs2_testbit - test a bit in the bitmaps
+ * @rgd: the resource group the bitmap belongs to (only used for the
+ *       bounds assertion)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to read (index of the bit-pair within @buffer)
+ *
+ * Returns: the two-bit state stored for @block
+ */
+static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                                  unsigned int buflen, u32 block)
+{
+        unsigned char *byte, *end, cur_state;
+        unsigned int bit;
+        byte = buffer + (block / GFS2_NBBY);
+        bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+        end = buffer + buflen;
+        gfs2_assert(rgd->rd_sbd, byte < end);
+        cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+        return cur_state;
+}
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ *       a block in a given allocation state.
+ * @rgd: the resource group (not referenced by this function)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
+ *       bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures.
+ *
+ * As a shortcut a whole byte is skipped when the low bit of every one of its
+ * four bit-pairs (mask 0x55) differs from the low bit of @old_state, since no
+ * pair in such a byte can match.
+ *
+ * Return: the block number (bitmap buffer scope) that was found, or
+ *         BFITNOENT if the end of the buffer is reached without a match
+ */
+static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                            unsigned int buflen, u32 goal,
+                            unsigned char old_state)
+{
+        unsigned char *byte, *end, alloc;
+        u32 blk = goal;
+        unsigned int bit;
+        byte = buffer + (goal / GFS2_NBBY);
+        bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+        end = buffer + buflen;
+        alloc = (old_state & 1) ? 0 : 0x55; /* low-bit pattern that rules a byte out */
+        while (byte < end) {
+                if ((*byte & 0x55) == alloc) {
+                        blk += (8 - bit) >> 1; /* pairs left in this byte */
+                        bit = 0;
+                        byte++;
+                        continue;
+                }
+                if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
+                        return blk;
+                bit += GFS2_BIT_SIZE;
+                if (bit >= 8) {
+                        bit = 0;
+                        byte++;
+                }
+                blk++;
+        }
+        return BFITNOENT;
+}
+/**
+ * gfs2_bitcount - count the number of bit-pairs in a certain state
+ * @rgd: the resource group (not referenced by this function)
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @state: the state of the block we're looking for
+ *
+ * Each byte holds four bit-pairs; @state is pre-shifted into each of the
+ * four positions so every pair can be compared with a single mask.
+ *
+ * Returns: The number of bit-pairs (i.e. blocks) in @buffer whose state
+ *          equals @state
+ */
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+                              unsigned int buflen, unsigned char state)
+{
+        unsigned char *byte = buffer;
+        unsigned char *end = buffer + buflen;
+        unsigned char state1 = state << 2;
+        unsigned char state2 = state << 4;
+        unsigned char state3 = state << 6;
+        u32 count = 0;
+        for (; byte < end; byte++) {
+                if (((*byte) & 0x03) == state)
+                        count++;
+                if (((*byte) & 0x0C) == state1)
+                        count++;
+                if (((*byte) & 0x30) == state2)
+                        count++;
+                if (((*byte) & 0xC0) == state3)
+                        count++;
+        }
+        return count;
+}
+/**
+ * gfs2_rgrp_verify - Verify that a resource group is consistent
+ * @rgd: the rgrp
+ *
+ * Counts the blocks in each of the four bitmap states across all of the
+ * rgrp's bitmap buffers and compares the totals with rd_rg.rg_free,
+ * rd_rg.rg_dinodes and rd_ri.ri_data.  The first mismatch found is reported
+ * through gfs2_consist_rgrpd() and ends the check.  bi->bi_bh is
+ * dereferenced for every bitmap, so the buffers must already be attached.
+ *
+ * NOTE(review): the "used data mismatch" message prints count[1] although
+ * the comparison that failed used count[1] + count[2].
+ */
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        struct gfs2_bitmap *bi = NULL;
+        u32 length = rgd->rd_ri.ri_length;
+        u32 count[4], tmp;
+        int buf, x;
+        memset(count, 0, 4 * sizeof(u32));
+        /* Count # blocks in each of 4 possible allocation states */
+        for (buf = 0; buf < length; buf++) {
+                bi = rgd->rd_bits + buf;
+                for (x = 0; x < 4; x++)
+                        count[x] += gfs2_bitcount(rgd,
+                                                  bi->bi_bh->b_data +
+                                                  bi->bi_offset,
+                                                  bi->bi_len, x);
+        }
+        if (count[0] != rgd->rd_rg.rg_free) {
+                if (gfs2_consist_rgrpd(rgd))
+                        fs_err(sdp, "free data mismatch:  %u != %u\n",
+                               count[0], rgd->rd_rg.rg_free);
+                return;
+        }
+        tmp = rgd->rd_ri.ri_data -
+                rgd->rd_rg.rg_free -
+                rgd->rd_rg.rg_dinodes;
+        if (count[1] + count[2] != tmp) {
+                if (gfs2_consist_rgrpd(rgd))
+                        fs_err(sdp, "used data mismatch:  %u != %u\n",
+                               count[1], tmp);
+                return;
+        }
+        if (count[3] != rgd->rd_rg.rg_dinodes) {
+                if (gfs2_consist_rgrpd(rgd))
+                        fs_err(sdp, "used metadata mismatch:  %u != %u\n",
+                               count[3], rgd->rd_rg.rg_dinodes);
+                return;
+        }
+        if (count[2] > count[3]) {
+                if (gfs2_consist_rgrpd(rgd))
+                        fs_err(sdp, "unlinked inodes > inodes:  %u\n",
+                               count[2]);
+                return;
+        }
+}
+static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
+{       /*
+         * Nonzero when @block lies in the half-open range
+         * [ri_data0, ri_data0 + ri_data) described by @ri.
+         */
+        u64 first = ri->ri_data0;
+        u64 last = first + ri->ri_data;
+        return first <= block && block < last;
+}
+/**
+ * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
+ * @sdp: The GFS2 superblock
+ * @blk: The data block number
+ *
+ * Walks sd_rindex_mru_list under sd_rindex_spin.  The rgrp whose data range
+ * contains @blk is moved to the head of that list before it is returned, so
+ * recently matched rgrps are tested first on later lookups.
+ *
+ * Returns: The resource group, or NULL if not found
+ */
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
+{
+        struct gfs2_rgrpd *rgd;
+        spin_lock(&sdp->sd_rindex_spin);
+        list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
+                if (rgrp_contains_block(&rgd->rd_ri, blk)) {
+                        list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+                        spin_unlock(&sdp->sd_rindex_spin);
+                        return rgd;
+                }
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+        return NULL;
+}
+/**
+ * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
+ * @sdp: The GFS2 superblock
+ *
+ * Asserts that sd_rindex_list is not empty.  No lock is taken here.
+ *
+ * Returns: The first rgrp on sd_rindex_list
+ */
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
+{
+        gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
+        return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+}
+/**
+ * gfs2_rgrpd_get_next - get the next RG
+ * @rgd: A RG
+ *
+ * Returns: The rgrp following @rgd on sd_rindex_list, or NULL if @rgd is
+ *          the last entry
+ */
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
+{
+        if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+                return NULL;
+        return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+}
+static void clear_rgrpdi(struct gfs2_sbd *sdp)
+{       /*
+         * Discard every in-core resource group descriptor of @sdp.
+         *
+         * First, under sd_rindex_spin, sd_rindex_forward is reset and the
+         * "recent" list is emptied.  Then each rgrp on sd_rindex_list is
+         * unlinked from both rd_list and rd_list_mru, its glock (if one
+         * was attached) has gl_object cleared and is put, and rd_bits and
+         * the descriptor itself are freed.  The second loop runs without
+         * the spinlock; the callers visible in this file hold
+         * sd_rindex_mutex around the call.
+         */
+        struct list_head *head;
+        struct gfs2_rgrpd *rgd;
+        struct gfs2_glock *gl;
+        spin_lock(&sdp->sd_rindex_spin);
+        sdp->sd_rindex_forward = NULL;
+        head = &sdp->sd_rindex_recent_list;
+        while (!list_empty(head)) {
+                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+                list_del(&rgd->rd_recent);
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+        head = &sdp->sd_rindex_list;
+        while (!list_empty(head)) {
+                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+                gl = rgd->rd_gl;
+                list_del(&rgd->rd_list);
+                list_del(&rgd->rd_list_mru);
+                if (gl) {
+                        gl->gl_object = NULL;
+                        gfs2_glock_put(gl);
+                }
+                kfree(rgd->rd_bits);
+                kfree(rgd);
+        }
+}
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{       /*
+         * Free all in-core resource group descriptors of @sdp by calling
+         * clear_rgrpdi() with sd_rindex_mutex held.
+         */
+        mutex_lock(&sdp->sd_rindex_mutex);
+        clear_rgrpdi(sdp);
+        mutex_unlock(&sdp->sd_rindex_mutex);
+}
+/**
+ * compute_bitstructs - Compute the bitmap sizes
+ * @rgd: The resource group descriptor
+ *
+ * Calculates bitmap descriptors, one for each block that contains bitmap data.
+ * The array is allocated into rgd->rd_bits and is NOT freed here when a
+ * later consistency check fails; in this file clear_rgrpdi() frees it along
+ * with the descriptor.
+ *
+ * The first block carries a struct gfs2_rgrp header, every following block a
+ * struct gfs2_meta_header; bi_offset skips that header, bi_start is the byte
+ * offset of the block's bitmap within the rgrp's whole bitmap and bi_len is
+ * the number of bitmap bytes held in the block.
+ *
+ * Returns: 0 on success, -EINVAL if ri_length is zero, -ENOMEM if the
+ *          allocation fails, -EIO if ri_bitbytes or ri_data do not agree
+ *          with the computed layout
+ */
+static int compute_bitstructs(struct gfs2_rgrpd *rgd)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        struct gfs2_bitmap *bi;
+        u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
+        u32 bytes_left, bytes;
+        int x;
+        if (!length)
+                return -EINVAL;
+        rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
+        if (!rgd->rd_bits)
+                return -ENOMEM;
+        bytes_left = rgd->rd_ri.ri_bitbytes;
+        for (x = 0; x < length; x++) {
+                bi = rgd->rd_bits + x;
+                /* small rgrp; bitmap stored completely in header block */
+                if (length == 1) {
+                        bytes = bytes_left;
+                        bi->bi_offset = sizeof(struct gfs2_rgrp);
+                        bi->bi_start = 0;
+                        bi->bi_len = bytes;
+                /* header block */
+                } else if (x == 0) {
+                        bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
+                        bi->bi_offset = sizeof(struct gfs2_rgrp);
+                        bi->bi_start = 0;
+                        bi->bi_len = bytes;
+                /* last block */
+                } else if (x + 1 == length) {
+                        bytes = bytes_left;
+                        bi->bi_offset = sizeof(struct gfs2_meta_header);
+                        bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+                        bi->bi_len = bytes;
+                /* other blocks */
+                } else {
+                        bytes = sdp->sd_sb.sb_bsize -
+                                sizeof(struct gfs2_meta_header);
+                        bi->bi_offset = sizeof(struct gfs2_meta_header);
+                        bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+                        bi->bi_len = bytes;
+                }
+                bytes_left -= bytes;
+        }
+        if (bytes_left) { /* ri_bitbytes did not match the block layout */
+                gfs2_consist_rgrpd(rgd);
+                return -EIO;
+        }
+        bi = rgd->rd_bits + (length - 1);
+        if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
+                if (gfs2_consist_rgrpd(rgd)) {
+                        gfs2_rindex_print(&rgd->rd_ri);
+                        fs_err(sdp, "start=%u len=%u offset=%u\n",
+                               bi->bi_start, bi->bi_len, bi->bi_offset);
+                }
+                return -EIO;
+        }
+        return 0;
+}
+/**
+ * gfs2_ri_update - Pull in a new resource index from the disk
+ * @ip: The rindex inode
+ *
+ * Throws away all existing in-core rgrp descriptors and rebuilds them by
+ * reading the rindex file one struct gfs2_rindex record at a time.  A read
+ * that returns 0 ends the loop; a positive result that is not a whole
+ * record is turned into -EIO.  Each new descriptor is linked onto
+ * sd_rindex_list and sd_rindex_mru_list before it is fully set up, so on
+ * any failure clear_rgrpdi() also disposes of the partly built one.
+ * sd_rgrps is used as the loop counter and ends up as the number of records
+ * read.  On success sd_rindex_vn is set to the rindex glock's gl_vn.
+ *
+ * The caller visible in this file (gfs2_rindex_hold()) holds
+ * sd_rindex_mutex around this call.
+ *
+ * Returns: 0 on successful update, error code otherwise
+ */
+static int gfs2_ri_update(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct inode *inode = &ip->i_inode;
+        struct gfs2_rgrpd *rgd;
+        char buf[sizeof(struct gfs2_rindex)];
+        struct file_ra_state ra_state;
+        u64 junk = ip->i_di.di_size;
+        int error;
+        if (do_div(junk, sizeof(struct gfs2_rindex))) { /* size not a whole number of records */
+                gfs2_consist_inode(ip);
+                return -EIO;
+        }
+        clear_rgrpdi(sdp);
+        file_ra_state_init(&ra_state, inode->i_mapping);
+        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
+                loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
+                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
+                                            sizeof(struct gfs2_rindex));
+                if (!error)
+                        break;
+                if (error != sizeof(struct gfs2_rindex)) {
+                        if (error > 0)
+                                error = -EIO;
+                        goto fail;
+                }
+                rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+                error = -ENOMEM;
+                if (!rgd)
+                        goto fail;
+                mutex_init(&rgd->rd_mutex);
+                lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
+                rgd->rd_sbd = sdp;
+                list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
+                list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+                gfs2_rindex_in(&rgd->rd_ri, buf);
+                error = compute_bitstructs(rgd);
+                if (error)
+                        goto fail;
+                error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
+                                       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
+                if (error)
+                        goto fail;
+                rgd->rd_gl->gl_object = rgd;
+                rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; /* differs from gl_vn, so gfs2_rgrp_bh_get() re-reads rd_rg */
+        }
+        sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+        return 0;
+fail:
+        clear_rgrpdi(sdp);
+        return error;
+}
+/**
+ * gfs2_rindex_hold - Grab a lock on the rindex
+ * @sdp: The GFS2 superblock
+ * @ri_gh: the glock holder
+ *
+ * We grab a lock on the rindex inode to make sure that it doesn't
+ * change whilst we are performing an operation. We keep this lock
+ * for quite long periods of time compared to other locks. This
+ * doesn't matter, since it is shared and it is very, very rarely
+ * accessed in the exclusive mode (i.e. only when expanding the filesystem).
+ *
+ * This makes sure that we're using the latest copy of the resource index
+ * special file, which might have been updated if someone expanded the
+ * filesystem (via gfs2_grow utility), which adds new resource groups.
+ *
+ * The version comparison is made once without sd_rindex_mutex and repeated
+ * after taking it, so that only one caller re-reads the index for a given
+ * version.
+ *
+ * Returns: 0 on success with @ri_gh held in shared mode; error code
+ *          otherwise, in which case @ri_gh is not held (it is dequeued and
+ *          uninitialised here if the index update fails)
+ */
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+{
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+        struct gfs2_glock *gl = ip->i_gl;
+        int error;
+        error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
+        if (error)
+                return error;
+        /* Read new copy from disk if we don't have the latest */
+        if (sdp->sd_rindex_vn != gl->gl_vn) {
+                mutex_lock(&sdp->sd_rindex_mutex);
+                if (sdp->sd_rindex_vn != gl->gl_vn) {
+                        error = gfs2_ri_update(ip);
+                        if (error)
+                                gfs2_glock_dq_uninit(ri_gh);
+                }
+                mutex_unlock(&sdp->sd_rindex_mutex);
+        }
+        return error;
+}
+/**
+ * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ * Read in all of a Resource Group's header and bitmap blocks.
+ * Caller must eventually call gfs2_rgrp_bh_put() to release the buffers.
+ *
+ * The buffers are reference counted through rd_bh_count: if the count is
+ * already nonzero it is simply incremented.  Otherwise reads are issued for
+ * all ri_length blocks, then each is waited for and its metadata type
+ * checked (GFS2_METATYPE_RG for block 0, GFS2_METATYPE_RB for the rest).
+ * rd_rg is refreshed from block 0 when rd_rg_vn differs from the glock's
+ * gl_vn, and rd_free_clone is reset to rd_rg.rg_free.
+ *
+ * On failure every buffer obtained so far (indices below x) is released
+ * and rd_bh_count is left unchanged.
+ *
+ * Returns: errno
+ */
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        struct gfs2_glock *gl = rgd->rd_gl;
+        unsigned int length = rgd->rd_ri.ri_length;
+        struct gfs2_bitmap *bi;
+        unsigned int x, y;
+        int error;
+        mutex_lock(&rgd->rd_mutex);
+        spin_lock(&sdp->sd_rindex_spin);
+        if (rgd->rd_bh_count) {
+                rgd->rd_bh_count++;
+                spin_unlock(&sdp->sd_rindex_spin);
+                mutex_unlock(&rgd->rd_mutex);
+                return 0;
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+        for (x = 0; x < length; x++) {
+                bi = rgd->rd_bits + x;
+                error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
+                if (error)
+                        goto fail;
+        }
+        for (y = length; y--;) { /* x == length here, so fail: releases all */
+                bi = rgd->rd_bits + y;
+                error = gfs2_meta_wait(sdp, bi->bi_bh);
+                if (error)
+                        goto fail;
+                if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
+                                              GFS2_METATYPE_RG)) {
+                        error = -EIO;
+                        goto fail;
+                }
+        }
+        if (rgd->rd_rg_vn != gl->gl_vn) {
+                gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
+                rgd->rd_rg_vn = gl->gl_vn;
+        }
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd->rd_free_clone = rgd->rd_rg.rg_free;
+        rgd->rd_bh_count++;
+        spin_unlock(&sdp->sd_rindex_spin);
+        mutex_unlock(&rgd->rd_mutex);
+        return 0;
+fail:
+        while (x--) {
+                bi = rgd->rd_bits + x;
+                brelse(bi->bi_bh);
+                bi->bi_bh = NULL;
+                gfs2_assert_warn(sdp, !bi->bi_clone);
+        }
+        mutex_unlock(&rgd->rd_mutex);
+        return error;
+}
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
+{       /*
+         * Take one more reference on bitmap buffers that are already
+         * held: warns if rd_bh_count is zero, then increments it under
+         * sd_rindex_spin.  Nothing is read from disk here.
+         */
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        spin_lock(&sdp->sd_rindex_spin);
+        gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+        rgd->rd_bh_count++;
+        spin_unlock(&sdp->sd_rindex_spin);
+}
+/**
+ * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: the struct gfs2_rgrpd describing the RG whose buffers are released
+ *
+ * Drops one reference from rd_bh_count.  When the count reaches zero every
+ * bitmap's clone is freed and its buffer head released, all while holding
+ * sd_rindex_spin.
+ */
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        int x, length = rgd->rd_ri.ri_length;
+        spin_lock(&sdp->sd_rindex_spin);
+        gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+        if (--rgd->rd_bh_count) { /* other holders remain */
+                spin_unlock(&sdp->sd_rindex_spin);
+                return;
+        }
+        for (x = 0; x < length; x++) {
+                struct gfs2_bitmap *bi = rgd->rd_bits + x;
+                kfree(bi->bi_clone);
+                bi->bi_clone = NULL;
+                brelse(bi->bi_bh);
+                bi->bi_bh = NULL;
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+}
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
+{       /*
+         * Bring the clone bitmaps of @rgd back in line with the real ones:
+         * for every bitmap that has a clone, the bitmap region
+         * (bi_offset, bi_len) is copied from the buffer head into the
+         * clone.  Afterwards rd_free_clone is reset to rd_rg.rg_free under
+         * sd_rindex_spin.
+         */
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        unsigned int length = rgd->rd_ri.ri_length;
+        unsigned int x;
+        for (x = 0; x < length; x++) {
+                struct gfs2_bitmap *bi = rgd->rd_bits + x;
+                if (!bi->bi_clone)
+                        continue;
+                memcpy(bi->bi_clone + bi->bi_offset,
+                       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+        }
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd->rd_free_clone = rgd->rd_rg.rg_free;
+        spin_unlock(&sdp->sd_rindex_spin);
+}
+/**
+ * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
+ * @ip: the incore GFS2 inode structure
+ *
+ * The structure is embedded in the inode (ip->i_alloc); it is zeroed on
+ * every call, so any previous contents are lost.
+ *
+ * Returns: the struct gfs2_alloc
+ */
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
+{
+        struct gfs2_alloc *al = &ip->i_alloc;
+        /* FIXME: Should assert that the correct locks are held here... */
+        memset(al, 0, sizeof(*al));
+        return al;
+}
+/**
+ * try_rgrp_fit - See if a given reservation will fit in a given RG
+ * @rgd: the RG data
+ * @al: the struct gfs2_alloc structure describing the reservation
+ *
+ * The test is rd_free_clone >= al_requested, made under sd_rindex_spin.
+ * If there's room for the requested blocks to be allocated from the RG:
+ *   Sets the $al_rgd field in @al.
+ * No other field of @al is written and rd_free_clone is not reduced.
+ *
+ * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
+ */
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        int ret = 0;
+        spin_lock(&sdp->sd_rindex_spin);
+        if (rgd->rd_free_clone >= al->al_requested) {
+                al->al_rgd = rgd;
+                ret = 1;
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+        return ret;
+}
+/**
+ * recent_rgrp_first - get first RG from "recent" list
+ * @sdp: The GFS2 superblock
+ * @rglast: address of the rgrp used last (0 means no preference)
+ *
+ * Returns: NULL if the recent list is empty; otherwise the entry whose
+ *          ri_addr equals @rglast if there is one, else the first entry
+ *          of the recent list
+ */
+static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
+                                            u64 rglast)
+{
+        struct gfs2_rgrpd *rgd = NULL;
+        spin_lock(&sdp->sd_rindex_spin);
+        if (list_empty(&sdp->sd_rindex_recent_list))
+                goto out;
+        if (!rglast)
+                goto first;
+        list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+                if (rgd->rd_ri.ri_addr == rglast)
+                        goto out;
+        }
+first: /* also reached when the loop above found no match */
+        rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
+                         rd_recent);
+out:
+        spin_unlock(&sdp->sd_rindex_spin);
+        return rgd;
+}
+/**
+ * recent_rgrp_next - get next RG from "recent" list
+ * @cur_rgd: current rgrp
+ * @remove: if nonzero and @cur_rgd is on the recent list, unlink it from
+ *          that list
+ *
+ * @cur_rgd is looked up on the list first, since it may have been removed
+ * in the meantime.
+ *
+ * Returns: The rgrp following @cur_rgd in the recent list, or NULL if
+ *          @cur_rgd is its last entry.  If @cur_rgd is not on the list at
+ *          all, the first entry of the list (NULL if the list is empty).
+ */
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
+                                           int remove)
+{
+        struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
+        struct list_head *head;
+        struct gfs2_rgrpd *rgd;
+        spin_lock(&sdp->sd_rindex_spin);
+        head = &sdp->sd_rindex_recent_list;
+        list_for_each_entry(rgd, head, rd_recent) {
+                if (rgd == cur_rgd) {
+                        if (cur_rgd->rd_recent.next != head)
+                                rgd = list_entry(cur_rgd->rd_recent.next,
+                                                 struct gfs2_rgrpd, rd_recent);
+                        else
+                                rgd = NULL;
+                        if (remove)
+                                list_del(&cur_rgd->rd_recent);
+                        goto out;
+                }
+        }
+        rgd = NULL;
+        if (!list_empty(head))
+                rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+out:
+        spin_unlock(&sdp->sd_rindex_spin);
+        return rgd;
+}
+/**
+ * recent_rgrp_add - add an RG to tail of "recent" list
+ * @new_rgd: The rgrp to add
+ *
+ * Nothing is added if @new_rgd is already on the list, or if the walk of
+ * the list reaches sd_rgrps / gfs2_jindex_size(sdp) entries before finding
+ * the end, which caps the length of the list.
+ *
+ * NOTE(review): the cap is computed by dividing by gfs2_jindex_size(sdp);
+ * this assumes that value is never zero here - confirm.
+ */
+static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
+{
+        struct gfs2_sbd *sdp = new_rgd->rd_sbd;
+        struct gfs2_rgrpd *rgd;
+        unsigned int count = 0;
+        unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
+        spin_lock(&sdp->sd_rindex_spin);
+        list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+                if (rgd == new_rgd)
+                        goto out;
+                if (++count >= max)
+                        goto out;
+        }
+        list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
+out:
+        spin_unlock(&sdp->sd_rindex_spin);
+}
+/**
+ * forward_rgrp_get - get an rgrp to try next from full list
+ * @sdp: The GFS2 superblock
+ *
+ * Returns sd_rindex_forward if it is set.  Otherwise a starting rgrp is
+ * chosen, stored in sd_rindex_forward and returned: the rgrp at index
+ * sd_rgrps * jd_jid / journals of the full list when there are at least as
+ * many rgrps as journals, else the first rgrp.  The starting point thus
+ * depends on this node's journal id.
+ *
+ * Returns: The rgrp to try next
+ */
+static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
+{
+        struct gfs2_rgrpd *rgd;
+        unsigned int journals = gfs2_jindex_size(sdp);
+        unsigned int rg = 0, x;
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd = sdp->sd_rindex_forward;
+        if (!rgd) {
+                if (sdp->sd_rgrps >= journals)
+                        rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
+                for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
+                     x++, rgd = gfs2_rgrpd_get_next(rgd))
+                        /* Do Nothing */;
+                sdp->sd_rindex_forward = rgd;
+        }
+        spin_unlock(&sdp->sd_rindex_spin);
+        return rgd;
+}
+/**
+ * forward_rgrp_set - set the forward rgrp pointer
+ * @sdp: the filesystem
+ * @rgd: The new forward rgrp
+ *
+ * Stores @rgd in sd_rindex_forward under sd_rindex_spin, so that the next
+ * forward_rgrp_get() returns it.
+ */
+static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
+{
+        spin_lock(&sdp->sd_rindex_spin);
+        sdp->sd_rindex_forward = rgd;
+        spin_unlock(&sdp->sd_rindex_spin);
+}
+/**
+ * get_local_rgrp - Choose and lock a rgrp for allocation
+ * @ip: the inode to reserve space for
+ *
+ * Try to acquire rgrp in way which avoids contending with others.
+ *
+ * Phase 1 walks the "recent" list, starting at the rgrp recorded in
+ * ip->i_last_rg_alloc, using LM_FLAG_TRY lock requests.  An rgrp that was
+ * locked but has too little free space is removed from the recent list; one
+ * whose lock attempt failed with GLR_TRYFAILED is only skipped.
+ *
+ * Phase 2 walks the full list in a circle from the forward pointer.  The
+ * first lap uses LM_FLAG_TRY; if that lap skipped at least one rgrp on
+ * GLR_TRYFAILED, a second lap is made with the flag cleared.  When phase 2
+ * succeeds the rgrp is added to the recent list and the forward pointer is
+ * moved to the rgrp after it.
+ *
+ * On success the chosen rgrp is in ip->i_alloc.al_rgd (set by
+ * try_rgrp_fit()), its glock is held exclusively through al_rgd_gh, and
+ * ip->i_last_rg_alloc records its address.
+ *
+ * Returns: 0 on success, -ENOSPC if no rgrp fits, or the error from the
+ *          glock request
+ */
+static int get_local_rgrp(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrpd *rgd, *begin = NULL;
+        struct gfs2_alloc *al = &ip->i_alloc;
+        int flags = LM_FLAG_TRY;
+        int skipped = 0;
+        int loops = 0;
+        int error;
+        /* Try recently successful rgrps */
+        rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+        while (rgd) {
+                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+                                           LM_FLAG_TRY, &al->al_rgd_gh);
+                switch (error) {
+                case 0:
+                        if (try_rgrp_fit(rgd, al))
+                                goto out;
+                        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+                        rgd = recent_rgrp_next(rgd, 1);
+                        break;
+                case GLR_TRYFAILED:
+                        rgd = recent_rgrp_next(rgd, 0);
+                        break;
+                default:
+                        return error;
+                }
+        }
+        /* Go through full list of rgrps */
+        begin = rgd = forward_rgrp_get(sdp);
+        for (;;) {
+                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+                                          &al->al_rgd_gh);
+                switch (error) {
+                case 0:
+                        if (try_rgrp_fit(rgd, al))
+                                goto out;
+                        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+                        break;
+                case GLR_TRYFAILED:
+                        skipped++;
+                        break;
+                default:
+                        return error;
+                }
+                rgd = gfs2_rgrpd_get_next(rgd);
+                if (!rgd)
+                        rgd = gfs2_rgrpd_get_first(sdp);
+                if (rgd == begin) { /* completed one lap of the full list */
+                        if (++loops >= 2 || !skipped)
+                                return -ENOSPC;
+                        flags = 0;
+                }
+        }
+out:
+        ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
+        if (begin) { /* only set once phase 2 was entered */
+                recent_rgrp_add(rgd);
+                rgd = gfs2_rgrpd_get_next(rgd);
+                if (!rgd)
+                        rgd = gfs2_rgrpd_get_first(sdp);
+                forward_rgrp_set(sdp, rgd);
+        }
+        return 0;
+}
+/**
+ * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * @ip: the inode to reserve space for
+ * @file: stored in al_file on success; gfs2_inplace_release() prints it
+ *        in its warning (presumably the caller's source file name)
+ * @line: stored in al_line on success; printed alongside @file
+ *
+ * ip->i_alloc.al_requested must already be nonzero.  On success both the
+ * rindex holder (al_ri_gh) and the holder of the chosen rgrp (al_rgd_gh)
+ * are held; on failure neither is.
+ *
+ * Returns: errno (-EINVAL when the al_requested check fails)
+ */
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        int error;
+        if (gfs2_assert_warn(sdp, al->al_requested))
+                return -EINVAL;
+        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+        if (error)
+                return error;
+        error = get_local_rgrp(ip);
+        if (error) {
+                gfs2_glock_dq_uninit(&al->al_ri_gh);
+                return error;
+        }
+        al->al_file = file;
+        al->al_line = line;
+        return 0;
+}
+/**
+ * gfs2_inplace_release - release an inplace reservation
+ * @ip: the inode the reservation was taken out on
+ *
+ * Release a reservation made by gfs2_inplace_reserve().
+ *
+ * Checks that al_alloced does not exceed al_requested and, when the check
+ * returns -1, logs both values together with the file and line recorded at
+ * reservation time.  Then al_rgd is cleared and the rgrp and rindex glock
+ * holders are dropped, in that order.
+ */
+void gfs2_inplace_release(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
+                fs_warn(sdp, "al_alloced = %u, al_requested = %u "
+                             "al_file = %s, al_line = %u\n",
+                             al->al_alloced, al->al_requested, al->al_file,
+                             al->al_line);
+        al->al_rgd = NULL;
+        gfs2_glock_dq_uninit(&al->al_rgd_gh);
+        gfs2_glock_dq_uninit(&al->al_ri_gh);
+}
+/**
+ * gfs2_get_block_type - Look up the allocation state of a block in a RG
+ * @rgd: the resource group holding the block
+ * @block: the block number (filesystem scope; ri_data0 is subtracted to
+ *         get the rgrp-relative number)
+ *
+ * Finds the bitmap whose range covers the block and reads its two-bit
+ * state from that bitmap's buffer head, so the rgrp's buffers must already
+ * be attached.  Asserts if no bitmap covers the block.
+ *
+ * Returns: The block type (GFS2_BLKST_*)
+ */
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+{
+        struct gfs2_bitmap *bi = NULL;
+        u32 length, rgrp_block, buf_block;
+        unsigned int buf;
+        unsigned char type;
+        length = rgd->rd_ri.ri_length;
+        rgrp_block = block - rgd->rd_ri.ri_data0;
+        for (buf = 0; buf < length; buf++) {
+                bi = rgd->rd_bits + buf;
+                if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+                        break;
+        }
+        gfs2_assert(rgd->rd_sbd, buf < length);
+        buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; /* index within this bitmap */
+        type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+                           bi->bi_len, buf_block);
+        return type;
+}
+/**
+ * rgblk_search - find a block in @old_state, change allocation
+ *           state to @new_state
+ * @rgd: the resource group descriptor
+ * @goal: the goal block within the RG (start here to search for avail block)
+ * @old_state: GFS2_BLKST_XXX the before-allocation state to find
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Walk rgrp's bitmap to find bits that represent a block in @old_state.
+ * Add the found bitmap buffer to the transaction.
+ * Set the found bits to @new_state to change block's allocation state.
+ *
+ * When a bitmap has a clone (bi_clone), the search is done in the clone
+ * rather than in the buffer itself, and the new state is written to both.
+ *
+ * This function never fails, because we wouldn't call it unless we
+ * know (from reservation results, etc.) that a block is available.
+ * If the search nevertheless comes up empty, gfs2_assert_withdraw() is
+ * invoked and block 0 of the last bitmap examined is used.
+ *
+ * Scope of @goal and returned block is just within rgrp, not the whole
+ * filesystem.
+ *
+ * Returns:  the block number allocated
+ */
+static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
+                             unsigned char old_state, unsigned char new_state)
+{
+        struct gfs2_bitmap *bi = NULL;
+        u32 length = rgd->rd_ri.ri_length;
+        u32 blk = 0;
+        unsigned int buf, x;
+        /* Find bitmap block that contains bits for goal block */
+        for (buf = 0; buf < length; buf++) {
+                bi = rgd->rd_bits + buf;
+                if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+                        break;
+        }
+        gfs2_assert(rgd->rd_sbd, buf < length);
+        /* Convert scope of "goal" from rgrp-wide to within found bit block */
+        goal -= bi->bi_start * GFS2_NBBY;
+        /* Search (up to entire) bitmap in this rgrp for allocatable block.
+           "x <= length", instead of "x < length", because we typically start
+           the search in the middle of a bit block, but if we can't find an
+           allocatable block anywhere else, we want to be able wrap around and
+           search in the first part of our first-searched bit block.  */
+        for (x = 0; x <= length; x++) {
+                /* Prefer the clone, if one exists, as the bitmap to search */
+                if (bi->bi_clone)
+                        blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+                                          bi->bi_len, goal, old_state);
+                else
+                        blk = gfs2_bitfit(rgd,
+                                          bi->bi_bh->b_data + bi->bi_offset,
+                                          bi->bi_len, goal, old_state);
+                if (blk != BFITNOENT)
+                        break;
+                /* Try next bitmap block (wrap back to rgrp header if at end) */
+                buf = (buf + 1) % length;
+                bi = rgd->rd_bits + buf;
+                goal = 0;
+        }
+        /* x is length + 1 here only if every pass returned BFITNOENT */
+        if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
+                blk = 0;
+        /* Journal the bitmap buffer, then flip the bits in it (and the clone) */
+        gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+        gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+                    bi->bi_len, blk, new_state);
+        if (bi->bi_clone)
+                gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
+                            bi->bi_len, blk, new_state);
+        /* Convert back from bitmap-relative to rgrp-relative block number */
+        return bi->bi_start * GFS2_NBBY + blk;
+}
+/**
+ * rgblk_free - Change alloc state of given block(s)
+ * @sdp: the filesystem
+ * @bstart: the start of a run of blocks to free
+ * @blen: the length of the block run (all must lie within ONE RG!)
+ * @new_state: GFS2_BLKST_XXX the state to put the block(s) into
+ *
+ * The resource group is looked up from @bstart. Only the bitmap buffers
+ * are modified here; any clone of a bitmap keeps its previous contents.
+ *
+ * Returns:  Resource group containing the block(s), or NULL if no
+ *           resource group contains @bstart
+ */
+static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
+                                     u32 blen, unsigned char new_state)
+{
+        struct gfs2_rgrpd *rgd;
+        struct gfs2_bitmap *bi = NULL;
+        u32 length, rgrp_blk, buf_blk;
+        unsigned int buf;
+        rgd = gfs2_blk2rgrpd(sdp, bstart);
+        if (!rgd) {
+                if (gfs2_consist(sdp))
+                        fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
+                return NULL;
+        }
+        length = rgd->rd_ri.ri_length;
+        /* Block number relative to the first data block of the rgrp */
+        rgrp_blk = bstart - rgd->rd_ri.ri_data0;
+        while (blen--) {
+                /* Find the bitmap covering the current block */
+                for (buf = 0; buf < length; buf++) {
+                        bi = rgd->rd_bits + buf;
+                        if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+                                break;
+                }
+                gfs2_assert(rgd->rd_sbd, buf < length);
+                buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
+                rgrp_blk++;
+                /* First change to this bitmap: snapshot its current bits into
+                   a clone before the buffer is modified below */
+                if (!bi->bi_clone) {
+                        bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+                                               GFP_NOFS | __GFP_NOFAIL);
+                        memcpy(bi->bi_clone + bi->bi_offset,
+                               bi->bi_bh->b_data + bi->bi_offset,
+                               bi->bi_len);
+                }
+                /* Journal the buffer and set the new state in it only */
+                gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+                gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+                            bi->bi_len, buf_blk, new_state);
+        }
+        return rgd;
+}
+/**
+ * gfs2_alloc_data - Allocate a data block
+ * @ip: the inode to allocate the data block for
+ *
+ * The block is taken from the resource group recorded in the inode's
+ * allocation structure (ip->i_alloc.al_rgd).
+ *
+ * Returns: the allocated block
+ */
+u64 gfs2_alloc_data(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_rgrpd *rgd = al->al_rgd;
+        u32 goal, blk;
+        u64 block;
+        /* Start at the inode's data goal if it lies in this rgrp, otherwise
+           at the rgrp's last data allocation */
+        if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
+                goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
+        else
+                goal = rgd->rd_last_alloc_data;
+        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+        rgd->rd_last_alloc_data = blk;
+        /* Convert the rgrp-relative result to a filesystem block number */
+        block = rgd->rd_ri.ri_data0 + blk;
+        ip->i_di.di_goal_data = block;
+        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+        rgd->rd_rg.rg_free--;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        al->al_alloced++;
+        gfs2_statfs_change(sdp, 0, -1, 0);
+        gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+        /* rd_free_clone is updated under sd_rindex_spin */
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd->rd_free_clone--;
+        spin_unlock(&sdp->sd_rindex_spin);
+        return block;
+}
+/**
+ * gfs2_alloc_meta - Allocate a metadata block
+ * @ip: the inode to allocate the metadata block for
+ *
+ * The block is taken from the resource group recorded in the inode's
+ * allocation structure (ip->i_alloc.al_rgd). Unlike gfs2_alloc_data(),
+ * the new block is also passed to gfs2_trans_add_unrevoke().
+ *
+ * Returns: the allocated block
+ */
+u64 gfs2_alloc_meta(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_alloc *al = &ip->i_alloc;
+        struct gfs2_rgrpd *rgd = al->al_rgd;
+        u32 goal, blk;
+        u64 block;
+        /* Start at the inode's metadata goal if it lies in this rgrp,
+           otherwise at the rgrp's last metadata allocation */
+        if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
+                goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
+        else
+                goal = rgd->rd_last_alloc_meta;
+        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+        rgd->rd_last_alloc_meta = blk;
+        /* Convert the rgrp-relative result to a filesystem block number */
+        block = rgd->rd_ri.ri_data0 + blk;
+        ip->i_di.di_goal_meta = block;
+        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+        rgd->rd_rg.rg_free--;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        al->al_alloced++;
+        gfs2_statfs_change(sdp, 0, -1, 0);
+        gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+        gfs2_trans_add_unrevoke(sdp, block);
+        /* rd_free_clone is updated under sd_rindex_spin */
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd->rd_free_clone--;
+        spin_unlock(&sdp->sd_rindex_spin);
+        return block;
+}
+/**
+ * gfs2_alloc_di - Allocate a dinode
+ * @dip: the directory that the inode is going in
+ * @generation: filled in with the resource group's rg_igeneration value,
+ *              which is then incremented
+ *
+ * The block comes from the resource group recorded in @dip's allocation
+ * structure and is marked GFS2_BLKST_DINODE. No quota change is made here.
+ *
+ * Returns: the block allocated
+ */
+u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+        struct gfs2_alloc *al = &dip->i_alloc;
+        struct gfs2_rgrpd *rgd = al->al_rgd;
+        u32 blk;
+        u64 block;
+        /* The search always starts at the rgrp's last metadata allocation */
+        blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
+                           GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+        rgd->rd_last_alloc_meta = blk;
+        /* Convert the rgrp-relative result to a filesystem block number */
+        block = rgd->rd_ri.ri_data0 + blk;
+        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+        rgd->rd_rg.rg_free--;
+        rgd->rd_rg.rg_dinodes++;
+        *generation = rgd->rd_rg.rg_igeneration++;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        al->al_alloced++;
+        gfs2_statfs_change(sdp, 0, -1, +1);
+        gfs2_trans_add_unrevoke(sdp, block);
+        /* rd_free_clone is updated under sd_rindex_spin */
+        spin_lock(&sdp->sd_rindex_spin);
+        rgd->rd_free_clone--;
+        spin_unlock(&sdp->sd_rindex_spin);
+        return block;
+}
+/**
+ * gfs2_free_data - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ * Does nothing further if no resource group contains @bstart.
+ */
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrpd *rgd;
+        /* Mark the run free in the bitmaps; NULL means no rgrp was found */
+        rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+        if (!rgd)
+                return;
+        rgd->rd_rg.rg_free += blen;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_trans_add_rg(rgd);
+        /* Account the freed blocks in statfs and in the owner's quota */
+        gfs2_statfs_change(sdp, 0, +blen, 0);
+        gfs2_quota_change(ip, -(s64)blen,
+                         ip->i_di.di_uid, ip->i_di.di_gid);
+}
+/**
+ * gfs2_free_meta - free a contiguous run of metadata block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ * Same accounting as gfs2_free_data(), followed by gfs2_meta_wipe() on
+ * the freed range. Does nothing further if no resource group contains
+ * @bstart.
+ */
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        struct gfs2_rgrpd *rgd;
+        /* Mark the run free in the bitmaps; NULL means no rgrp was found */
+        rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+        if (!rgd)
+                return;
+        rgd->rd_rg.rg_free += blen;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_trans_add_rg(rgd);
+        /* Account the freed blocks in statfs and in the owner's quota */
+        gfs2_statfs_change(sdp, 0, +blen, 0);
+        gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
+        gfs2_meta_wipe(ip, bstart, blen);
+}
+/**
+ * gfs2_unlink_di - mark an inode's dinode block as unlinked
+ * @inode: the inode whose block (i_num.no_addr) changes state
+ *
+ * Sets the block's bitmap state to GFS2_BLKST_UNLINKED. The resource
+ * group counters are not modified here; the rgrp header is written out
+ * again and the rgrp is added to the transaction.
+ */
+void gfs2_unlink_di(struct inode *inode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_rgrpd *rgd;
+        u64 blkno = ip->i_num.no_addr;
+        /* NULL means no resource group contains the block */
+        rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+        if (!rgd)
+                return;
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_trans_add_rg(rgd);
+}
+/*
+ * gfs2_free_uninit_di - free a dinode block and fix up the rgrp counters
+ * @rgd: the resource group the caller expects the block to be in
+ * @blkno: the dinode's block number
+ *
+ * Marks the block GFS2_BLKST_FREE, moves one unit from rg_dinodes to
+ * rg_free and updates statfs. No quota change or metadata wipe is done
+ * here.
+ */
+static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        struct gfs2_rgrpd *tmp_rgd;
+        tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
+        if (!tmp_rgd)
+                return;
+        /* The rgrp found from the block number must be the caller's rgrp */
+        gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
+        /* A zero dinode count is flagged as an rgrp inconsistency */
+        if (!rgd->rd_rg.rg_dinodes)
+                gfs2_consist_rgrpd(rgd);
+        rgd->rd_rg.rg_dinodes--;
+        rgd->rd_rg.rg_free++;
+        /* Journal the rgrp header buffer and write the updated counters */
+        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_statfs_change(sdp, 0, +1, -1);
+        gfs2_trans_add_rg(rgd);
+}
+/**
+ * gfs2_free_di - free an inode's dinode block
+ * @rgd: the resource group the dinode block is expected to be in
+ * @ip: the inode whose block (i_num.no_addr) is freed
+ *
+ * Frees the block via gfs2_free_uninit_di(), gives one unit of quota back
+ * to the inode's uid/gid and calls gfs2_meta_wipe() on the block.
+ */
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+        gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
+        gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
+        gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
+}
+/**
+ * gfs2_rlist_add - record the RG of a block in a list of RGs
+ * @sdp: the filesystem
+ * @rlist: the list of resource groups
+ * @block: the block
+ *
+ * Looks up the RG containing @block and appends it to @rlist unless it is
+ * already present. The array grows by ten slots whenever it is full.
+ * Must not be called once the holders (rl_ghs) have been allocated.
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+                    u64 block)
+{
+        struct gfs2_rgrpd *rgd;
+        unsigned int i;
+        if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
+                return;
+        rgd = gfs2_blk2rgrpd(sdp, block);
+        if (!rgd) {
+                if (gfs2_consist(sdp))
+                        fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+                return;
+        }
+        /* Nothing to do if this RG has been recorded before */
+        for (i = 0; i < rlist->rl_rgrps; i++) {
+                if (rlist->rl_rgd[i] == rgd)
+                        return;
+        }
+        /* Array full (or not yet allocated): move to a larger, zeroed one */
+        if (rlist->rl_rgrps == rlist->rl_space) {
+                unsigned int grown_space = rlist->rl_space + 10;
+                struct gfs2_rgrpd **grown;
+                grown = kcalloc(grown_space, sizeof(struct gfs2_rgrpd *),
+                                GFP_NOFS | __GFP_NOFAIL);
+                if (rlist->rl_rgd) {
+                        memcpy(grown, rlist->rl_rgd,
+                               rlist->rl_space * sizeof(struct gfs2_rgrpd *));
+                        kfree(rlist->rl_rgd);
+                }
+                rlist->rl_rgd = grown;
+                rlist->rl_space = grown_space;
+        }
+        rlist->rl_rgd[rlist->rl_rgrps] = rgd;
+        rlist->rl_rgrps++;
+}
+/**
+ * gfs2_rlist_alloc - allocate and initialize the glock holders for an
+ *      rlist once every RG has been added to it
+ * @rlist: the list of resource groups
+ * @state: the lock state to acquire the RG lock in
+ * @flags: the modifier flags for the holder structures
+ *
+ * One holder is set up per RG in the list, at the same array index.
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+                      int flags)
+{
+        struct gfs2_holder *ghs;
+        unsigned int i;
+        ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
+                      GFP_NOFS | __GFP_NOFAIL);
+        rlist->rl_ghs = ghs;
+        for (i = 0; i < rlist->rl_rgrps; i++)
+                gfs2_holder_init(rlist->rl_rgd[i]->rd_gl, state, flags,
+                                 &ghs[i]);
+}
+/**
+ * gfs2_rlist_free - free a resource group list
+ * @rlist: the list of resource groups
+ *
+ * Frees the RG pointer array and, if holders were allocated, uninitializes
+ * each of them before freeing the holder array. The pointers stored in
+ * @rlist are not reset.
+ */
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
+{
+        struct gfs2_holder *ghs = rlist->rl_ghs;
+        unsigned int i;
+        kfree(rlist->rl_rgd);
+        if (!ghs)
+                return;
+        for (i = 0; i < rlist->rl_rgrps; i++)
+                gfs2_holder_uninit(ghs + i);
+        kfree(ghs);
+}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..9eedfd12bfff
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __RGRP_DOT_H__
+#define __RGRP_DOT_H__
+/*
+ * Forward declarations for every structure that is only used through a
+ * pointer below, so that the prototypes name file-scope types no matter
+ * which headers the includer has pulled in beforehand.
+ */
+struct gfs2_rgrpd;
+struct gfs2_sbd;
+struct gfs2_holder;
+struct gfs2_inode;
+struct gfs2_alloc;
+struct inode;
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+/* Intentionally empty counterpart to gfs2_alloc_get() */
+static inline void gfs2_alloc_put(struct gfs2_inode *ip)
+{
+        return; /* So we can see where ip->i_alloc is used */
+}
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
+                         char *file, unsigned int line);
+/* Wrapper that passes the call site's __FILE__ and __LINE__ along */
+#define gfs2_inplace_reserve(ip) \
+gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+void gfs2_inplace_release(struct gfs2_inode *ip);
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+u64 gfs2_alloc_data(struct gfs2_inode *ip);
+u64 gfs2_alloc_meta(struct gfs2_inode *ip);
+u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+/* A growable set of resource groups plus one glock holder for each */
+struct gfs2_rgrp_list {
+        unsigned int rl_rgrps;          /* entries of rl_rgd in use */
+        unsigned int rl_space;          /* entries allocated in rl_rgd */
+        struct gfs2_rgrpd **rl_rgd;     /* the resource groups */
+        struct gfs2_holder *rl_ghs;     /* set up by gfs2_rlist_alloc() */
+};
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+                    u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+                      int flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+/*
+ * Zero-terminated tables of older on-disk format numbers, consulted by
+ * gfs2_check_sb() when the superblock's format does not match the current
+ * one. Both tables currently hold only the terminator.
+ */
+static const u32 gfs2_old_fs_formats[] = {
+        0
+};
+static const u32 gfs2_old_multihost_formats[] = {
+        0
+};
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ * Initializes the structure's spinlock and assigns every tunable its
+ * built-in default.
+ */
+void gfs2_tune_init(struct gfs2_tune *gt)
+{
+        spin_lock_init(&gt->gt_spin);
+        gt->gt_ilimit = 100;
+        gt->gt_ilimit_tries = 3;
+        gt->gt_ilimit_min = 1;
+        gt->gt_demote_secs = 300;
+        gt->gt_incore_log_blocks = 1024;
+        gt->gt_log_flush_secs = 60;
+        gt->gt_jindex_refresh_secs = 60;
+        gt->gt_scand_secs = 15;
+        gt->gt_recoverd_secs = 60;
+        gt->gt_logd_secs = 1;
+        gt->gt_quotad_secs = 5;
+        gt->gt_quota_simul_sync = 64;
+        gt->gt_quota_warn_period = 10;
+        gt->gt_quota_scale_num = 1;
+        gt->gt_quota_scale_den = 1;
+        gt->gt_quota_cache_secs = 300;
+        gt->gt_quota_quantum = 60;
+        gt->gt_atime_quantum = 3600;
+        gt->gt_new_files_jdata = 0;
+        gt->gt_new_files_directio = 0;
+        gt->gt_max_atomic_write = 4 << 20;
+        gt->gt_max_readahead = 1 << 18;
+        gt->gt_lockdump_size = 131072;
+        gt->gt_stall_secs = 600;
+        gt->gt_complain_secs = 10;
+        gt->gt_reclaim_limit = 5000;
+        gt->gt_entries_per_readdir = 32;
+        gt->gt_prefetch_secs = 10;
+        /* The greedy values are fractions of HZ, not plain counts */
+        gt->gt_greedy_default = HZ / 10;
+        gt->gt_greedy_quantum = HZ / 40;
+        gt->gt_greedy_max = HZ / 4;
+        gt->gt_statfs_quantum = 30;
+        gt->gt_statfs_slow = 0;
+}
+/* Return 1 if @fmt is listed in the zero-terminated table @old, else 0 */
+static int sb_format_is_old(const u32 *old, u32 fmt)
+{
+        for (; *old; old++)
+                if (*old == fmt)
+                        return 1;
+        return 0;
+}
+/* Log the format pair this code supports next to the pair found on disk */
+static void sb_warn_incompatible(struct gfs2_sb *sb)
+{
+        printk(KERN_WARNING
+               "GFS2: code version (%u, %u) is incompatible "
+               "with ondisk format (%u, %u)\n",
+               GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+               sb->sb_fs_format, sb->sb_multihost_format);
+}
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print the "not a GFS2 filesystem" message
+ *
+ * Verifies the magic number and metadata type, then the two format
+ * numbers. A format that differs from the current one is accepted only
+ * when it is a known old format and the "upgrade" mount option is set.
+ *
+ * Returns: 0 if the superblock is acceptable, -EINVAL otherwise
+ */
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
+{
+        int fs_ok = sb->sb_fs_format == GFS2_FORMAT_FS;
+        int multi_ok = sb->sb_multihost_format == GFS2_FORMAT_MULTI;
+        if (sb->sb_header.mh_magic != GFS2_MAGIC ||
+            sb->sb_header.mh_type != GFS2_METATYPE_SB) {
+                if (!silent)
+                        printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+                return -EINVAL;
+        }
+        /*  Both format numbers current: nothing more to check.  */
+        if (fs_ok && multi_ok)
+                return 0;
+        /*  A mismatching format must at least be a known old one.  */
+        if ((!fs_ok &&
+             !sb_format_is_old(gfs2_old_fs_formats, sb->sb_fs_format)) ||
+            (!multi_ok &&
+             !sb_format_is_old(gfs2_old_multihost_formats,
+                               sb->sb_multihost_format))) {
+                sb_warn_incompatible(sb);
+                printk(KERN_WARNING
+                       "GFS2: I don't know how to upgrade this FS\n");
+                return -EINVAL;
+        }
+        /*  Old but known formats are only accepted on explicit request.  */
+        if (sdp->sd_args.ar_upgrade)
+                return 0;
+        sb_warn_incompatible(sb);
+        printk(KERN_INFO
+               "GFS2: Use the \"upgrade\" mount option to upgrade "
+               "the FS\n");
+        printk(KERN_INFO "GFS2: See the manual for more details\n");
+        return -EINVAL;
+}
+/*
+ * Completion handler for the bio submitted by gfs2_read_super(). While the
+ * bio still has bytes outstanding (bi_size != 0) it returns 1 and does
+ * nothing; otherwise it marks the page up to date on success, or logs the
+ * error, and unlocks the page in either case.
+ */
+static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
+{
+        struct page *page;
+        if (bio->bi_size)
+                return 1;
+        page = bio->bi_private;
+        if (error)
+                printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+        else
+                SetPageUptodate(page);
+        unlock_page(page);
+        return 0;
+}
+/**
+ * gfs2_read_super - Read one page from the block device into a new page
+ * @sb: the VFS super block whose block device (s_bdev) is read
+ * @sector: the sector at which the read starts
+ *
+ * Allocates a page, submits a single-page read for it and waits until the
+ * completion handler has unlocked the page.
+ *
+ * Returns: the page on success; the caller releases it with __free_page().
+ *          NULL if an allocation failed or the page did not become up to
+ *          date.
+ */
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
+{
+        struct page *page;
+        struct bio *bio;
+        page = alloc_page(GFP_KERNEL);
+        if (unlikely(!page))
+                return NULL;
+        /* Get the bio before locking the page, so that the failure path
+           below never hands a locked page back to the allocator */
+        bio = bio_alloc(GFP_KERNEL, 1);
+        if (unlikely(!bio)) {
+                __free_page(page);
+                return NULL;
+        }
+        ClearPageUptodate(page);
+        ClearPageDirty(page);
+        /* end_bio_io_page() unlocks the page when the read has finished */
+        lock_page(page);
+        bio->bi_sector = sector;
+        bio->bi_bdev = sb->s_bdev;
+        bio_add_page(bio, page, PAGE_SIZE, 0);
+        bio->bi_end_io = end_bio_io_page;
+        bio->bi_private = page;
+        submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+        wait_on_page_locked(page);
+        bio_put(bio);
+        /* The completion handler only sets Uptodate when there was no error */
+        if (!PageUptodate(page)) {
+                __free_page(page);
+                return NULL;
+        }
+        return page;
+}
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held); not referenced
+ *      in the function body
+ * @silent: Don't print message if mount fails
+ *
+ * Reads the on-disk superblock into sdp->sd_sb, validates it with
+ * gfs2_check_sb() and derives the block-size dependent constants kept in
+ * @sdp.
+ *
+ * Returns: 0 on success, -EIO if the superblock could not be read, or the
+ *          error from gfs2_check_sb()
+ */
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+        u32 hash_blocks, ind_blocks, leaf_blocks;
+        u32 tmp_blocks;
+        unsigned int x;
+        int error;
+        struct page *page;
+        char *sb;
+        page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+        if (!page) {
+                if (!silent)
+                        fs_err(sdp, "can't read superblock\n");
+                return -EIO;
+        }
+        /* Copy the superblock out of the page, then release the page */
+        sb = kmap(page);
+        gfs2_sb_in(&sdp->sd_sb, sb);
+        kunmap(page);
+        __free_page(page);
+        error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+        if (error)
+                return error;
+        /* Constants derived from the filesystem block size */
+        sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+                               GFS2_BASIC_BLOCK_SHIFT;
+        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+        sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+                          sizeof(struct gfs2_dinode)) / sizeof(u64);
+        sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+                          sizeof(struct gfs2_meta_header)) / sizeof(u64);
+        sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+        sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+        sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+        sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+        sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+                                sizeof(struct gfs2_meta_header)) /
+                                sizeof(struct gfs2_quota_change);
+        /* Compute maximum reservation required to add a entry to a directory */
+        hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+                             sdp->sd_jbsize);
+        ind_blocks = 0;
+        /* Add one level of indirect blocks per pass until the remaining
+           pointers fit into a dinode */
+        for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+                tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+                ind_blocks += tmp_blocks;
+        }
+        leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+        sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+        /* sd_heightsize[h]: each entry from index 2 up is the previous one
+           times sd_inptrs */
+        sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+                                sizeof(struct gfs2_dinode);
+        sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+        for (x = 2;; x++) {
+                u64 space, d;
+                u32 m;
+                space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+                /* Divide back: if that does not give the previous entry
+                   exactly, the multiplication overflowed - stop here */
+                d = space;
+                m = do_div(d, sdp->sd_inptrs);
+                if (d != sdp->sd_heightsize[x - 1] || m)
+                        break;
+                sdp->sd_heightsize[x] = space;
+        }
+        sdp->sd_max_height = x;
+        gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+        /* Same table again, with index 1 based on sd_jbsize instead of the
+           full block size */
+        sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+                                 sizeof(struct gfs2_dinode);
+        sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+        for (x = 2;; x++) {
+                u64 space, d;
+                u32 m;
+                space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+                /* Same overflow check as in the loop above */
+                d = space;
+                m = do_div(d, sdp->sd_inptrs);
+                if (d != sdp->sd_jheightsize[x - 1] || m)
+                        break;
+                sdp->sd_jheightsize[x] = space;
+        }
+        sdp->sd_max_jheight = x;
+        gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+        return 0;
+}
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * This is very similar to the gfs2_rindex_hold() function, except that
+ * in general we hold the jindex lock for longer periods of time and
+ * we grab it far less frequently (in general) then the rgrp lock.
+ *
+ * While holding sd_jindex_mutex, looks up "journal<N>" entries in the
+ * jindex directory starting at N = sd_journals and adds a descriptor to
+ * sd_jindex_list for each one found. On success (0) the glock acquired
+ * through @ji_gh is still held; on every error path it has been dropped
+ * or was never acquired.
+ *
+ * Returns: errno
+ */
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+        struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+        struct qstr name;
+        char buf[20];
+        struct gfs2_jdesc *jd;
+        int error;
+        name.name = buf;
+        mutex_lock(&sdp->sd_jindex_mutex);
+        for (;;) {
+                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
+                                           GL_LOCAL_EXCL, ji_gh);
+                if (error)
+                        break;
+                /* Name of the next journal not yet on our list */
+                name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+                name.hash = gfs2_disk_hash(name.name, name.len);
+                error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
+                /* No such entry: the list is complete. Leave with the glock
+                   still held */
+                if (error == -ENOENT) {
+                        error = 0;
+                        break;
+                }
+                /* Drop the glock before doing the lookup and allocation */
+                gfs2_glock_dq_uninit(ji_gh);
+                if (error)
+                        break;
+                error = -ENOMEM;
+                jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+                if (!jd)
+                        break;
+                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
+                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+                        if (!jd->jd_inode)
+                                error = -ENOENT;
+                        else
+                                error = PTR_ERR(jd->jd_inode);
+                        kfree(jd);
+                        break;
+                }
+                /* Assign the next journal id and publish the descriptor */
+                spin_lock(&sdp->sd_jindex_spin);
+                jd->jd_jid = sdp->sd_journals++;
+                list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+                spin_unlock(&sdp->sd_jindex_spin);
+        }
+        mutex_unlock(&sdp->sd_jindex_mutex);
+        return error;
+}
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ * Empties sd_jindex_list, resets sd_journals to zero and releases every
+ * journal descriptor together with its inode reference.
+ */
+void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+        struct list_head list;
+        struct gfs2_jdesc *jd;
+        spin_lock(&sdp->sd_jindex_spin);
+        /* Link the local head into the ring right after the real head, then
+           unlink the real head: every entry now hangs off the local head
+           and sd_jindex_list is empty again */
+        list_add(&list, &sdp->sd_jindex_list);
+        list_del_init(&sdp->sd_jindex_list);
+        sdp->sd_journals = 0;
+        spin_unlock(&sdp->sd_jindex_spin);
+        /* The entries are torn down after the spinlock has been released */
+        while (!list_empty(&list)) {
+                jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+                list_del(&jd->jd_list);
+                iput(jd->jd_inode);
+                kfree(jd);
+        }
+}
+/*
+ * Return the journal descriptor on @head whose jd_jid equals @jid, or NULL
+ * if there is none. The callers in this file hold sd_jindex_spin.
+ */
+static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
+{
+        struct gfs2_jdesc *cur;
+        list_for_each_entry(cur, head, jd_list)
+                if (cur->jd_jid == jid)
+                        return cur;
+        return NULL;
+}
+/*
+ * Look up the journal descriptor with id @jid under sd_jindex_spin.
+ * Returns the descriptor, or NULL if the list has no such journal.
+ */
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
+{
+        struct gfs2_jdesc *match;
+        spin_lock(&sdp->sd_jindex_spin);
+        match = jdesc_find_i(&sdp->sd_jindex_list, jid);
+        spin_unlock(&sdp->sd_jindex_spin);
+        return match;
+}
+/*
+ * Set the dirty flag of the journal with id @jid. An unknown @jid is
+ * silently ignored.
+ */
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+        struct gfs2_jdesc *target;
+        spin_lock(&sdp->sd_jindex_spin);
+        target = jdesc_find_i(&sdp->sd_jindex_list, jid);
+        if (target != NULL)
+                target->jd_dirty = 1;
+        spin_unlock(&sdp->sd_jindex_spin);
+}
+/*
+ * Find the first journal on the list that is marked dirty, clear its dirty
+ * flag and return it. Returns NULL when no journal is dirty.
+ */
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+        struct gfs2_jdesc *cur;
+        struct gfs2_jdesc *dirty = NULL;
+        spin_lock(&sdp->sd_jindex_spin);
+        list_for_each_entry(cur, &sdp->sd_jindex_list, jd_list) {
+                if (!cur->jd_dirty)
+                        continue;
+                /* Claim it: the flag is cleared while the lock is held */
+                cur->jd_dirty = 0;
+                dirty = cur;
+                break;
+        }
+        spin_unlock(&sdp->sd_jindex_spin);
+        return dirty;
+}
+/*
+ * Sanity-check a journal's inode and record its length in blocks.
+ *
+ * The journal size must be no smaller than (8 << 20) bytes, no larger than
+ * (1 << 30) bytes, and have no bits set below sb_bsize; the journal must
+ * also not need any allocation to be written.  Any violation is reported
+ * through gfs2_consist_inode().
+ *
+ * Returns: 0 if the journal looks sane, -EIO if it does not, or the error
+ *          from gfs2_write_alloc_required()
+ */
+int gfs2_jdesc_check(struct gfs2_jdesc *jd)
+{
+        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+        int alloc_required;
+        int error;
+        if (ip->i_di.di_size < (8 << 20))
+                goto inconsistent;
+        if (ip->i_di.di_size > (1 << 30))
+                goto inconsistent;
+        if (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))
+                goto inconsistent;
+        jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+        error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size,
+                                          &alloc_required);
+        if (error)
+                return error;
+        if (!alloc_required)
+                return 0;
+inconsistent:
+        gfs2_consist_inode(ip);
+        return -EIO;
+}
+/**
+ * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
+ * @sdp: the filesystem
+ *
+ * Holds the transaction glock (shared) while it re-reads the head of the
+ * journal in sd_jdesc.  The switch is refused with -EIO unless that head
+ * has GFS2_LOG_HEAD_UNMOUNT set.  On success the log sequence/pointers and
+ * quota state are initialised and SDF_JOURNAL_LIVE is set.
+ *
+ * Returns: errno
+ */
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
+{
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+        struct gfs2_glock *j_gl = ip->i_gl;
+        struct gfs2_holder t_gh;
+        struct gfs2_log_header head;
+        int error;
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+                                   GL_LOCAL_EXCL, &t_gh);
+        if (error)
+                return error;
+        /*
+         * NOTE(review): presumably these two calls discard cached journal
+         * metadata/data so the head below is read fresh - confirm.
+         */
+        gfs2_meta_cache_flush(ip);
+        j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+        error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+        if (error)
+                goto fail;
+        /* A head without the UNMOUNT flag is treated as an inconsistency. */
+        if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+                gfs2_consist(sdp);
+                error = -EIO;
+                goto fail;
+        }
+        /*  Initialize some head of the log stuff  */
+        sdp->sd_log_sequence = head.lh_sequence + 1;
+        gfs2_log_pointers_init(sdp, head.lh_blkno);
+        error = gfs2_quota_init(sdp);
+        if (error)
+                goto fail;
+        set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+        gfs2_glock_dq_uninit(&t_gh);
+        return 0;
+fail:
+        /* On failure the holder is released with GL_NOCACHE added. */
+        t_gh.gh_flags |= GL_NOCACHE;
+        gfs2_glock_dq_uninit(&t_gh);
+        return error;
+}
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Syncs quota and statfs data (their results are not checked), then shuts
+ * the log down under the transaction glock and clears SDF_JOURNAL_LIVE.
+ * If the glock cannot be taken the function gives up, unless SDF_SHUTDOWN
+ * is set, in which case the teardown still runs and the glock error is
+ * returned afterwards.
+ *
+ * Returns: errno
+ */
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+        struct gfs2_holder t_gh;
+        int error;
+        gfs2_quota_sync(sdp);
+        gfs2_statfs_sync(sdp);
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+                                GL_LOCAL_EXCL | GL_NOCACHE,
+                                &t_gh);
+        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return error;
+        gfs2_meta_syncfs(sdp);
+        gfs2_log_shutdown(sdp);
+        clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+        /*
+         * NOTE(review): this relies on gfs2_glock_nq_init() leaving
+         * t_gh.gh_gl NULL when it fails - confirm in glock.c.
+         */
+        if (t_gh.gh_gl)
+                gfs2_glock_dq_uninit(&t_gh);
+        gfs2_quota_cleanup(sdp);
+        return error;
+}
+/**
+ * gfs2_statfs_init - Load the in-core statfs counters from disk
+ * @sdp: the filesystem
+ *
+ * Under an exclusive glock on the statfs inode, reads the master statfs
+ * change record that follows the dinode header into sd_statfs_master.
+ * Unless the filesystem is mounted as a spectator, the local record of the
+ * sc inode is read into sd_statfs_local as well.
+ *
+ * Returns: 0 on success, or the errno from taking the glock or from
+ *          reading either inode buffer
+ */
+int gfs2_statfs_init(struct gfs2_sbd *sdp)
+{
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+        struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+        struct buffer_head *m_bh, *l_bh;
+        struct gfs2_holder gh;
+        int error;
+        error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+                                   &gh);
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+        if (error)
+                goto out;
+        if (sdp->sd_args.ar_spectator) {
+                /* Spectators only load the master record. */
+                spin_lock(&sdp->sd_statfs_spin);
+                gfs2_statfs_change_in(m_sc, m_bh->b_data +
+                                      sizeof(struct gfs2_dinode));
+                spin_unlock(&sdp->sd_statfs_spin);
+        } else {
+                error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+                if (error)
+                        goto out_m_bh;
+                spin_lock(&sdp->sd_statfs_spin);
+                gfs2_statfs_change_in(m_sc, m_bh->b_data +
+                                      sizeof(struct gfs2_dinode));
+                gfs2_statfs_change_in(l_sc, l_bh->b_data +
+                                      sizeof(struct gfs2_dinode));
+                spin_unlock(&sdp->sd_statfs_spin);
+                brelse(l_bh);
+        }
+out_m_bh:
+        brelse(m_bh);
+out:
+        gfs2_glock_dq_uninit(&gh);
+        /* Propagate buffer read failures instead of reporting success. */
+        return error;
+}
+/*
+ * Add the given deltas to the in-core local statfs counters and write the
+ * updated local record into the sc inode's dinode block.
+ *
+ * If the inode buffer cannot be read the change is dropped without any
+ * report to the caller.
+ * NOTE(review): gfs2_trans_add_bh() is called without a gfs2_trans_begin()
+ * here, so the caller presumably already has a transaction open - confirm.
+ */
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+                        s64 dinodes)
+{
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+        struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+        struct buffer_head *l_bh;
+        int error;
+        error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+        if (error)
+                return;
+        mutex_lock(&sdp->sd_statfs_mutex);
+        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        mutex_unlock(&sdp->sd_statfs_mutex);
+        /* Counter update and on-disk image update happen under one lock. */
+        spin_lock(&sdp->sd_statfs_spin);
+        l_sc->sc_total += total;
+        l_sc->sc_free += free;
+        l_sc->sc_dinodes += dinodes;
+        gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+        spin_unlock(&sdp->sd_statfs_spin);
+        brelse(l_bh);
+}
+/*
+ * Fold the local statfs deltas into the master statfs record.
+ *
+ * Under an exclusive glock on the statfs inode the master record is re-read
+ * from disk.  If all three local counters are zero nothing more is done.
+ * Otherwise, inside a transaction, the local counters are added to the
+ * master, the local record is zeroed both in core and in its buffer, and
+ * the master record is written back to its buffer.
+ *
+ * Returns: errno
+ */
+int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+{
+        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+        struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+        struct gfs2_holder gh;
+        struct buffer_head *m_bh, *l_bh;
+        int error;
+        error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+                                   &gh);
+        if (error)
+                return error;
+        error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+        if (error)
+                goto out;
+        spin_lock(&sdp->sd_statfs_spin);
+        gfs2_statfs_change_in(m_sc, m_bh->b_data +
+                              sizeof(struct gfs2_dinode));
+        /* No local changes pending: master is refreshed, nothing to write. */
+        if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
+                spin_unlock(&sdp->sd_statfs_spin);
+                goto out_bh;
+        }
+        spin_unlock(&sdp->sd_statfs_spin);
+        error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+        if (error)
+                goto out_bh;
+        error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+        if (error)
+                goto out_bh2;
+        mutex_lock(&sdp->sd_statfs_mutex);
+        gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+        mutex_unlock(&sdp->sd_statfs_mutex);
+        /* Move the local deltas into the master and clear the local record. */
+        spin_lock(&sdp->sd_statfs_spin);
+        m_sc->sc_total += l_sc->sc_total;
+        m_sc->sc_free += l_sc->sc_free;
+        m_sc->sc_dinodes += l_sc->sc_dinodes;
+        memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+        memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+               0, sizeof(struct gfs2_statfs_change));
+        spin_unlock(&sdp->sd_statfs_spin);
+        gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+        gfs2_trans_end(sdp);
+out_bh2:
+        brelse(l_bh);
+out_bh:
+        brelse(m_bh);
+out:
+        gfs2_glock_dq_uninit(&gh);
+        return error;
+}
+/**
+ * gfs2_statfs_i - Do a statfs from the in-core counters
+ * @sdp: the filesystem
+ * @sc: filled in with the sum of the master and local statfs records
+ *
+ * The result is clamped so that sc_free lies within [0, sc_total] when
+ * sc_total is non-negative and sc_dinodes is not negative.
+ *
+ * Returns: 0 (always)
+ */
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+        struct gfs2_statfs_change *master = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change *local = &sdp->sd_statfs_local;
+        /* Snapshot master + local under the lock ... */
+        spin_lock(&sdp->sd_statfs_spin);
+        *sc = *master;
+        sc->sc_total += local->sc_total;
+        sc->sc_free += local->sc_free;
+        sc->sc_dinodes += local->sc_dinodes;
+        spin_unlock(&sdp->sd_statfs_spin);
+        /* ... then tidy up the private copy without it. */
+        if (sc->sc_free < 0)
+                sc->sc_free = 0;
+        if (sc->sc_free > sc->sc_total)
+                sc->sc_free = sc->sc_total;
+        if (sc->sc_dinodes < 0)
+                sc->sc_dinodes = 0;
+        return 0;
+}
+/**
+ * statfs_slow_fill - add one resource group's counts to the sc totals
+ * @rgd: the RG
+ * @sc: the sc structure being accumulated into
+ *
+ * Calls gfs2_rgrp_verify() on @rgd, then adds its ri_data, rg_free and
+ * rg_dinodes values to @sc.
+ *
+ * Returns: 0 (always)
+ */
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+                            struct gfs2_statfs_change *sc)
+{
+        gfs2_rgrp_verify(rgd);
+        sc->sc_total += rgd->rd_ri.ri_data;
+        sc->sc_free += rgd->rd_rg.rg_free;
+        sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
+        return 0;
+}
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Walks every resource group, keeping up to 64 asynchronous shared glock
+ * requests in flight, and sums the per-RG counts into @sc.
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ * NOTE(review): no synchronous fallback is visible in this function; once
+ * an error is recorded it only stops issuing new requests and drains the
+ * outstanding ones - confirm whether the sentence above is still accurate.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+        struct gfs2_holder ri_gh;
+        struct gfs2_rgrpd *rgd_next;
+        struct gfs2_holder *gha, *gh;
+        unsigned int slots = 64;
+        unsigned int x;
+        int done;
+        int error = 0, err;
+        memset(sc, 0, sizeof(struct gfs2_statfs_change));
+        /* Zeroed holders: gh_gl == NULL marks a free slot below. */
+        gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+        if (!gha)
+                return -ENOMEM;
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                goto out;
+        rgd_next = gfs2_rgrpd_get_first(sdp);
+        for (;;) {
+                done = 1;
+                for (x = 0; x < slots; x++) {
+                        gh = gha + x;
+                        /* Harvest a slot whose request has completed. */
+                        if (gh->gh_gl && gfs2_glock_poll(gh)) {
+                                err = gfs2_glock_wait(gh);
+                                if (err) {
+                                        gfs2_holder_uninit(gh);
+                                        error = err;
+                                } else {
+                                        if (!error)
+                                                error = statfs_slow_fill(
+                                                        gh->gh_gl->gl_object, sc);
+                                        gfs2_glock_dq_uninit(gh);
+                                }
+                        }
+                        /*
+                         * A still-busy slot keeps the loop alive; a free
+                         * slot is refilled with the next RG unless an error
+                         * has already been recorded.
+                         */
+                        if (gh->gh_gl)
+                                done = 0;
+                        else if (rgd_next && !error) {
+                                error = gfs2_glock_nq_init(rgd_next->rd_gl,
+                                                           LM_ST_SHARED,
+                                                           GL_ASYNC,
+                                                           gh);
+                                rgd_next = gfs2_rgrpd_get_next(rgd_next);
+                                done = 0;
+                        }
+                        if (signal_pending(current))
+                                error = -ERESTARTSYS;
+                }
+                if (done)
+                        break;
+                yield();
+        }
+        gfs2_glock_dq_uninit(&ri_gh);
+out:
+        kfree(gha);
+        return error;
+}
+/*
+ * One held journal glock, kept on a temporary list by
+ * gfs2_lock_fs_check_clean() so that all of them can be released again.
+ */
+struct lfcc {
+        struct list_head list;
+        struct gfs2_holder gh;
+};
+/**
+ * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
+ *                            journals are clean
+ * @sdp: the file system
+ * @t_gh: the hold on the transaction lock
+ *
+ * Takes a shared glock on every journal, then the transaction glock in the
+ * deferred state, and finally checks that each journal passes
+ * gfs2_jdesc_check() and that its head has GFS2_LOG_HEAD_UNMOUNT set.  The
+ * journal glocks and the jindex hold are always released before returning.
+ * @t_gh is left held only when 0 is returned.
+ *
+ * Returns: 0 on success, -EBUSY if a journal is not clean, or another errno
+ */
+static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
+                                    struct gfs2_holder *t_gh)
+{
+        struct gfs2_inode *ip;
+        struct gfs2_holder ji_gh;
+        struct gfs2_jdesc *jd;
+        struct lfcc *lfcc;
+        LIST_HEAD(list);
+        struct gfs2_log_header lh;
+        int error;
+        error = gfs2_jindex_hold(sdp, &ji_gh);
+        if (error)
+                return error;
+        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+                lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
+                if (!lfcc) {
+                        error = -ENOMEM;
+                        goto out;
+                }
+                ip = GFS2_I(jd->jd_inode);
+                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
+                if (error) {
+                        kfree(lfcc);
+                        goto out;
+                }
+                list_add(&lfcc->list, &list);
+        }
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
+                               LM_FLAG_PRIORITY | GL_NOCACHE,
+                               t_gh);
+        /*
+         * Without the transaction glock there is nothing to check and no
+         * holder to drop: bail out rather than let the loop below overwrite
+         * the error and report a freeze that never took the lock.
+         */
+        if (error)
+                goto out;
+        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+                error = gfs2_jdesc_check(jd);
+                if (error)
+                        break;
+                error = gfs2_find_jhead(jd, &lh);
+                if (error)
+                        break;
+                if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+                        error = -EBUSY;
+                        break;
+                }
+        }
+        /* A journal failed its check: give the transaction glock back. */
+        if (error)
+                gfs2_glock_dq_uninit(t_gh);
+out:
+        while (!list_empty(&list)) {
+                lfcc = list_entry(list.next, struct lfcc, list);
+                list_del(&lfcc->list);
+                gfs2_glock_dq_uninit(&lfcc->gh);
+                kfree(lfcc);
+        }
+        gfs2_glock_dq_uninit(&ji_gh);
+        return error;
+}
+/**
+ * gfs2_freeze_fs - freezes the file system
+ * @sdp: the file system
+ *
+ * Freezes nest: only the first freeze actually acquires the transaction
+ * glock and checks that all journals are clean (through
+ * gfs2_lock_fs_check_clean()); later calls just bump sd_freeze_count.
+ * If that first attempt fails the count is put back.
+ *
+ * Returns: errno
+ */
+int gfs2_freeze_fs(struct gfs2_sbd *sdp)
+{
+        int error = 0;
+        mutex_lock(&sdp->sd_freeze_lock);
+        sdp->sd_freeze_count++;
+        if (sdp->sd_freeze_count == 1) {
+                /* First freezer does the real work. */
+                error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
+                if (error)
+                        sdp->sd_freeze_count--;
+        }
+        mutex_unlock(&sdp->sd_freeze_lock);
+        return error;
+}
+/**
+ * gfs2_unfreeze_fs - unfreezes the file system
+ * @sdp: the file system
+ *
+ * Drops one level of freeze nesting.  When the count reaches zero the hold
+ * on the transaction lock taken by gfs2_freeze_fs() is released, so other
+ * GFS2 nodes are free to acquire the lock shared and go on with their
+ * lives.  Calling this on a filesystem that is not frozen does nothing.
+ *
+ */
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
+{
+        mutex_lock(&sdp->sd_freeze_lock);
+        if (sdp->sd_freeze_count) {
+                sdp->sd_freeze_count--;
+                if (!sdp->sd_freeze_count)
+                        gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+        }
+        mutex_unlock(&sdp->sd_freeze_lock);
+}
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __SUPER_DOT_H__
+#define __SUPER_DOT_H__
+#include "incore.h"
+/*
+ * Tunable and superblock helpers.
+ * NOTE(review): definitions are not in this header; presumably in super.c.
+ */
+void gfs2_tune_init(struct gfs2_tune *gt);
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
+/* Read sdp->sd_journals under sd_jindex_spin and return the value. */
+static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
+{
+        unsigned int nr_journals;
+        spin_lock(&sdp->sd_jindex_spin);
+        nr_journals = sdp->sd_journals;
+        spin_unlock(&sdp->sd_jindex_spin);
+        return nr_journals;
+}
+/* Journal index and journal descriptor helpers */
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+                              struct gfs2_inode **ipp);
+/* Switching between read-only and read-write */
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+/* statfs accounting (master/local change records) */
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp,
+                        s64 total, s64 free, s64 dinodes);
+int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+/* Nested freeze/unfreeze; every freeze needs a matching unfreeze */
+int gfs2_freeze_fs(struct gfs2_sbd *sdp);
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "lm.h"
+#include "sys.h"
+#include "super.h"
+#include "glock.h"
+#include "quota.h"
+#include "util.h"
+/*
+ * Module-wide mount-argument string and its lock.  Reset to NULL in
+ * gfs2_sys_init() and kfree()d in gfs2_sys_uninit().
+ * NOTE(review): nothing in this file writes a non-NULL value; the setter
+ * presumably lives elsewhere - confirm.
+ */
+char *gfs2_sys_margs;
+spinlock_t gfs2_sys_margs_lock;
+/* sysfs "id": print the s_id string of the VFS superblock. */
+static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
+{
+        const char *id = sdp->sd_vfs->s_id;
+        return snprintf(buf, PAGE_SIZE, "%s\n", id);
+}
+/* sysfs "fsname": print sdp->sd_fsname. */
+static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
+{
+        const char *fsname = sdp->sd_fsname;
+        return snprintf(buf, PAGE_SIZE, "%s\n", fsname);
+}
+/* sysfs "freeze": print the current freeze nesting count. */
+static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
+{
+        unsigned int nesting;
+        /* Sample the count under the same mutex the freeze code uses. */
+        mutex_lock(&sdp->sd_freeze_lock);
+        nesting = sdp->sd_freeze_count;
+        mutex_unlock(&sdp->sd_freeze_lock);
+        return snprintf(buf, PAGE_SIZE, "%u\n", nesting);
+}
+/*
+ * sysfs "freeze" write handler: "1" freezes the filesystem, "0" drops one
+ * level of freeze, anything else is rejected.
+ *
+ * Returns: @len on success, -EACCES without CAP_SYS_ADMIN, -EINVAL for any
+ *          other value, or the errno from gfs2_freeze_fs() so that the
+ *          writer learns the freeze did not happen
+ */
+static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        int error = 0;
+        int n;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        n = simple_strtol(buf, NULL, 0);
+        switch (n) {
+        case 0:
+                gfs2_unfreeze_fs(sdp);
+                break;
+        case 1:
+                error = gfs2_freeze_fs(sdp);
+                break;
+        default:
+                return -EINVAL;
+        }
+        if (error) {
+                fs_warn(sdp, "freeze %d error %d\n", n, error);
+                return error;
+        }
+        return len;
+}
+/* sysfs "withdraw": print the state of the SDF_SHUTDOWN flag. */
+static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
+{
+        unsigned int shut_down = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
+        return snprintf(buf, PAGE_SIZE, "%u\n", shut_down);
+}
+/*
+ * sysfs "withdraw" write handler: writing 1 withdraws this node from the
+ * cluster.  Needs CAP_SYS_ADMIN (-EACCES); other values give -EINVAL.
+ */
+static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        long val;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        val = simple_strtol(buf, NULL, 0);
+        if (val != 1)
+                return -EINVAL;
+        gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
+                sdp->sd_fsname);
+        return len;
+}
+/*
+ * sysfs "statfs_sync" write handler: writing 1 runs gfs2_statfs_sync().
+ * The sync's own result is not reported back to the writer.
+ */
+static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
+                                 size_t len)
+{
+        long val;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        val = simple_strtol(buf, NULL, 0);
+        if (val != 1)
+                return -EINVAL;
+        gfs2_statfs_sync(sdp);
+        return len;
+}
+/*
+ * sysfs "shrink" write handler: writing 1 calls gfs2_gl_hash_clear() with
+ * NO_WAIT.  Needs CAP_SYS_ADMIN (-EACCES); other values give -EINVAL.
+ */
+static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        long val;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        val = simple_strtol(buf, NULL, 0);
+        if (val != 1)
+                return -EINVAL;
+        gfs2_gl_hash_clear(sdp, NO_WAIT);
+        return len;
+}
+/*
+ * sysfs "quota_sync" write handler: writing 1 runs gfs2_quota_sync().
+ * The sync's own result is not reported back to the writer.
+ */
+static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
+                                size_t len)
+{
+        long val;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        val = simple_strtol(buf, NULL, 0);
+        if (val != 1)
+                return -EINVAL;
+        gfs2_quota_sync(sdp);
+        return len;
+}
+/*
+ * sysfs "quota_refresh_user" write handler: parse an id from @buf and call
+ * gfs2_quota_refresh() with its second argument set to 1.  The refresh
+ * result is not reported back to the writer.
+ */
+static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
+                                        size_t len)
+{
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        gfs2_quota_refresh(sdp, 1, (u32)simple_strtoul(buf, NULL, 0));
+        return len;
+}
+/*
+ * sysfs "quota_refresh_group" write handler: parse an id from @buf and call
+ * gfs2_quota_refresh() with its second argument set to 0.  The refresh
+ * result is not reported back to the writer.
+ */
+static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
+                                         size_t len)
+{
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        gfs2_quota_refresh(sdp, 0, (u32)simple_strtoul(buf, NULL, 0));
+        return len;
+}
+/*
+ * A sysfs attribute whose handlers receive the gfs2_sbd directly instead
+ * of the kobject.  Either handler may be NULL.
+ */
+struct gfs2_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+        ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+/* Define a static struct gfs2_attr called gfs2_attr_<name>. */
+#define GFS2_ATTR(name, mode, show, store) \
+static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
+/* Top-level per-filesystem attributes: name, file mode, show, store. */
+GFS2_ATTR(id,                  0444, id_show,       NULL);
+GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
+GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
+GFS2_ATTR(shrink,              0200, NULL,          shrink_store);
+GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
+GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
+GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
+GFS2_ATTR(quota_refresh_user,  0200, NULL,          quota_refresh_user_store);
+GFS2_ATTR(quota_refresh_group, 0200, NULL,          quota_refresh_group_store);
+/* NULL-terminated list used as gfs2_ktype.default_attrs. */
+static struct attribute *gfs2_attrs[] = {
+        &gfs2_attr_id.attr,
+        &gfs2_attr_fsname.attr,
+        &gfs2_attr_freeze.attr,
+        &gfs2_attr_shrink.attr,
+        &gfs2_attr_withdraw.attr,
+        &gfs2_attr_statfs_sync.attr,
+        &gfs2_attr_quota_sync.attr,
+        &gfs2_attr_quota_refresh_user.attr,
+        &gfs2_attr_quota_refresh_group.attr,
+        NULL,
+};
+/*
+ * sysfs_ops show hook: recover the gfs2_sbd and gfs2_attr from the generic
+ * kobject/attribute pointers and forward to the attribute's ->show.
+ * Attributes without a ->show produce an empty read (0).
+ */
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+        if (!a->show)
+                return 0;
+        return a->show(sdp, buf);
+}
+/*
+ * sysfs_ops store hook: recover the gfs2_sbd and gfs2_attr from the generic
+ * kobject/attribute pointers and forward to the attribute's ->store.
+ * Attributes without a ->store accept and discard the write (@len).
+ */
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buf, size_t len)
+{
+        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+        if (!a->store)
+                return len;
+        return a->store(sdp, buf, len);
+}
+/* Dispatchers that translate kobject/attribute into sdp/gfs2_attr calls. */
+static struct sysfs_ops gfs2_attr_ops = {
+        .show  = gfs2_attr_show,
+        .store = gfs2_attr_store,
+};
+/* kobject type of each filesystem's sd_kobj (see gfs2_sys_fs_add()). */
+static struct kobj_type gfs2_ktype = {
+        .default_attrs = gfs2_attrs,
+        .sysfs_ops     = &gfs2_attr_ops,
+};
+/* The "gfs2" kset, placed in fs_subsys; per-filesystem kobjects join it. */
+static struct kset gfs2_kset = {
+        .subsys = &fs_subsys,
+        .kobj   = {.name = "gfs2"},
+        .ktype  = &gfs2_ktype,
+};
+/*
+ * display struct lm_lockstruct fields
+ *
+ * NOTE(review): these attributes are presumably dispatched through
+ * gfs2_attr_show(), which casts to struct gfs2_attr; that only works while
+ * ->show sits at the same offset in both structs - confirm.
+ */
+struct lockstruct_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+/*
+ * Generates <name>_show(), printing sdp->sd_lockstruct.ls_<name> with @fmt,
+ * plus a read-only attribute lockstruct_attr_<name> that uses it.
+ */
+#define LOCKSTRUCT_ATTR(name, fmt)                                          \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+        return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
+}                                                                           \
+static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
+LOCKSTRUCT_ATTR(jid,      "%u\n");
+LOCKSTRUCT_ATTR(first,    "%u\n");
+LOCKSTRUCT_ATTR(lvb_size, "%u\n");
+LOCKSTRUCT_ATTR(flags,    "%d\n");
+/* NULL-terminated member list of the "lockstruct" attribute group. */
+static struct attribute *lockstruct_attrs[] = {
+        &lockstruct_attr_jid.attr,
+        &lockstruct_attr_first.attr,
+        &lockstruct_attr_lvb_size.attr,
+        &lockstruct_attr_flags.attr,
+        NULL,
+};
+/*
+ * display struct gfs2_args fields
+ */
+struct args_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+/*
+ * Generates <name>_show(), printing sdp->sd_args.ar_<name> with @fmt, plus
+ * a read-only attribute args_attr_<name> that uses it.
+ */
+#define ARGS_ATTR(name, fmt)                                                \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+        return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
+}                                                                           \
+static struct args_attr args_attr_##name = __ATTR_RO(name)
+ARGS_ATTR(lockproto,       "%s\n");
+ARGS_ATTR(locktable,       "%s\n");
+ARGS_ATTR(hostdata,        "%s\n");
+ARGS_ATTR(spectator,       "%d\n");
+ARGS_ATTR(ignore_local_fs, "%d\n");
+ARGS_ATTR(localcaching,    "%d\n");
+ARGS_ATTR(localflocks,     "%d\n");
+ARGS_ATTR(debug,           "%d\n");
+ARGS_ATTR(upgrade,         "%d\n");
+ARGS_ATTR(num_glockd,      "%u\n");
+ARGS_ATTR(posix_acl,       "%d\n");
+ARGS_ATTR(quota,           "%u\n");
+ARGS_ATTR(suiddir,         "%d\n");
+ARGS_ATTR(data,            "%d\n");
+/*
+ * "noatime" does not fit the ARGS_ATTR() mold: its value comes from the
+ * SDF_NOATIME bit in sd_flags rather than from a field of sd_args.  The
+ * bit is printed as 0 or 1.
+ */
+static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
+{
+        int noatime = test_bit(SDF_NOATIME, &sdp->sd_flags) ? 1 : 0;
+        return snprintf(buf, PAGE_SIZE, "%d\n", noatime);
+}
+/* Hand-written counterpart of ARGS_ATTR(): binds to noatime_show(). */
+static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
+/* NULL-terminated member list of the "args" attribute group. */
+static struct attribute *args_attrs[] = {
+        &args_attr_lockproto.attr,
+        &args_attr_locktable.attr,
+        &args_attr_hostdata.attr,
+        &args_attr_spectator.attr,
+        &args_attr_ignore_local_fs.attr,
+        &args_attr_localcaching.attr,
+        &args_attr_localflocks.attr,
+        &args_attr_debug.attr,
+        &args_attr_upgrade.attr,
+        &args_attr_num_glockd.attr,
+        &args_attr_posix_acl.attr,
+        &args_attr_quota.attr,
+        &args_attr_suiddir.attr,
+        &args_attr_data.attr,
+        &args_attr_noatime.attr,
+        NULL,
+};
+/*
+ * display counters from superblock
+ */
+struct counters_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+/*
+ * Generates <name>_show(), printing atomic_read(&sdp->sd_<name>) cast to
+ * unsigned int with @fmt, plus a read-only attribute counters_attr_<name>.
+ */
+#define COUNTERS_ATTR(name, fmt)                                            \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+        return snprintf(buf, PAGE_SIZE, fmt,                                \
+                        (unsigned int)atomic_read(&sdp->sd_##name));        \
+}                                                                           \
+static struct counters_attr counters_attr_##name = __ATTR_RO(name)
+COUNTERS_ATTR(glock_count,      "%u\n");
+COUNTERS_ATTR(glock_held_count, "%u\n");
+COUNTERS_ATTR(inode_count,      "%u\n");
+COUNTERS_ATTR(reclaimed,        "%u\n");
+/* NULL-terminated member list of the "counters" attribute group. */
+static struct attribute *counters_attrs[] = {
+        &counters_attr_glock_count.attr,
+        &counters_attr_glock_held_count.attr,
+        &counters_attr_inode_count.attr,
+        &counters_attr_reclaimed.attr,
+        NULL,
+};
+/*
+ * get and set struct gfs2_tune fields
+ */
+/* tune "quota_scale": print numerator and denominator, space separated. */
+static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
+{
+        struct gfs2_tune *gt = &sdp->sd_tune;
+        return snprintf(buf, PAGE_SIZE, "%u %u\n",
+                        gt->gt_quota_scale_num, gt->gt_quota_scale_den);
+}
+/*
+ * tune "quota_scale" write handler: expects "<num> <den>".
+ *
+ * Returns: @len on success, -EACCES without CAP_SYS_ADMIN, -EINVAL if two
+ *          unsigned numbers cannot be parsed or the denominator is zero
+ */
+static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
+                                 size_t len)
+{
+        struct gfs2_tune *gt = &sdp->sd_tune;
+        unsigned int num, den;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        if (sscanf(buf, "%u %u", &num, &den) != 2)
+                return -EINVAL;
+        if (!den)
+                return -EINVAL;
+        /* Publish both halves together under the tune lock. */
+        spin_lock(&gt->gt_spin);
+        gt->gt_quota_scale_num = num;
+        gt->gt_quota_scale_den = den;
+        spin_unlock(&gt->gt_spin);
+        return len;
+}
+/*
+ * Common write path for unsigned tunables: parse @buf and store the value
+ * in *@field under gt_spin.
+ *
+ * Returns: @len on success, -EACCES without CAP_SYS_ADMIN, -EINVAL when
+ *          @check_zero is set and the parsed value is 0
+ */
+static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
+                        int check_zero, const char *buf, size_t len)
+{
+        struct gfs2_tune *gt = &sdp->sd_tune;
+        unsigned int val;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        val = simple_strtoul(buf, NULL, 0);
+        if (!val && check_zero)
+                return -EINVAL;
+        spin_lock(&gt->gt_spin);
+        *field = val;
+        spin_unlock(&gt->gt_spin);
+        return len;
+}
+/* A read/write tunable exposed in the "tune" attribute group. */
+struct tune_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+        ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+/* Declare tune_attr_<name> (mode 0644) from explicit show/store handlers. */
+#define TUNE_ATTR_3(name, show, store)                                        \
+static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+/* As TUNE_ATTR_3, but generate <name>_show() printing gt_<name> as "%u". */
+#define TUNE_ATTR_2(name, store)                                              \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
+{                                                                             \
+        return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name);      \
+}                                                                             \
+TUNE_ATTR_3(name, name##_show, store)
+/*
+ * Fully generated tunable: <name>_store() goes through tune_set();
+ * check_zero != 0 makes a value of 0 invalid.
+ */
+#define TUNE_ATTR(name, check_zero)                                           \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{                                                                             \
+        return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len);  \
+}                                                                             \
+TUNE_ATTR_2(name, name##_store)
+/*
+ * Tunable that also wakes sdp->sd_<process> after the store.  Zero is always
+ * rejected.  Note the wake-up happens even when tune_set() returned an error.
+ */
+#define TUNE_ATTR_DAEMON(name, process)                                       \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{                                                                             \
+        ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len);      \
+        wake_up_process(sdp->sd_##process);                                   \
+        return r;                                                             \
+}                                                                             \
+TUNE_ATTR_2(name, name##_store)
+/* Tunables for which 0 is an accepted value. */
+TUNE_ATTR(ilimit, 0);
+TUNE_ATTR(ilimit_tries, 0);
+TUNE_ATTR(ilimit_min, 0);
+TUNE_ATTR(demote_secs, 0);
+TUNE_ATTR(incore_log_blocks, 0);
+TUNE_ATTR(log_flush_secs, 0);
+TUNE_ATTR(jindex_refresh_secs, 0);
+TUNE_ATTR(quota_warn_period, 0);
+TUNE_ATTR(quota_quantum, 0);
+TUNE_ATTR(atime_quantum, 0);
+TUNE_ATTR(max_readahead, 0);
+TUNE_ATTR(complain_secs, 0);
+TUNE_ATTR(reclaim_limit, 0);
+TUNE_ATTR(prefetch_secs, 0);
+TUNE_ATTR(statfs_slow, 0);
+TUNE_ATTR(new_files_jdata, 0);
+TUNE_ATTR(new_files_directio, 0);
+/* Tunables that reject 0 with -EINVAL. */
+TUNE_ATTR(quota_simul_sync, 1);
+TUNE_ATTR(quota_cache_secs, 1);
+TUNE_ATTR(max_atomic_write, 1);
+TUNE_ATTR(stall_secs, 1);
+TUNE_ATTR(entries_per_readdir, 1);
+TUNE_ATTR(greedy_default, 1);
+TUNE_ATTR(greedy_quantum, 1);
+TUNE_ATTR(greedy_max, 1);
+TUNE_ATTR(statfs_quantum, 1);
+/* Daemon intervals: the named sd_*_process is woken after each store. */
+TUNE_ATTR_DAEMON(scand_secs, scand_process);
+TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
+TUNE_ATTR_DAEMON(logd_secs, logd_process);
+TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
+/* Two-value tunable with hand-written handlers. */
+TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
+/* NULL-terminated member list of the "tune" attribute group. */
+static struct attribute *tune_attrs[] = {
+        &tune_attr_ilimit.attr,
+        &tune_attr_ilimit_tries.attr,
+        &tune_attr_ilimit_min.attr,
+        &tune_attr_demote_secs.attr,
+        &tune_attr_incore_log_blocks.attr,
+        &tune_attr_log_flush_secs.attr,
+        &tune_attr_jindex_refresh_secs.attr,
+        &tune_attr_quota_warn_period.attr,
+        &tune_attr_quota_quantum.attr,
+        &tune_attr_atime_quantum.attr,
+        &tune_attr_max_readahead.attr,
+        &tune_attr_complain_secs.attr,
+        &tune_attr_reclaim_limit.attr,
+        &tune_attr_prefetch_secs.attr,
+        &tune_attr_statfs_slow.attr,
+        &tune_attr_quota_simul_sync.attr,
+        &tune_attr_quota_cache_secs.attr,
+        &tune_attr_max_atomic_write.attr,
+        &tune_attr_stall_secs.attr,
+        &tune_attr_entries_per_readdir.attr,
+        &tune_attr_greedy_default.attr,
+        &tune_attr_greedy_quantum.attr,
+        &tune_attr_greedy_max.attr,
+        &tune_attr_statfs_quantum.attr,
+        &tune_attr_scand_secs.attr,
+        &tune_attr_recoverd_secs.attr,
+        &tune_attr_logd_secs.attr,
+        &tune_attr_quotad_secs.attr,
+        &tune_attr_quota_scale.attr,
+        &tune_attr_new_files_jdata.attr,
+        &tune_attr_new_files_directio.attr,
+        NULL,
+};
+/*
+ * The four named attribute groups that gfs2_sys_fs_add() creates on each
+ * filesystem's kobject and gfs2_sys_fs_del() removes again.
+ */
+static struct attribute_group lockstruct_group = {
+        .name = "lockstruct",
+        .attrs = lockstruct_attrs,
+};
+static struct attribute_group counters_group = {
+        .name = "counters",
+        .attrs = counters_attrs,
+};
+static struct attribute_group args_group = {
+        .name = "args",
+        .attrs = args_attrs,
+};
+static struct attribute_group tune_group = {
+        .name = "tune",
+        .attrs = tune_attrs,
+};
+/*
+ * Register the per-filesystem kobject, named after sd_table_name, in the
+ * gfs2 kset and create its "lockstruct", "counters", "args" and "tune"
+ * attribute groups.  On failure everything created so far is undone in
+ * reverse order and the error is logged.
+ *
+ * Returns: 0 on success or a negative errno
+ */
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
+{
+        int error;
+        sdp->sd_kobj.kset = &gfs2_kset;
+        sdp->sd_kobj.ktype = &gfs2_ktype;
+        error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
+        if (error)
+                goto fail;
+        error = kobject_register(&sdp->sd_kobj);
+        if (error)
+                goto fail;
+        error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
+        if (error)
+                goto fail_reg;
+        error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
+        if (error)
+                goto fail_lockstruct;
+        error = sysfs_create_group(&sdp->sd_kobj, &args_group);
+        if (error)
+                goto fail_counters;
+        error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
+        if (error)
+                goto fail_args;
+        return 0;
+fail_args:
+        sysfs_remove_group(&sdp->sd_kobj, &args_group);
+fail_counters:
+        sysfs_remove_group(&sdp->sd_kobj, &counters_group);
+fail_lockstruct:
+        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+fail_reg:
+        kobject_unregister(&sdp->sd_kobj);
+fail:
+        /* Terminate the line so the next printk starts on a fresh one. */
+        fs_err(sdp, "error %d adding sysfs files\n", error);
+        return error;
+}
+/*
+ * Undo gfs2_sys_fs_add(): remove the four attribute groups in the reverse
+ * of their creation order, then unregister the filesystem's kobject.
+ */
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
+{
+        struct kobject *kobj = &sdp->sd_kobj;
+        sysfs_remove_group(kobj, &tune_group);
+        sysfs_remove_group(kobj, &args_group);
+        sysfs_remove_group(kobj, &counters_group);
+        sysfs_remove_group(kobj, &lockstruct_group);
+        kobject_unregister(kobj);
+}
+/*
+ * Module-wide sysfs setup: initialise the margs lock and pointer, then
+ * register the "gfs2" kset.  Returns the result of kset_register().
+ */
+int gfs2_sys_init(void)
+{
+        spin_lock_init(&gfs2_sys_margs_lock);
+        gfs2_sys_margs = NULL;
+        return kset_register(&gfs2_kset);
+}
+/*
+ * Module-wide sysfs teardown: free the margs string, if any, and
+ * unregister the "gfs2" kset.
+ */
+void gfs2_sys_uninit(void)
+{
+        /* kfree(NULL) is a no-op, so no check is needed. */
+        kfree(gfs2_sys_margs);
+        kset_unregister(&gfs2_kset);
+}
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __SYS_DOT_H__
+#define __SYS_DOT_H__
+#include <linux/spinlock.h>
+struct gfs2_sbd;
+/* Allow args to be passed to GFS2 when using an initial ram disk */
+extern char *gfs2_sys_margs;
+/* NOTE(review): presumably protects gfs2_sys_margs - confirm in sys.c */
+extern spinlock_t gfs2_sys_margs_lock;
+/* Per-filesystem sysfs registration and removal */
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
+/* Module-wide setup and teardown (registers/unregisters the gfs2 kset) */
+int gfs2_sys_init(void);
+void gfs2_sys_uninit(void);
+#endif /* __SYS_DOT_H__ */
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/kallsyms.h>
+#include <linux/lm_interface.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+/**
+ * gfs2_trans_begin - Start a transaction for the current task
+ * @sdp: the filesystem
+ * @blocks: upper bound on buffers the transaction will add
+ * @revokes: upper bound on revokes the transaction will add
+ *
+ * The task must not already be inside a transaction and at least one of
+ * @blocks and @revokes must be non-zero; both conditions are BUG_ON()s.
+ * On success the new transaction is stored in current->journal_info.
+ *
+ * Returns: 0 on success, -ENOMEM if the transaction cannot be allocated,
+ *          -EROFS if the journal is not live, or the error returned by
+ *          gfs2_glock_nq() or gfs2_log_reserve()
+ */
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+                     unsigned int revokes)
+{
+        struct gfs2_trans *tr;
+        struct gfs2_holder *gh;
+        int ret;
+        BUG_ON(current->journal_info);
+        BUG_ON(blocks == 0 && revokes == 0);
+        tr = kzalloc(sizeof(*tr), GFP_NOFS);
+        if (!tr)
+                return -ENOMEM;
+        gh = &tr->tr_t_gh;
+        INIT_LIST_HEAD(&tr->tr_list_buf);
+        /* Remember the caller so gfs2_trans_end() can report it */
+        tr->tr_ip = (unsigned long)__builtin_return_address(0);
+        tr->tr_blocks = blocks;
+        tr->tr_revokes = revokes;
+        /* Log space: 1 always, 6 + blocks for buffers, plus revoke blocks */
+        tr->tr_reserved = 1;
+        if (blocks)
+                tr->tr_reserved += 6 + blocks;
+        if (revokes)
+                tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
+                                                   sizeof(u64));
+        gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, gh);
+        ret = gfs2_glock_nq(gh);
+        if (ret)
+                goto out_uninit;
+        if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+                gh->gh_flags |= GL_NOCACHE;
+                ret = -EROFS;
+                goto out_dq;
+        }
+        ret = gfs2_log_reserve(sdp, tr->tr_reserved);
+        if (ret)
+                goto out_dq;
+        current->journal_info = tr;
+        return 0;
+out_dq:
+        gfs2_glock_dq(gh);
+out_uninit:
+        gfs2_holder_uninit(gh);
+        kfree(tr);
+        return ret;
+}
+/**
+ * gfs2_trans_end - Finish the current task's transaction
+ * @sdp: the filesystem
+ *
+ * An untouched transaction just gives its log reservation back.  A touched
+ * one is checked against the limits passed to gfs2_trans_begin(), committed
+ * to the log and, on a synchronous mount, followed by a log flush.  In both
+ * cases the transaction glock is dropped and the transaction freed.
+ */
+void gfs2_trans_end(struct gfs2_sbd *sdp)
+{
+        struct gfs2_trans *tr = current->journal_info;
+        int touched;
+        BUG_ON(!tr);
+        current->journal_info = NULL;
+        /* Sampled now: tr is freed before the flush decision below */
+        touched = tr->tr_touched != 0;
+        if (touched) {
+                if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
+                        fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
+                               tr->tr_num_buf, tr->tr_blocks);
+                        print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+                }
+                if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
+                        fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
+                               tr->tr_num_revoke, tr->tr_revokes);
+                        print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+                }
+                gfs2_log_commit(sdp, tr);
+        } else {
+                gfs2_log_release(sdp, tr->tr_reserved);
+        }
+        gfs2_glock_dq(&tr->tr_t_gh);
+        gfs2_holder_uninit(&tr->tr_t_gh);
+        kfree(tr);
+        if (touched && (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS))
+                gfs2_log_flush(sdp, NULL);
+}
+/**
+ * gfs2_trans_add_gl - Add a glock's log element via lops_add()
+ * @gl: the glock whose gl_le is added
+ */
+void gfs2_trans_add_gl(struct gfs2_glock *gl)
+{
+        lops_add(gl->gl_sbd, &gl->gl_le);
+}
+/**
+ * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to add
+ * @meta: True in the case of adding metadata
+ *
+ * A buffer that has no bufdata yet gets one attached first; one that already
+ * has bufdata must belong to @gl.
+ */
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_bufdata *bd = bh->b_private;
+        if (!bd) {
+                gfs2_attach_bufdata(gl, bh, meta);
+                bd = bh->b_private;
+        } else {
+                gfs2_assert(sdp, bd->bd_gl == gl);
+        }
+        lops_add(sdp, &bd->bd_le);
+}
+/**
+ * gfs2_trans_add_revoke - Add a revoke for a block
+ * @sdp: the filesystem
+ * @blkno: the block number to revoke
+ *
+ * The revoke is allocated with __GFP_NOFAIL, so the result is not checked.
+ */
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+        struct gfs2_revoke *rv;
+        rv = kmalloc(sizeof(*rv), GFP_NOFS | __GFP_NOFAIL);
+        lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
+        rv->rv_blkno = blkno;
+        lops_add(sdp, &rv->rv_le);
+}
+/**
+ * gfs2_trans_add_unrevoke - Cancel a pending revoke for a block, if any
+ * @sdp: the filesystem
+ * @blkno: the block number whose revoke is to be dropped
+ *
+ * Searches sd_log_le_revoke under the log lock.  If a revoke for @blkno is
+ * found it is unlinked and freed, and the current transaction's
+ * tr_num_revoke_rm count is incremented.
+ */
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+        struct gfs2_revoke *rv;
+        struct gfs2_revoke *hit = NULL;
+        struct gfs2_trans *tr;
+        gfs2_log_lock(sdp);
+        list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
+                if (rv->rv_blkno != blkno)
+                        continue;
+                list_del(&rv->rv_le.le_list);
+                gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
+                sdp->sd_log_num_revoke--;
+                hit = rv;
+                break;
+        }
+        gfs2_log_unlock(sdp);
+        if (!hit)
+                return;
+        tr = current->journal_info;
+        kfree(hit);
+        tr->tr_num_revoke_rm++;
+}
+/**
+ * gfs2_trans_add_rg - Add a resource group's log element via lops_add()
+ * @rgd: the resource group whose rd_le is added
+ */
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
+{
+        lops_add(rgd->rd_sbd, &rgd->rd_le);
+}
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __TRANS_DOT_H__
+#define __TRANS_DOT_H__
+#include <linux/buffer_head.h>
+struct gfs2_sbd;
+struct gfs2_rgrpd;
+struct gfs2_glock;
+/*
+ * NOTE(review): these look like per-object block counts that callers sum
+ * into the "blocks" argument of gfs2_trans_begin() - confirm against callers.
+ */
+#define RES_DINODE      1
+#define RES_INDIRECT    1
+#define RES_JDATA       1
+#define RES_DATA        1
+#define RES_LEAF        1
+#define RES_RG_BIT      2
+#define RES_EATTR       1
+#define RES_STATFS      1
+#define RES_QUOTA       2
+/* Begin/end a transaction for the current task */
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+                     unsigned int revokes);
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+/* Add log elements, buffers and revokes */
+void gfs2_trans_add_gl(struct gfs2_glock *gl);
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "util.h"
+/* Cache pointers defined here; NOTE(review): created elsewhere - confirm where */
+kmem_cache_t *gfs2_glock_cachep __read_mostly;
+kmem_cache_t *gfs2_inode_cachep __read_mostly;
+kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
+/**
+ * gfs2_assert_i - Print the "fatal assertion failed" message
+ * @sdp: the filesystem whose sd_fsname is reported
+ *
+ * Only prints at KERN_EMERG level; it does not stop execution itself.
+ */
+void gfs2_assert_i(struct gfs2_sbd *sdp)
+{
+        printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
+               sdp->sd_fsname);
+}
+/**
+ * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
+ * @sdp: the filesystem
+ * @assertion: text of the failed assertion
+ * @function: name of the function containing the assertion
+ * @file: source file containing the assertion
+ * @line: source line of the assertion
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+                           const char *function, char *file, unsigned int line)
+{
+        int withdrew = gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname, assertion,
+                sdp->sd_fsname, function, file, line);
+        dump_stack();
+        if (withdrew)
+                return -1;
+        return -2;
+}
+/**
+ * gfs2_assert_warn_i - Print a message to the console if @assertion is false
+ * @sdp: the filesystem
+ * @assertion: text of the failed assertion
+ * @function: name of the function containing the assertion
+ * @file: source file containing the assertion
+ * @line: source line of the assertion
+ *
+ * Warnings are rate limited: nothing is printed until gt_complain_secs
+ * seconds have passed since the previous warning.  With the debug mount
+ * argument set the warning is followed by BUG() instead of a stack dump.
+ *
+ * Returns: -1 if we printed something
+ *          -2 if we didn't
+ */
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+                       const char *function, char *file, unsigned int line)
+{
+        const unsigned int quiet_secs = gfs2_tune_get(sdp, gt_complain_secs);
+        if (time_before(jiffies, sdp->sd_last_warning + quiet_secs * HZ))
+                return -2;
+        printk(KERN_WARNING
+               "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+               "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+               sdp->sd_fsname, assertion,
+               sdp->sd_fsname, function, file, line);
+        if (!sdp->sd_args.ar_debug) {
+                dump_stack();
+        } else {
+                BUG();
+        }
+        sdp->sd_last_warning = jiffies;
+        return -1;
+}
+/**
+ * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ * @sdp: the filesystem
+ * @cluster_wide: not used by this function
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
+                   char *file, unsigned int line)
+{
+        /* The result of gfs2_lm_withdraw() is handed back unchanged */
+        return gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, function, file, line);
+}
+/**
+ * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ * @ip: the inode found to be inconsistent
+ * @cluster_wide: not used by this function
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * The message includes the inode's formal number and disk address.
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+                         const char *function, char *file, unsigned int line)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned long long formal_ino = ip->i_num.no_formal_ino;
+        unsigned long long addr = ip->i_num.no_addr;
+        return gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+                "GFS2: fsid=%s:   inode = %llu %llu\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, formal_ino, addr,
+                sdp->sd_fsname, function, file, line);
+}
+/**
+ * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
+ * @rgd: the resource group found to be inconsistent
+ * @cluster_wide: not used by this function
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * The message includes the resource group's ri_addr.
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+                         const char *function, char *file, unsigned int line)
+{
+        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        unsigned long long rg_addr = rgd->rd_ri.ri_addr;
+        return gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+                "GFS2: fsid=%s:   RG = %llu\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, rg_addr,
+                sdp->sd_fsname, function, file, line);
+}
+/**
+ * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
+ * @sdp: the filesystem
+ * @bh: the buffer holding the bad metadata block
+ * @type: short description of what was wrong, printed after the block number
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                       const char *type, const char *function, char *file,
+                       unsigned int line)
+{
+        unsigned long long blkno = bh->b_blocknr;
+        int withdrew = gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: invalid metadata block\n"
+                "GFS2: fsid=%s:   bh = %llu (%s)\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, blkno, type,
+                sdp->sd_fsname, function, file, line);
+        if (withdrew)
+                return -1;
+        return -2;
+}
+/**
+ * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * @sdp: the filesystem
+ * @bh: the buffer holding the bad metadata block
+ * @type: the metadata type that was expected
+ * @t: the metadata type that was found
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                           u16 type, u16 t, const char *function,
+                           char *file, unsigned int line)
+{
+        unsigned long long blkno = bh->b_blocknr;
+        int withdrew = gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: invalid metadata block\n"
+                "GFS2: fsid=%s:   bh = %llu (type: exp=%u, found=%u)\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, blkno, type, t,
+                sdp->sd_fsname, function, file, line);
+        if (withdrew)
+                return -1;
+        return -2;
+}
+/**
+ * gfs2_io_error_i - Flag an I/O error and withdraw
+ * @sdp: the filesystem
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+                    unsigned int line)
+{
+        /* The result of gfs2_lm_withdraw() is handed back unchanged */
+        return gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: I/O error\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, function, file, line);
+}
+/**
+ * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
+ * @sdp: the filesystem
+ * @bh: the buffer the I/O error occurred on; its block number is reported
+ * @function: name of the reporting function
+ * @file: source file of the report
+ * @line: source line of the report
+ *
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                       const char *function, char *file, unsigned int line)
+{
+        unsigned long long blkno = bh->b_blocknr;
+        return gfs2_lm_withdraw(sdp,
+                "GFS2: fsid=%s: fatal: I/O error\n"
+                "GFS2: fsid=%s:   block = %llu\n"
+                "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+                sdp->sd_fsname,
+                sdp->sd_fsname, blkno,
+                sdp->sd_fsname, function, file, line);
+}
+/**
+ * gfs2_icbit_munge - Flip one bit in a bitmap stored as an array of pages
+ * @sdp: the filesystem (used for the assertion)
+ * @bitmap: array of page-sized chunks, each holding 8 * PAGE_SIZE bits
+ * @bit: index of the bit across the whole bitmap
+ * @new_value: non-zero to set the bit, zero to clear it
+ *
+ * Asserts (with withdraw) that the bit actually changes state.
+ */
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+                      unsigned int bit, int new_value)
+{
+        unsigned int chunk = bit / (8 * PAGE_SIZE);
+        unsigned int in_chunk = bit % (8 * PAGE_SIZE);
+        unsigned int byte = in_chunk / 8;
+        unsigned int shift = in_chunk % 8;
+        int old_value = (bitmap[chunk][byte] & (1 << shift));
+        /* Local names are kept: the expression text is part of the message */
+        gfs2_assert_withdraw(sdp, !old_value != !new_value);
+        if (!new_value)
+                bitmap[chunk][byte] &= ~(1 << shift);
+        else
+                bitmap[chunk][byte] |= 1 << shift;
+}
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+#include "incore.h"
+/*
+ * printk() wrappers that prefix every message with "GFS2: fsid=<sd_fsname>: ".
+ * @fs is a pointer to a structure with an sd_fsname member; @fmt must be a
+ * string literal because it is pasted onto the prefix.
+ */
+#define fs_printk(level, fs, fmt, arg...) \
+        printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
+#define fs_info(fs, fmt, arg...) \
+        fs_printk(KERN_INFO , fs , fmt , ## arg)
+#define fs_warn(fs, fmt, arg...) \
+        fs_printk(KERN_WARNING , fs , fmt , ## arg)
+#define fs_err(fs, fmt, arg...) \
+        fs_printk(KERN_ERR, fs , fmt , ## arg)
+void gfs2_assert_i(struct gfs2_sbd *sdp);
+/*
+ * gfs2_assert - hard assertion: if @assertion is false, report it through
+ * gfs2_assert_i() and then BUG().
+ */
+#define gfs2_assert(sdp, assertion) \
+do { \
+        if (unlikely(!(assertion))) { \
+                gfs2_assert_i(sdp); \
+                BUG(); \
+        } \
+} while (0)
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+                           const char *function, char *file, unsigned int line);
+/*
+ * gfs2_assert_withdraw - evaluates to 0 when @assertion holds, otherwise to
+ * the result of gfs2_assert_withdraw_i().  The assertion's source text is
+ * stringified into the reported message.
+ */
+#define gfs2_assert_withdraw(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
+                                        __FUNCTION__, __FILE__, __LINE__))
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+                       const char *function, char *file, unsigned int line);
+/*
+ * gfs2_assert_warn - evaluates to 0 when @assertion holds, otherwise to the
+ * result of gfs2_assert_warn_i().  The assertion's source text is
+ * stringified into the reported message.
+ */
+#define gfs2_assert_warn(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
+                                        __FUNCTION__, __FILE__, __LINE__))
+/*
+ * Consistency-error reporters.  Each macro records the call site and passes
+ * 0 for the cluster_wide argument.
+ */
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
+                   const char *function, char *file, unsigned int line);
+#define gfs2_consist(sdp) \
+gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+                         const char *function, char *file, unsigned int line);
+#define gfs2_consist_inode(ip) \
+gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+                         const char *function, char *file, unsigned int line);
+#define gfs2_consist_rgrpd(rgd) \
+gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                       const char *type, const char *function,
+                       char *file, unsigned int line);
+/**
+ * gfs2_meta_check_i - Verify the magic number of a metadata buffer
+ * @sdp: the filesystem
+ * @bh: buffer whose data starts with a struct gfs2_meta_header
+ * @function: name of the calling function
+ * @file: source file of the caller
+ * @line: source line of the caller
+ *
+ * Returns: 0 if the magic number is GFS2_MAGIC, otherwise the result of
+ *          gfs2_meta_check_ii()
+ */
+static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
+                                    struct buffer_head *bh,
+                                    const char *function,
+                                    char *file, unsigned int line)
+{
+        struct gfs2_meta_header *hdr = (struct gfs2_meta_header *)bh->b_data;
+        u32 found = be32_to_cpu(hdr->mh_magic);
+        if (likely(found == GFS2_MAGIC))
+                return 0;
+        return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+                                  file, line);
+}
+#define gfs2_meta_check(sdp, bh) \
+gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                           u16 type, u16 t,
+                           const char *function,
+                           char *file, unsigned int line);
+/**
+ * gfs2_metatype_check_i - Verify magic number and type of a metadata buffer
+ * @sdp: the filesystem
+ * @bh: buffer whose data starts with a struct gfs2_meta_header
+ * @type: the metadata type the caller expects
+ * @function: name of the calling function
+ * @file: source file of the caller
+ * @line: source line of the caller
+ *
+ * Returns: 0 if both magic number and type match, otherwise the result of
+ *          gfs2_meta_check_ii() or gfs2_metatype_check_ii()
+ */
+static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
+                                        struct buffer_head *bh,
+                                        u16 type,
+                                        const char *function,
+                                        char *file, unsigned int line)
+{
+        struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+        u32 magic = be32_to_cpu(mh->mh_magic);
+        /*
+         * Compare the full 32-bit on-disk value: narrowing it to 16 bits
+         * first would accept a corrupted type whose low half happens to
+         * match @type.
+         */
+        u32 t = be32_to_cpu(mh->mh_type);
+        if (unlikely(magic != GFS2_MAGIC))
+                return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+                                          file, line);
+        /* The report only takes 16 bits, so an oversized value is truncated */
+        if (unlikely(t != type))
+                return gfs2_metatype_check_ii(sdp, bh, type, t, function,
+                                              file, line);
+        return 0;
+}
+#define gfs2_metatype_check(sdp, bh, type) \
+gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
+/**
+ * gfs2_metatype_set - Store type and format in a buffer's metadata header
+ * @bh: buffer whose data starts with a struct gfs2_meta_header
+ * @type: value for mh_type
+ * @format: value for mh_format
+ *
+ * Both values are written as big-endian 32-bit fields.
+ */
+static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
+                                     u16 format)
+{
+        struct gfs2_meta_header *hdr = (struct gfs2_meta_header *)bh->b_data;
+        hdr->mh_format = cpu_to_be32(format);
+        hdr->mh_type = cpu_to_be32(type);
+}
+/*
+ * I/O error reporters.  Each macro records the call site and expands to a
+ * plain expression (no trailing semicolon), so it can be the body of an
+ * if/else branch and its return value can be tested.
+ */
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+                    char *file, unsigned int line);
+#define gfs2_io_error(sdp) \
+gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+                       const char *function, char *file, unsigned int line);
+#define gfs2_io_error_bh(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
+/* Cache pointers shared across the filesystem; NOTE(review): defined in util.c per the matching definitions there - confirm */
+extern kmem_cache_t *gfs2_glock_cachep;
+extern kmem_cache_t *gfs2_inode_cachep;
+extern kmem_cache_t *gfs2_bufdata_cachep;
+/**
+ * gfs2_tune_get_i - Read one tunable under the tune spinlock
+ * @gt: the tune structure whose gt_spin lock is taken
+ * @p: address of the value to read
+ *
+ * Returns: the value of *@p sampled while holding gt_spin
+ */
+static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
+                                           unsigned int *p)
+{
+        unsigned int snapshot;
+        spin_lock(&gt->gt_spin);
+        snapshot = *p;
+        spin_unlock(&gt->gt_spin);
+        return snapshot;
+}
+/* Read sd_tune.<field> of @sdp through gfs2_tune_get_i() */
+#define gfs2_tune_get(sdp, field) \
+gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
+/* Set (new_value != 0) or clear one bit of a bitmap held as an array of chunks */
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+                      unsigned int bit, int new_value);
+#endif /* __UTIL_DOT_H__ */
author	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-04 12:06:16 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-04 12:06:16 -0400
commit	4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1 (patch)
tree	a2054556900af8c16fd9f5419f012dcf1ee2995a /fs
parent	d002ec481c24f325ed6cfcb7810d317c015dd1b5 (diff)
parent	7ecdb70a0ea436c06540140242bfac6ac3babfc0 (diff)