80 files changed, 2508 insertions, 1714 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 8cd2417a14db..5e8e9d9ccb33 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -426,7 +426,6 @@ config OCFS2_FS
        select CONFIGFS_FS
        select JBD
        select CRC32
-        select INET
        help
          OCFS2 is a general purpose extent based shared disk cluster file
          system with many similarities to ext3. It supports 64 bit inode
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index d04d2f7448d9..85e3850bf2c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,8 @@
 Version 1.47
 ------------
 Fix oops in list_del during mount caused by unaligned string.
+Seek to SEEK_END forces check for update of file size for non-cached
+files.
 Version 1.46
 ------------
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 10c90294cd18..93ef09971d2f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -511,7 +511,15 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 {
        /* origin == SEEK_END => we must revalidate the cached file length */
        if (origin == SEEK_END) {
-                int retval = cifs_revalidate(file->f_path.dentry);
+                int retval;
+                /* some applications poll for the file length in this strange
+                   way so we must seek to end on non-oplocked files by
+                   setting the revalidate time to zero */
+                if(file->f_path.dentry->d_inode)                
+                        CIFS_I(file->f_path.dentry->d_inode)->time = 0;
+                retval = cifs_revalidate(file->f_path.dentry);
                if (retval < 0)
                        return (loff_t)retval;
        }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8a49b2e77d37..e9dcf5ee29a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1146,7 +1146,7 @@ static int cifs_writepages(struct address_space *mapping,
        pgoff_t end;
        pgoff_t index;
        int range_whole = 0;
-        struct kvec iov[32];
+        struct kvec * iov;
        int len;
        int n_iov = 0;
        pgoff_t next;
@@ -1171,15 +1171,21 @@ static int cifs_writepages(struct address_space *mapping,
        if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
                if(cifs_sb->tcon->ses->server->secMode &
                          (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-                        if(!experimEnabled)
+                        if(!experimEnabled) 
                                return generic_writepages(mapping, wbc);
+        iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
+        if(iov == NULL)
+                return generic_writepages(mapping, wbc);
        /*
         * BB: Is this meaningful for a non-block-device file system?
         * If it is, we should test it again after we do I/O
         */
        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                wbc->encountered_congestion = 1;
+                kfree(iov);
                return 0;
        }
@@ -1345,7 +1351,7 @@ retry:
                mapping->writeback_index = index;
        FreeXid(xid);
+        kfree(iov);
        return rc;
 }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 99dfb5337e31..782940be550f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -156,9 +156,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                tmp_inode->i_atime = cnvrtDosUnixTm(
                                le16_to_cpu(pfindData->LastAccessDate),
                                le16_to_cpu(pfindData->LastAccessTime));
-                tmp_inode->i_ctime = cnvrtDosUnixTm(
+                tmp_inode->i_ctime = cnvrtDosUnixTm(
-                                le16_to_cpu(pfindData->LastWriteDate),
+                                le16_to_cpu(pfindData->LastWriteDate),
-                                le16_to_cpu(pfindData->LastWriteTime));
+                                le16_to_cpu(pfindData->LastWriteTime));
                AdjustForTZ(cifs_sb->tcon, tmp_inode);
                attr = le16_to_cpu(pfindData->Attributes);
                allocation_size = le32_to_cpu(pfindData->AllocationSize);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 7a1b2b961ec8..1b1daf63f062 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -196,7 +196,7 @@ dohash(char *out, char *in, char *key, int forw)
        char c[28];
        char d[28];
        char *cd;
-        char ki[16][48];
+        char (*ki)[48];
        char *pd1;
        char l[32], r[32];
        char *rl;
@@ -206,6 +206,12 @@ dohash(char *out, char *in, char *key, int forw)
        if(pk1 == NULL)
                return;
+        ki = kmalloc(16*48, GFP_KERNEL);
+        if(ki == NULL) {
+                kfree(pk1);
+                return;
+        }
        cd = pk1 + 56;
        pd1= cd  + 56;
        rl = pd1 + 64;
@@ -243,6 +249,7 @@ dohash(char *out, char *in, char *key, int forw)
                er = kmalloc(48+48+32+32+32, GFP_KERNEL);
                if(er == NULL) {
                        kfree(pk1);
+                        kfree(ki);
                        return;
                }
                erk = er+48;
@@ -290,6 +297,7 @@ dohash(char *out, char *in, char *key, int forw)
        permute(out, rl, perm6, 64);
        kfree(pk1);
+        kfree(ki);
 }
 static void
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2a7cb086e80c..d98be5e01328 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -162,14 +162,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
        int error;
        if (!buffer->page)
-                buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+                buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
        if (!buffer->page)
                return -ENOMEM;
-        if (count > PAGE_SIZE)
+        if (count >= PAGE_SIZE)
-                count = PAGE_SIZE;
+                count = PAGE_SIZE - 1;
        error = copy_from_user(buffer->page,buf,count);
        buffer->needs_read_fill = 1;
+        /* if buf is assumed to contain a string, terminate it by \0,
+         * so e.g. sscanf() can scan the string easily */
+        buffer->page[count] = 0;
        return error ? -EFAULT : count;
 }
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index b5654a284fef..6fa7b0d5c043 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,21 +3,21 @@ menu "Distributed Lock Manager"
 config DLM
        tristate "Distributed Lock Manager (DLM)"
-        depends on IPV6 || IPV6=n
+        depends on SYSFS && (IPV6 || IPV6=n)
        select CONFIGFS_FS
        select IP_SCTP if DLM_SCTP
        help
-        A general purpose distributed lock manager for kernel or userspace
+          A general purpose distributed lock manager for kernel or userspace
-        applications.
+          applications.
 choice
        prompt "Select DLM communications protocol"
        depends on DLM
        default DLM_TCP
        help
-        The DLM Can use TCP or SCTP for it's network communications.
+          The DLM can use TCP or SCTP for its network communications.
-        SCTP supports multi-homed operations whereas TCP doesn't.
+          SCTP supports multi-homed operations whereas TCP doesn't.
-        However, SCTP seems to have stability problems at the moment.
+          However, SCTP seems to have stability problems at the moment.
 config DLM_TCP
        bool "TCP/IP"
@@ -31,8 +31,8 @@ config DLM_DEBUG
        bool "DLM debugging"
        depends on DLM
        help
-        Under the debugfs mount point, the name of each lockspace will
+          Under the debugfs mount point, the name of each lockspace will
-        appear as a file in the "dlm" directory.  The output is the
+          appear as a file in the "dlm" directory.  The output is the
-        list of resource and locks the local node knows about.
+          list of resources and locks the local node knows about.
 endmenu
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 88553054bbfa..8665c88e5af2 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -54,6 +54,11 @@ static struct config_item *make_node(struct config_group *, const char *);
 static void drop_node(struct config_group *, struct config_item *);
 static void release_node(struct config_item *);
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+                            char *buf);
+static ssize_t store_cluster(struct config_item *i,
+                             struct configfs_attribute *a,
+                             const char *buf, size_t len);
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
                         char *buf);
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
@@ -73,6 +78,101 @@ static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
 static ssize_t node_weight_read(struct node *nd, char *buf);
 static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+struct cluster {
+        struct config_group group;
+        unsigned int cl_tcp_port;
+        unsigned int cl_buffer_size;
+        unsigned int cl_rsbtbl_size;
+        unsigned int cl_lkbtbl_size;
+        unsigned int cl_dirtbl_size;
+        unsigned int cl_recover_timer;
+        unsigned int cl_toss_secs;
+        unsigned int cl_scan_secs;
+        unsigned int cl_log_debug;
+};
+enum {
+        CLUSTER_ATTR_TCP_PORT = 0,
+        CLUSTER_ATTR_BUFFER_SIZE,
+        CLUSTER_ATTR_RSBTBL_SIZE,
+        CLUSTER_ATTR_LKBTBL_SIZE,
+        CLUSTER_ATTR_DIRTBL_SIZE,
+        CLUSTER_ATTR_RECOVER_TIMER,
+        CLUSTER_ATTR_TOSS_SECS,
+        CLUSTER_ATTR_SCAN_SECS,
+        CLUSTER_ATTR_LOG_DEBUG,
+};
+struct cluster_attribute {
+        struct configfs_attribute attr;
+        ssize_t (*show)(struct cluster *, char *);
+        ssize_t (*store)(struct cluster *, const char *, size_t);
+};
+/*
+ * Common store helper for the cluster attributes: parse a number from buf
+ * and write it to both the per-cluster copy (cl_field) and the matching
+ * member of the global dlm_config (info_field).
+ *
+ * info_field is an "int *" because the dlm_config_info members are declared
+ * as int; cl_field points at the unsigned int member of struct cluster.
+ *
+ * Returns len on success, -EACCES if the caller lacks CAP_SYS_ADMIN, or
+ * -EINVAL when check_zero is set and the parsed value is 0.
+ */
+static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+                           int *info_field, int check_zero,
+                           const char *buf, size_t len)
+{
+        unsigned int x;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EACCES;
+        x = simple_strtoul(buf, NULL, 0);
+        /* attributes created with check_zero != 0 must not be set to 0 */
+        if (check_zero && !x)
+                return -EINVAL;
+        *cl_field = x;
+        *info_field = x;
+        return len;
+}
+#define __CONFIGFS_ATTR(_name,_mode,_read,_write) {                           \
+        .attr   = { .ca_name = __stringify(_name),                            \
+                    .ca_mode = _mode,                                         \
+                    .ca_owner = THIS_MODULE },                                \
+        .show   = _read,                                                      \
+        .store  = _write,                                                     \
+}
+#define CLUSTER_ATTR(name, check_zero)                                        \
+static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
+{                                                                             \
+        return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
+                           check_zero, buf, len);                             \
+}                                                                             \
+static ssize_t name##_read(struct cluster *cl, char *buf)                     \
+{                                                                             \
+        return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
+}                                                                             \
+static struct cluster_attribute cluster_attr_##name =                         \
+__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+CLUSTER_ATTR(tcp_port, 1);
+CLUSTER_ATTR(buffer_size, 1);
+CLUSTER_ATTR(rsbtbl_size, 1);
+CLUSTER_ATTR(lkbtbl_size, 1);
+CLUSTER_ATTR(dirtbl_size, 1);
+CLUSTER_ATTR(recover_timer, 1);
+CLUSTER_ATTR(toss_secs, 1);
+CLUSTER_ATTR(scan_secs, 1);
+CLUSTER_ATTR(log_debug, 0);
+static struct configfs_attribute *cluster_attrs[] = {
+        [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
+        [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
+        [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
+        [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr,
+        [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr,
+        [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
+        [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
+        [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
+        [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+        NULL,
+};
 enum {
        COMM_ATTR_NODEID = 0,
        COMM_ATTR_LOCAL,
@@ -152,10 +252,6 @@ struct clusters {
        struct configfs_subsystem subsys;
 };
-struct cluster {
-        struct config_group group;
-};
 struct spaces {
        struct config_group ss_group;
 };
@@ -197,6 +293,8 @@ static struct configfs_group_operations clusters_ops = {
 static struct configfs_item_operations cluster_ops = {
        .release = release_cluster,
+        .show_attribute = show_cluster,
+        .store_attribute = store_cluster,
 };
 static struct configfs_group_operations spaces_ops = {
@@ -237,6 +335,7 @@ static struct config_item_type clusters_type = {
 static struct config_item_type cluster_type = {
        .ct_item_ops = &cluster_ops,
+        .ct_attrs = cluster_attrs,
        .ct_owner = THIS_MODULE,
 };
@@ -317,6 +416,16 @@ static struct config_group *make_cluster(struct config_group *g,
        cl->group.default_groups[1] = &cms->cs_group;
        cl->group.default_groups[2] = NULL;
+        cl->cl_tcp_port = dlm_config.ci_tcp_port;
+        cl->cl_buffer_size = dlm_config.ci_buffer_size;
+        cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
+        cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size;
+        cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size;
+        cl->cl_recover_timer = dlm_config.ci_recover_timer;
+        cl->cl_toss_secs = dlm_config.ci_toss_secs;
+        cl->cl_scan_secs = dlm_config.ci_scan_secs;
+        cl->cl_log_debug = dlm_config.ci_log_debug;
        space_list = &sps->ss_group;
        comm_list = &cms->cs_group;
        return &cl->group;
@@ -509,6 +618,25 @@ void dlm_config_exit(void)
 * Functions for user space to read/write attributes
 */
+/*
+ * configfs show hook for cluster items: hand the read over to the
+ * attribute's own show method, or return 0 when it has none.
+ */
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+                            char *buf)
+{
+        struct cluster *cluster;
+        struct cluster_attribute *cluster_attr;
+        cluster = to_cluster(i);
+        cluster_attr = container_of(a, struct cluster_attribute, attr);
+        if (!cluster_attr->show)
+                return 0;
+        return cluster_attr->show(cluster, buf);
+}
+/*
+ * configfs store hook for cluster items: hand the write over to the
+ * attribute's own store method, or fail with -EINVAL when it has none.
+ */
+static ssize_t store_cluster(struct config_item *i,
+                             struct configfs_attribute *a,
+                             const char *buf, size_t len)
+{
+        struct cluster *cluster;
+        struct cluster_attribute *cluster_attr;
+        cluster = to_cluster(i);
+        cluster_attr = container_of(a, struct cluster_attribute, attr);
+        if (!cluster_attr->store)
+                return -EINVAL;
+        return cluster_attr->store(cluster, buf, len);
+}
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
                         char *buf)
 {
@@ -775,15 +903,17 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_RECOVER_TIMER      5
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
+#define DEFAULT_LOG_DEBUG          0
 struct dlm_config_info dlm_config = {
-        .tcp_port = DEFAULT_TCP_PORT,
+        .ci_tcp_port = DEFAULT_TCP_PORT,
-        .buffer_size = DEFAULT_BUFFER_SIZE,
+        .ci_buffer_size = DEFAULT_BUFFER_SIZE,
-        .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+        .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
-        .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+        .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE,
-        .dirtbl_size = DEFAULT_DIRTBL_SIZE,
+        .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE,
-        .recover_timer = DEFAULT_RECOVER_TIMER,
+        .ci_recover_timer = DEFAULT_RECOVER_TIMER,
-        .toss_secs = DEFAULT_TOSS_SECS,
+        .ci_toss_secs = DEFAULT_TOSS_SECS,
-        .scan_secs = DEFAULT_SCAN_SECS
+        .ci_scan_secs = DEFAULT_SCAN_SECS,
+        .ci_log_debug = DEFAULT_LOG_DEBUG
 };
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 9da7839958a9..1e978611a96e 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,14 +17,15 @@
 #define DLM_MAX_ADDR_COUNT 3
 struct dlm_config_info {
-        int tcp_port;
+        int ci_tcp_port;
-        int buffer_size;
+        int ci_buffer_size;
-        int rsbtbl_size;
+        int ci_rsbtbl_size;
-        int lkbtbl_size;
+        int ci_lkbtbl_size;
-        int dirtbl_size;
+        int ci_dirtbl_size;
-        int recover_timer;
+        int ci_recover_timer;
-        int toss_secs;
+        int ci_toss_secs;
-        int scan_secs;
+        int ci_scan_secs;
+        int ci_log_debug;
 };
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 1ee8195e6fc0..61d93201e1b2 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -41,6 +41,7 @@
 #include <asm/uaccess.h>
 #include <linux/dlm.h>
+#include "config.h"
 #define DLM_LOCKSPACE_LEN       64
@@ -69,12 +70,12 @@ struct dlm_mhandle;
 #define log_error(ls, fmt, args...) \
        printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
-#define DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) \
-#ifdef DLM_LOG_DEBUG
+do { \
-#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+        if (dlm_config.ci_log_debug) \
-#else
+                printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
-#define log_debug(ls, fmt, args...)
+                       (ls)->ls_name , ##args); \
-#endif
+} while (0)
 #define DLM_ASSERT(x, do) \
 { \
@@ -309,8 +310,8 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
 /* dlm_header is first element of all structs sent between nodes */
-#define DLM_HEADER_MAJOR        0x00020000
+#define DLM_HEADER_MAJOR        0x00030000
-#define DLM_HEADER_MINOR        0x00000001
+#define DLM_HEADER_MINOR        0x00000000
 #define DLM_MSG                 1
 #define DLM_RCOM                2
@@ -386,6 +387,8 @@ struct dlm_rcom {
        uint32_t                rc_type;        /* DLM_RCOM_ */
        int                     rc_result;      /* multi-purpose */
        uint64_t                rc_id;          /* match reply with request */
+        uint64_t                rc_seq;         /* sender's ls_recover_seq */
+        uint64_t                rc_seq_reply;   /* remote ls_recover_seq */
        char                    rc_buf[0];
 };
@@ -523,6 +526,7 @@ struct dlm_user_proc {
        spinlock_t              asts_spin;
        struct list_head        locks;
        spinlock_t              locks_spin;
+        struct list_head        unlocking;
        wait_queue_head_t       wait;
 };
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 30878defaeb6..e725005fafd0 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -754,6 +754,11 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
        mutex_unlock(&ls->ls_waiters_mutex);
 }
+/* We clear the RESEND flag because we might be taking an lkb off the waiters
+   list as part of process_requestqueue (e.g. a lookup that has an optimized
+   request reply on the requestqueue) between dlm_recover_waiters_pre() which
+   set RESEND and dlm_recover_waiters_post() */
 static int _remove_from_waiters(struct dlm_lkb *lkb)
 {
        int error = 0;
@@ -764,6 +769,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb)
                goto out;
        }
        lkb->lkb_wait_type = 0;
+        lkb->lkb_flags &= ~DLM_IFL_RESEND;
        list_del(&lkb->lkb_wait_reply);
        unhold_lkb(lkb);
 out:
@@ -810,7 +816,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
                list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
                                            res_hashchain) {
                        if (!time_after_eq(jiffies, r->res_toss_time +
-                                           dlm_config.toss_secs * HZ))
+                                           dlm_config.ci_toss_secs * HZ))
                                continue;
                        found = 1;
                        break;
@@ -2144,12 +2150,24 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
        if (lkb->lkb_astaddr)
                ms->m_asts |= AST_COMP;
-        if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
+        /* compare with switch in create_message; send_remove() doesn't
-                memcpy(ms->m_extra, r->res_name, r->res_length);
+           use send_args() */
-        else if (lkb->lkb_lvbptr)
+        switch (ms->m_type) {
+        case DLM_MSG_REQUEST:
+        case DLM_MSG_LOOKUP:
+                memcpy(ms->m_extra, r->res_name, r->res_length);
+                break;
+        case DLM_MSG_CONVERT:
+        case DLM_MSG_UNLOCK:
+        case DLM_MSG_REQUEST_REPLY:
+        case DLM_MSG_CONVERT_REPLY:
+        case DLM_MSG_GRANT:
+                if (!lkb->lkb_lvbptr)
+                        break;
                memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+                break;
+        }
 }
 static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
@@ -2418,8 +2436,12 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
        DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
-        if (receive_lvb(ls, lkb, ms))
+        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
-                return -ENOMEM;
+                /* lkb was just created so there won't be an lvb yet */
+                lkb->lkb_lvbptr = allocate_lvb(ls);
+                if (!lkb->lkb_lvbptr)
+                        return -ENOMEM;
+        }
        return 0;
 }
@@ -3002,7 +3024,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 {
        struct dlm_message *ms = (struct dlm_message *) hd;
        struct dlm_ls *ls;
-        int error;
+        int error = 0;
        if (!recovery)
                dlm_message_in(ms);
@@ -3119,7 +3141,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 out:
        dlm_put_lockspace(ls);
        dlm_astd_wake();
-        return 0;
+        return error;
 }
@@ -3132,6 +3154,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
        if (middle_conversion(lkb)) {
                hold_lkb(lkb);
                ls->ls_stub_ms.m_result = -EINPROGRESS;
+                ls->ls_stub_ms.m_flags = lkb->lkb_flags;
                _remove_from_waiters(lkb);
                _receive_convert_reply(lkb, &ls->ls_stub_ms);
@@ -3205,6 +3228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_UNLOCK:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
                        _remove_from_waiters(lkb);
                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
@@ -3213,6 +3237,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_CANCEL:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
                        _remove_from_waiters(lkb);
                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
@@ -3571,6 +3596,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
        lock_rsb(r);
        switch (error) {
+        case -EBADR:
+                /* There's a chance the new master received our lock before
+                   dlm_recover_master_reply(), this wouldn't happen if we did
+                   a barrier between recover_masters and recover_locks. */
+                log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
+                          (unsigned long)r, r->res_name);
+                dlm_send_rcom_lock(r, lkb);
+                goto out;
        case -EEXIST:
                log_debug(ls, "master copy exists %x", lkb->lkb_id);
                /* fall through */
@@ -3585,7 +3618,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
        /* an ack for dlm_recover_locks() which waits for replies from
           all the locks it sends to new masters */
        dlm_recovered_lock(r);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3610,7 +3643,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
        }
        if (flags & DLM_LKF_VALBLK) {
-                ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
                if (!ua->lksb.sb_lvbptr) {
                        kfree(ua);
                        __put_lkb(ls, lkb);
@@ -3679,7 +3712,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
        ua = (struct dlm_user_args *)lkb->lkb_astparam;
        if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
-                ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
                if (!ua->lksb.sb_lvbptr) {
                        error = -ENOMEM;
                        goto out_put;
@@ -3745,12 +3778,10 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
                goto out_put;
        spin_lock(&ua->proc->locks_spin);
-        list_del_init(&lkb->lkb_ownqueue);
+        /* dlm_user_add_ast() may have already taken lkb off the proc list */
+        if (!list_empty(&lkb->lkb_ownqueue))
+                list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
        spin_unlock(&ua->proc->locks_spin);
-        /* this removes the reference for the proc->locks list added by
-           dlm_user_request */
-        unhold_lkb(lkb);
 out_put:
        dlm_put_lkb(lkb);
 out:
@@ -3790,9 +3821,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
        /* this lkb was removed from the WAITING queue */
        if (lkb->lkb_grmode == DLM_LOCK_IV) {
                spin_lock(&ua->proc->locks_spin);
-                list_del_init(&lkb->lkb_ownqueue);
+                list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
                spin_unlock(&ua->proc->locks_spin);
-                unhold_lkb(lkb);
        }
 out_put:
        dlm_put_lkb(lkb);
@@ -3853,11 +3883,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
        mutex_lock(&ls->ls_clear_proc_locks);
        list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
-                if (lkb->lkb_ast_type) {
-                        list_del(&lkb->lkb_astqueue);
-                        unhold_lkb(lkb);
-                }
                list_del_init(&lkb->lkb_ownqueue);
                if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
@@ -3874,6 +3899,20 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
                dlm_put_lkb(lkb);
        }
+        /* in-progress unlocks */
+        list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+                list_del_init(&lkb->lkb_ownqueue);
+                lkb->lkb_flags |= DLM_IFL_DEAD;
+                dlm_put_lkb(lkb);
+        }
+        list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+                list_del(&lkb->lkb_astqueue);
+                dlm_put_lkb(lkb);
+        }
        mutex_unlock(&ls->ls_clear_proc_locks);
        unlock_recovery(ls);
 }
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 59012b089e8d..f40817b53c6f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -236,7 +236,7 @@ static int dlm_scand(void *data)
        while (!kthread_should_stop()) {
                list_for_each_entry(ls, &lslist, ls_list)
                        dlm_scan_rsbs(ls);
-                schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+                schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
        }
        return 0;
 }
@@ -422,7 +422,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        ls->ls_count = 0;
        ls->ls_flags = 0;
-        size = dlm_config.rsbtbl_size;
+        size = dlm_config.ci_rsbtbl_size;
        ls->ls_rsbtbl_size = size;
        ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
@@ -434,7 +434,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
                rwlock_init(&ls->ls_rsbtbl[i].lock);
        }
-        size = dlm_config.lkbtbl_size;
+        size = dlm_config.ci_lkbtbl_size;
        ls->ls_lkbtbl_size = size;
        ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
@@ -446,7 +446,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
                ls->ls_lkbtbl[i].counter = 1;
        }
-        size = dlm_config.dirtbl_size;
+        size = dlm_config.ci_dirtbl_size;
        ls->ls_dirtbl_size = size;
        ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
@@ -489,7 +489,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        mutex_init(&ls->ls_requestqueue_mutex);
        mutex_init(&ls->ls_clear_proc_locks);
-        ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+        ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
        if (!ls->ls_recover_buf)
                goto out_dirfree;
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
index fe158d7a9285..dc83a9d979b5 100644
--- a/fs/dlm/lowcomms-sctp.c
+++ b/fs/dlm/lowcomms-sctp.c
@@ -72,6 +72,8 @@ struct nodeinfo {
        struct list_head        writequeue; /* outgoing writequeue_entries */
        spinlock_t              writequeue_lock;
        int                     nodeid;
+        struct work_struct      swork; /* Send workqueue */
+        struct work_struct      lwork; /* Locking workqueue */
 };
 static DEFINE_IDR(nodeinfo_idr);
@@ -96,6 +98,7 @@ struct connection {
        atomic_t                waiting_requests;
        struct cbuf             cb;
        int                     eagain_flag;
+        struct work_struct      work; /* Send workqueue */
 };
 /* An entry waiting to be sent */
@@ -137,19 +140,23 @@ static void cbuf_eat(struct cbuf *cb, int n)
 static LIST_HEAD(write_nodes);
 static DEFINE_SPINLOCK(write_nodes_lock);
 /* Maximum number of incoming messages to process before
 * doing a schedule()
 */
 #define MAX_RX_MSG_COUNT 25
-/* Manage daemons */
+/* Work queues */
-static struct task_struct *recv_task;
+static struct workqueue_struct *recv_workqueue;
-static struct task_struct *send_task;
+static struct workqueue_struct *send_workqueue;
-static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait);
+static struct workqueue_struct *lock_workqueue;
 /* The SCTP connection */
 static struct connection sctp_con;
+static void process_send_sockets(struct work_struct *work);
+static void process_recv_sockets(struct work_struct *work);
+static void process_lock_request(struct work_struct *work);
 static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
 {
@@ -222,6 +229,8 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
        spin_lock_init(&ni->lock);
        INIT_LIST_HEAD(&ni->writequeue);
        spin_lock_init(&ni->writequeue_lock);
+        INIT_WORK(&ni->lwork, process_lock_request);
+        INIT_WORK(&ni->swork, process_send_sockets);
        ni->nodeid = nodeid;
        if (nodeid > max_nodeid)
@@ -249,11 +258,8 @@ static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
 /* Data or notification available on socket */
 static void lowcomms_data_ready(struct sock *sk, int count_unused)
 {
-        atomic_inc(&sctp_con.waiting_requests);
-        if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
-                return;
-        wake_up_interruptible(&lowcomms_recv_wait);
+        if (!test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+                queue_work(recv_workqueue, &sctp_con.work);
 }
@@ -361,10 +367,10 @@ static void init_failed(void)
                                spin_lock_bh(&write_nodes_lock);
                                list_add_tail(&ni->write_list, &write_nodes);
                                spin_unlock_bh(&write_nodes_lock);
+                                queue_work(send_workqueue, &ni->swork);
                        }
                }
        }
-        wake_up_process(send_task);
 }
 /* Something happened to an association */
@@ -446,8 +452,8 @@ static void process_sctp_notification(struct msghdr *msg, char *buf)
                                spin_lock_bh(&write_nodes_lock);
                                list_add_tail(&ni->write_list, &write_nodes);
                                spin_unlock_bh(&write_nodes_lock);
+                                queue_work(send_workqueue, &ni->swork);
                        }
-                        wake_up_process(send_task);
                }
                break;
@@ -580,8 +586,8 @@ static int receive_from_sock(void)
                                spin_lock_bh(&write_nodes_lock);
                                list_add_tail(&ni->write_list, &write_nodes);
                                spin_unlock_bh(&write_nodes_lock);
+                                queue_work(send_workqueue, &ni->swork);
                        }
-                        wake_up_process(send_task);
                }
        }
@@ -590,6 +596,7 @@ static int receive_from_sock(void)
                return 0;
        cbuf_add(&sctp_con.cb, ret);
+        /* PJC: TODO: add to the node's workqueue -- can we? */
        ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
                                          page_address(sctp_con.rx_page),
                                          sctp_con.cb.base, sctp_con.cb.len,
@@ -635,7 +642,7 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
        if (result < 0)
                log_print("Can't bind to port %d addr number %d",
-                          dlm_config.tcp_port, num);
+                          dlm_config.ci_tcp_port, num);
        return result;
 }
@@ -711,7 +718,7 @@ static int init_sock(void)
        /* Bind to all interfaces. */
        for (i = 0; i < dlm_local_count; i++) {
                memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-                make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+                make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
                result = add_bind_addr(&localaddr, addr_len, num);
                if (result)
@@ -820,7 +827,8 @@ void dlm_lowcomms_commit_buffer(void *arg)
                spin_lock_bh(&write_nodes_lock);
                list_add_tail(&ni->write_list, &write_nodes);
                spin_unlock_bh(&write_nodes_lock);
-                wake_up_process(send_task);
+                queue_work(send_workqueue, &ni->swork);
        }
        return;
@@ -863,7 +871,7 @@ static void initiate_association(int nodeid)
                return;
        }
-        make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+        make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
        outmessage.msg_name = &rem_addr;
        outmessage.msg_namelen = addrlen;
@@ -1088,101 +1096,75 @@ int dlm_lowcomms_close(int nodeid)
        return 0;
 }
-static int write_list_empty(void)
+/* PJC: The work queue function for receiving. */
+static void process_recv_sockets(struct work_struct *work)
 {
-        int status;
+        if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+                int ret;
-        spin_lock_bh(&write_nodes_lock);
-        status = list_empty(&write_nodes);
-        spin_unlock_bh(&write_nodes_lock);
-        return status;
-}
-static int dlm_recvd(void *data)
-{
-        DECLARE_WAITQUEUE(wait, current);
-        while (!kthread_should_stop()) {
                int count = 0;
-                set_current_state(TASK_INTERRUPTIBLE);
+                do {
-                add_wait_queue(&lowcomms_recv_wait, &wait);
+                        ret = receive_from_sock();
-                if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
-                        cond_resched();
-                remove_wait_queue(&lowcomms_recv_wait, &wait);
-                set_current_state(TASK_RUNNING);
-                if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
-                        int ret;
-                        do {
-                                ret = receive_from_sock();
-                                /* Don't starve out everyone else */
+                        /* Don't starve out everyone else */
-                                if (++count >= MAX_RX_MSG_COUNT) {
+                        if (++count >= MAX_RX_MSG_COUNT) {
-                                        cond_resched();
+                                cond_resched();
-                                        count = 0;
+                                count = 0;
-                                }
+                        }
-                        } while (!kthread_should_stop() && ret >=0);
+                } while (!kthread_should_stop() && ret >=0);
-                }
-                cond_resched();
        }
+        cond_resched();
-        return 0;
 }
-static int dlm_sendd(void *data)
+/* PJC: The work queue function for sending. */
+static void process_send_sockets(struct work_struct *work)
 {
-        DECLARE_WAITQUEUE(wait, current);
+        if (sctp_con.eagain_flag) {
+                sctp_con.eagain_flag = 0;
-        add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+                refill_write_queue();
-        while (!kthread_should_stop()) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (write_list_empty())
-                        cond_resched();
-                set_current_state(TASK_RUNNING);
-                if (sctp_con.eagain_flag) {
-                        sctp_con.eagain_flag = 0;
-                        refill_write_queue();
-                }
-                process_output_queue();
        }
+        process_output_queue();
+}
-        remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+/* PJC: Process lock requests from a particular node. */
+/* TODO: can we optimise this out on UP (uniprocessor) builds? */
-        return 0;
+static void process_lock_request(struct work_struct *work)
+{
 }
 static void daemons_stop(void)
 {
-        kthread_stop(recv_task);
+        destroy_workqueue(recv_workqueue);
-        kthread_stop(send_task);
+        destroy_workqueue(send_workqueue);
+        destroy_workqueue(lock_workqueue);
 }
 static int daemons_start(void)
 {
-        struct task_struct *p;
         int error;
+        recv_workqueue = create_workqueue("dlm_recv");
+        if (!recv_workqueue) {
+                error = -ENOMEM;
+                log_print("can't start dlm_recv %d", error);
+                return error;
+        }
-        p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
-        error = IS_ERR(p);
-        if (error) {
-                log_print("can't start dlm_recvd %d", error);
+        send_workqueue = create_singlethread_workqueue("dlm_send");
+        if (!send_workqueue) {
+                error = -ENOMEM;
+                log_print("can't start dlm_send %d", error);
+                destroy_workqueue(recv_workqueue);
                 return error;
         }
-        recv_task = p;
-        p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
-        error = IS_ERR(p);
-        if (error) {
-                log_print("can't start dlm_sendd %d", error);
-                kthread_stop(recv_task);
+        lock_workqueue = create_workqueue("dlm_rlock");
+        if (!lock_workqueue) {
+                error = -ENOMEM;
+                log_print("can't start dlm_rlock %d", error);
+                destroy_workqueue(send_workqueue);
+                destroy_workqueue(recv_workqueue);
                 return error;
         }
-        send_task = p;
         return 0;
 }
@@ -1194,6 +1176,8 @@ int dlm_lowcomms_start(void)
 {
        int error;
+        INIT_WORK(&sctp_con.work, process_recv_sockets);
        error = init_sock();
        if (error)
                goto fail_sock;
@@ -1224,4 +1208,3 @@ void dlm_lowcomms_stop(void)
        for (i = 0; i < dlm_local_count; i++)
                kfree(dlm_local_addr[i]);
 }
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
index 9be3a440c42a..f1efd17b2614 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms-tcp.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -96,10 +96,7 @@ static bool cbuf_empty(struct cbuf *cb)
 struct connection {
        struct socket *sock;    /* NULL if not connected */
        uint32_t nodeid;        /* So we know who we are in the list */
-        struct rw_semaphore sock_sem; /* Stop connect races */
+        struct mutex sock_mutex;
-        struct list_head read_list;   /* On this list when ready for reading */
-        struct list_head write_list;  /* On this list when ready for writing */
-        struct list_head state_list;  /* On this list when ready to connect */
        unsigned long flags;    /* bit 1,2 = We are on the read/write lists */
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
@@ -112,9 +109,10 @@ struct connection {
        struct page *rx_page;
        struct cbuf cb;
        int retries;
-        atomic_t waiting_requests;
 #define MAX_CONNECT_RETRIES 3
        struct connection *othercon;
+        struct work_struct rwork; /* Receive workqueue */
+        struct work_struct swork; /* Send workqueue */
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -131,14 +129,9 @@ struct writequeue_entry {
 static struct sockaddr_storage dlm_local_addr;
-/* Manage daemons */
+/* Work queues */
-static struct task_struct *recv_task;
+static struct workqueue_struct *recv_workqueue;
-static struct task_struct *send_task;
+static struct workqueue_struct *send_workqueue;
-static wait_queue_t lowcomms_send_waitq_head;
-static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq);
-static wait_queue_t lowcomms_recv_waitq_head;
-static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq);
 /* An array of pointers to connections, indexed by NODEID */
 static struct connection **connections;
@@ -146,17 +139,8 @@ static DECLARE_MUTEX(connections_lock);
 static struct kmem_cache *con_cache;
 static int conn_array_size;
-/* List of sockets that have reads pending */
+static void process_recv_sockets(struct work_struct *work);
-static LIST_HEAD(read_sockets);
+static void process_send_sockets(struct work_struct *work);
-static DEFINE_SPINLOCK(read_sockets_lock);
-/* List of sockets which have writes pending */
-static LIST_HEAD(write_sockets);
-static DEFINE_SPINLOCK(write_sockets_lock);
-/* List of sockets which have connects pending */
-static LIST_HEAD(state_sockets);
-static DEFINE_SPINLOCK(state_sockets_lock);
 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 {
@@ -186,9 +170,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
                        goto finish;
                con->nodeid = nodeid;
-                init_rwsem(&con->sock_sem);
+                mutex_init(&con->sock_mutex);
                INIT_LIST_HEAD(&con->writequeue);
                spin_lock_init(&con->writequeue_lock);
+                INIT_WORK(&con->swork, process_send_sockets);
+                INIT_WORK(&con->rwork, process_recv_sockets);
                connections[nodeid] = con;
        }
@@ -203,41 +189,22 @@ static void lowcomms_data_ready(struct sock *sk, int count_unused)
 {
        struct connection *con = sock2con(sk);
-        atomic_inc(&con->waiting_requests);
+        if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
-        if (test_and_set_bit(CF_READ_PENDING, &con->flags))
+                queue_work(recv_workqueue, &con->rwork);
-                return;
-        spin_lock_bh(&read_sockets_lock);
-        list_add_tail(&con->read_list, &read_sockets);
-        spin_unlock_bh(&read_sockets_lock);
-        wake_up_interruptible(&lowcomms_recv_waitq);
 }
 static void lowcomms_write_space(struct sock *sk)
 {
        struct connection *con = sock2con(sk);
-        if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+        if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
-                return;
+                queue_work(send_workqueue, &con->swork);
-        spin_lock_bh(&write_sockets_lock);
-        list_add_tail(&con->write_list, &write_sockets);
-        spin_unlock_bh(&write_sockets_lock);
-        wake_up_interruptible(&lowcomms_send_waitq);
 }
 static inline void lowcomms_connect_sock(struct connection *con)
 {
-        if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
+        if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
-                return;
+                queue_work(send_workqueue, &con->swork);
-        spin_lock_bh(&state_sockets_lock);
-        list_add_tail(&con->state_list, &state_sockets);
-        spin_unlock_bh(&state_sockets_lock);
-        wake_up_interruptible(&lowcomms_send_waitq);
 }
 static void lowcomms_state_change(struct sock *sk)
@@ -279,7 +246,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 /* Close a remote connection and tidy up */
 static void close_connection(struct connection *con, bool and_other)
 {
-        down_write(&con->sock_sem);
+        mutex_lock(&con->sock_mutex);
        if (con->sock) {
                sock_release(con->sock);
@@ -294,7 +261,7 @@ static void close_connection(struct connection *con, bool and_other)
                con->rx_page = NULL;
        }
        con->retries = 0;
-        up_write(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
 }
 /* Data received from remote end */
@@ -308,10 +275,13 @@ static int receive_from_sock(struct connection *con)
        int r;
        int call_again_soon = 0;
-        down_read(&con->sock_sem);
+        mutex_lock(&con->sock_mutex);
+        if (con->sock == NULL) {
+                ret = -EAGAIN;
+                goto out_close;
+        }
-        if (con->sock == NULL)
-                goto out;
        if (con->rx_page == NULL) {
                /*
                 * This doesn't need to be atomic, but I think it should
@@ -359,6 +329,9 @@ static int receive_from_sock(struct connection *con)
        if (ret <= 0)
                goto out_close;
+        if (ret == -EAGAIN)
+                goto out_resched;
        if (ret == len)
                call_again_soon = 1;
        cbuf_add(&con->cb, ret);
@@ -381,24 +354,26 @@ static int receive_from_sock(struct connection *con)
                con->rx_page = NULL;
        }
-out:
        if (call_again_soon)
                goto out_resched;
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        return 0;
 out_resched:
-        lowcomms_data_ready(con->sock->sk, 0);
+        if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
-        up_read(&con->sock_sem);
+                queue_work(recv_workqueue, &con->rwork);
-        cond_resched();
+        mutex_unlock(&con->sock_mutex);
-        return 0;
+        return -EAGAIN;
 out_close:
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
                close_connection(con, false);
                /* Reconnect when there is something to send */
        }
+        /* Don't return success if we really got EOF */
+        if (ret == 0)
+                ret = -EAGAIN;
        return ret;
 }
@@ -412,6 +387,7 @@ static int accept_from_sock(struct connection *con)
        int len;
        int nodeid;
        struct connection *newcon;
+        struct connection *addcon;
        memset(&peeraddr, 0, sizeof(peeraddr));
        result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
@@ -419,7 +395,7 @@ static int accept_from_sock(struct connection *con)
        if (result < 0)
                return -ENOMEM;
-        down_read(&con->sock_sem);
+        mutex_lock_nested(&con->sock_mutex, 0);
        result = -ENOTCONN;
        if (con->sock == NULL)
@@ -445,7 +421,7 @@ static int accept_from_sock(struct connection *con)
        if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
                printk("dlm: connect from non cluster node\n");
                sock_release(newsock);
-                up_read(&con->sock_sem);
+                mutex_unlock(&con->sock_mutex);
                return -1;
        }
@@ -462,7 +438,7 @@ static int accept_from_sock(struct connection *con)
                result = -ENOMEM;
                goto accept_err;
        }
-        down_write(&newcon->sock_sem);
+        mutex_lock_nested(&newcon->sock_mutex, 1);
        if (newcon->sock) {
                struct connection *othercon = newcon->othercon;
@@ -470,41 +446,45 @@ static int accept_from_sock(struct connection *con)
                        othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
                        if (!othercon) {
                                printk("dlm: failed to allocate incoming socket\n");
-                                up_write(&newcon->sock_sem);
+                                mutex_unlock(&newcon->sock_mutex);
                                result = -ENOMEM;
                                goto accept_err;
                        }
                        othercon->nodeid = nodeid;
                        othercon->rx_action = receive_from_sock;
-                        init_rwsem(&othercon->sock_sem);
+                        mutex_init(&othercon->sock_mutex);
+                        INIT_WORK(&othercon->swork, process_send_sockets);
+                        INIT_WORK(&othercon->rwork, process_recv_sockets);
                        set_bit(CF_IS_OTHERCON, &othercon->flags);
                        newcon->othercon = othercon;
                }
                othercon->sock = newsock;
                newsock->sk->sk_user_data = othercon;
                add_sock(newsock, othercon);
+                addcon = othercon;
        }
        else {
                newsock->sk->sk_user_data = newcon;
                newcon->rx_action = receive_from_sock;
                add_sock(newsock, newcon);
+                addcon = newcon;
        }
-        up_write(&newcon->sock_sem);
+        mutex_unlock(&newcon->sock_mutex);
        /*
         * Add it to the active queue in case we got data
         * beween processing the accept adding the socket
         * to the read_sockets list
         */
-        lowcomms_data_ready(newsock->sk, 0);
+        if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
-        up_read(&con->sock_sem);
+                queue_work(recv_workqueue, &addcon->rwork);
+        mutex_unlock(&con->sock_mutex);
        return 0;
 accept_err:
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        sock_release(newsock);
        if (result != -EAGAIN)
@@ -525,7 +505,7 @@ static void connect_to_sock(struct connection *con)
                return;
        }
-        down_write(&con->sock_sem);
+        mutex_lock(&con->sock_mutex);
        if (con->retries++ > MAX_CONNECT_RETRIES)
                goto out;
@@ -548,7 +528,7 @@ static void connect_to_sock(struct connection *con)
        sock->sk->sk_user_data = con;
        con->rx_action = receive_from_sock;
-        make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
+        make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
        add_sock(sock, con);
@@ -577,7 +557,7 @@ out_err:
                result = 0;
        }
 out:
-        up_write(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        return;
 }
@@ -616,10 +596,10 @@ static struct socket *create_listen_sock(struct connection *con,
        con->sock = sock;
        /* Bind to our port */
-        make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
+        make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
        result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
        if (result < 0) {
-                printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
+                printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
                sock_release(sock);
                sock = NULL;
                con->sock = NULL;
@@ -638,7 +618,7 @@ static struct socket *create_listen_sock(struct connection *con,
        result = sock->ops->listen(sock, 5);
        if (result < 0) {
-                printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
+                printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
                sock_release(sock);
                sock = NULL;
                goto create_out;
@@ -709,6 +689,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len,
        if (!con)
                return NULL;
+        spin_lock(&con->writequeue_lock);
        e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
        if ((&e->list == &con->writequeue) ||
            (PAGE_CACHE_SIZE - e->end < len)) {
@@ -747,6 +728,7 @@ void dlm_lowcomms_commit_buffer(void *mh)
        struct connection *con = e->con;
        int users;
+        spin_lock(&con->writequeue_lock);
        users = --e->users;
        if (users)
                goto out;
@@ -754,12 +736,8 @@ void dlm_lowcomms_commit_buffer(void *mh)
        kunmap(e->page);
        spin_unlock(&con->writequeue_lock);
-        if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
+        if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
-                spin_lock_bh(&write_sockets_lock);
+                queue_work(send_workqueue, &con->swork);
-                list_add_tail(&con->write_list, &write_sockets);
-                spin_unlock_bh(&write_sockets_lock);
-                wake_up_interruptible(&lowcomms_send_waitq);
        }
        return;
@@ -783,7 +761,7 @@ static void send_to_sock(struct connection *con)
        struct writequeue_entry *e;
        int len, offset;
-        down_read(&con->sock_sem);
+        mutex_lock(&con->sock_mutex);
        if (con->sock == NULL)
                goto out_connect;
@@ -800,6 +778,7 @@ static void send_to_sock(struct connection *con)
                offset = e->offset;
                BUG_ON(len == 0 && e->users == 0);
                spin_unlock(&con->writequeue_lock);
+                kmap(e->page);
                ret = 0;
                if (len) {
@@ -828,18 +807,18 @@ static void send_to_sock(struct connection *con)
        }
        spin_unlock(&con->writequeue_lock);
 out:
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        return;
 send_error:
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
        close_connection(con, false);
        lowcomms_connect_sock(con);
        return;
 out_connect:
-        up_read(&con->sock_sem);
+        mutex_unlock(&con->sock_mutex);
-        lowcomms_connect_sock(con);
+        connect_to_sock(con);
        return;
 }
@@ -872,7 +851,6 @@ int dlm_lowcomms_close(int nodeid)
        if (con) {
                clean_one_writequeue(con);
                close_connection(con, true);
-                atomic_set(&con->waiting_requests, 0);
        }
        return 0;
@@ -880,102 +858,29 @@ out:
        return -1;
 }
-/* API send message call, may queue the request */
-/* N.B. This is the old interface - use the new one for new calls */
-int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation)
-{
-        struct writequeue_entry *e;
-        char *b;
-        e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b);
-        if (e) {
-                memcpy(b, buf, len);
-                dlm_lowcomms_commit_buffer(e);
-                return 0;
-        }
-        return -ENOBUFS;
-}
 /* Look for activity on active sockets */
-static void process_sockets(void)
+static void process_recv_sockets(struct work_struct *work)
 {
-        struct list_head *list;
+        struct connection *con = container_of(work, struct connection, rwork);
-        struct list_head *temp;
+        int err;
-        int count = 0;
-        spin_lock_bh(&read_sockets_lock);
-        list_for_each_safe(list, temp, &read_sockets) {
-                struct connection *con =
+        clear_bit(CF_READ_PENDING, &con->flags);
-                        list_entry(list, struct connection, read_list);
+        do {
-                list_del(&con->read_list);
+                err = con->rx_action(con);
-                clear_bit(CF_READ_PENDING, &con->flags);
+        } while (!err);
-                spin_unlock_bh(&read_sockets_lock);
-                /* This can reach zero if we are processing requests
-                 * as they come in.
-                 */
-                if (atomic_read(&con->waiting_requests) == 0) {
-                        spin_lock_bh(&read_sockets_lock);
-                        continue;
-                }
-                do {
-                        con->rx_action(con);
-                        /* Don't starve out everyone else */
-                        if (++count >= MAX_RX_MSG_COUNT) {
-                                cond_resched();
-                                count = 0;
-                        }
-                } while (!atomic_dec_and_test(&con->waiting_requests) &&
-                         !kthread_should_stop());
-                spin_lock_bh(&read_sockets_lock);
-        }
-        spin_unlock_bh(&read_sockets_lock);
 }
-/* Try to send any messages that are pending
- */
-static void process_output_queue(void)
-{
-        struct list_head *list;
-        struct list_head *temp;
-        spin_lock_bh(&write_sockets_lock);
-        list_for_each_safe(list, temp, &write_sockets) {
-                struct connection *con =
-                        list_entry(list, struct connection, write_list);
-                clear_bit(CF_WRITE_PENDING, &con->flags);
-                list_del(&con->write_list);
-                spin_unlock_bh(&write_sockets_lock);
-                send_to_sock(con);
-                spin_lock_bh(&write_sockets_lock);
-        }
-        spin_unlock_bh(&write_sockets_lock);
-}
-static void process_state_queue(void)
+static void process_send_sockets(struct work_struct *work)
 {
-        struct list_head *list;
+        struct connection *con = container_of(work, struct connection, swork);
-        struct list_head *temp;
-        spin_lock_bh(&state_sockets_lock);
-        list_for_each_safe(list, temp, &state_sockets) {
-                struct connection *con =
-                        list_entry(list, struct connection, state_list);
-                list_del(&con->state_list);
-                clear_bit(CF_CONNECT_PENDING, &con->flags);
-                spin_unlock_bh(&state_sockets_lock);
+        if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
                connect_to_sock(con);
-                spin_lock_bh(&state_sockets_lock);
        }
-        spin_unlock_bh(&state_sockets_lock);
+        clear_bit(CF_WRITE_PENDING, &con->flags);
+        send_to_sock(con);
 }
@@ -992,109 +897,33 @@ static void clean_writequeues(void)
        }
 }
-static int read_list_empty(void)
+static void work_stop(void)
 {
-        int status;
+        destroy_workqueue(recv_workqueue);
+        destroy_workqueue(send_workqueue);
-        spin_lock_bh(&read_sockets_lock);
-        status = list_empty(&read_sockets);
-        spin_unlock_bh(&read_sockets_lock);
-        return status;
-}
-/* DLM Transport comms receive daemon */
-static int dlm_recvd(void *data)
-{
-        init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
-        add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
-        while (!kthread_should_stop()) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (read_list_empty())
-                        cond_resched();
-                set_current_state(TASK_RUNNING);
-                process_sockets();
-        }
-        return 0;
 }
-static int write_and_state_lists_empty(void)
+static int work_start(void)
 {
-        int status;
-        spin_lock_bh(&write_sockets_lock);
-        status = list_empty(&write_sockets);
-        spin_unlock_bh(&write_sockets_lock);
-        spin_lock_bh(&state_sockets_lock);
-        if (list_empty(&state_sockets) == 0)
-                status = 0;
-        spin_unlock_bh(&state_sockets_lock);
-        return status;
-}
-/* DLM Transport send daemon */
-static int dlm_sendd(void *data)
-{
-        init_waitqueue_entry(&lowcomms_send_waitq_head, current);
-        add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
-        while (!kthread_should_stop()) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (write_and_state_lists_empty())
-                        cond_resched();
-                set_current_state(TASK_RUNNING);
-                process_state_queue();
-                process_output_queue();
-        }
-        return 0;
-}
-static void daemons_stop(void)
-{
-        kthread_stop(recv_task);
-        kthread_stop(send_task);
-}
-static int daemons_start(void)
-{
-        struct task_struct *p;
         int error;
-        p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
-        error = IS_ERR(p);
-        if (error) {
-                log_print("can't start dlm_recvd %d", error);
+        recv_workqueue = create_workqueue("dlm_recv");
+        if (!recv_workqueue) {
+                error = -ENOMEM;
+                log_print("can't start dlm_recv %d", error);
                 return error;
         }
-        recv_task = p;
-        p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
-        error = IS_ERR(p);
-        if (error) {
-                log_print("can't start dlm_sendd %d", error);
-                kthread_stop(recv_task);
+        send_workqueue = create_singlethread_workqueue("dlm_send");
+        if (!send_workqueue) {
+                error = -ENOMEM;
+                log_print("can't start dlm_send %d", error);
+                destroy_workqueue(recv_workqueue);
                 return error;
         }
-        send_task = p;
         return 0;
 }
-/*
- * Return the largest buffer size we can cope with.
- */
-int lowcomms_max_buffer_size(void)
-{
-        return PAGE_CACHE_SIZE;
-}
 void dlm_lowcomms_stop(void)
 {
        int i;
@@ -1107,7 +936,7 @@ void dlm_lowcomms_stop(void)
                        connections[i]->flags |= 0xFF;
        }
-        daemons_stop();
+        work_stop();
        clean_writequeues();
        for (i = 0; i < conn_array_size; i++) {
@@ -1159,7 +988,7 @@ int dlm_lowcomms_start(void)
        if (error)
                goto fail_unlisten;
-        error = daemons_start();
+        error = work_start();
        if (error)
                goto fail_unlisten;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index c9b1c3d535f4..a5126e0c68a6 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -82,7 +82,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                if (msglen < sizeof(struct dlm_header))
                        break;
                err = -E2BIG;
-                if (msglen > dlm_config.buffer_size) {
+                if (msglen > dlm_config.ci_buffer_size) {
                        log_print("message size %d from %d too big, buf len %d",
                                  msglen, nodeid, len);
                        break;
@@ -103,7 +103,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                if (msglen > sizeof(__tmp) &&
                    msg == (struct dlm_header *) __tmp) {
-                        msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+                        msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
                        if (msg == NULL)
                                return ret;
                }
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4cc31be9cd9d..6bfbd6153809 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
        rc->rc_type = type;
+        spin_lock(&ls->ls_recover_lock);
+        rc->rc_seq = ls->ls_recover_seq;
+        spin_unlock(&ls->ls_recover_lock);
        *mh_ret = mh;
        *rc_ret = rc;
        return 0;
@@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
        rf->rf_lsflags = ls->ls_exflags;
 }
-static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 {
+        struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
+        if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+                log_error(ls, "version mismatch: %x nodeid %d: %x",
+                          DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
+                          rc->rc_header.h_version);
+                return -EINVAL;
+        }
        if (rf->rf_lvblen != ls->ls_lvblen ||
            rf->rf_lsflags != ls->ls_exflags) {
                log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -125,7 +138,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
                goto out;
        allow_sync_reply(ls, &rc->rc_id);
-        memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+        memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
        send_rcom(ls, mh, rc);
@@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
                log_debug(ls, "remote node %d not ready", nodeid);
                rc->rc_result = 0;
        } else
-                error = check_config(ls, (struct rcom_config *) rc->rc_buf,
+                error = check_config(ls, rc, nodeid);
-                                     nodeid);
        /* the caller looks at rc_result for the remote recovery status */
 out:
        return error;
@@ -159,6 +171,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        if (error)
                return;
        rc->rc_id = rc_in->rc_id;
+        rc->rc_seq_reply = rc_in->rc_seq;
        rc->rc_result = dlm_recover_status(ls);
        make_config(ls, (struct rcom_config *) rc->rc_buf);
@@ -200,7 +213,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
        if (nodeid == dlm_our_nodeid()) {
                dlm_copy_master_names(ls, last_name, last_len,
                                      ls->ls_recover_buf + len,
-                                      dlm_config.buffer_size - len, nodeid);
+                                      dlm_config.ci_buffer_size - len, nodeid);
                goto out;
        }
@@ -210,7 +223,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
        memcpy(rc->rc_buf, last_name, last_len);
        allow_sync_reply(ls, &rc->rc_id);
-        memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+        memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
        send_rcom(ls, mh, rc);
@@ -224,30 +237,17 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
        struct dlm_rcom *rc;
        struct dlm_mhandle *mh;
-        int error, inlen, outlen;
+        int error, inlen, outlen, nodeid;
-        int nodeid = rc_in->rc_header.h_nodeid;
-        uint32_t status = dlm_recover_status(ls);
-        /*
-         * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
-         * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
-         * It could only happen in rare cases where we get a late NAMES
-         * message from a previous instance of recovery.
-         */
-        if (!(status & DLM_RS_NODES)) {
-                log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
-                return;
-        }
        nodeid = rc_in->rc_header.h_nodeid;
        inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
-        outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+        outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
        error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
        if (error)
                return;
        rc->rc_id = rc_in->rc_id;
+        rc->rc_seq_reply = rc_in->rc_seq;
        dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
                              nodeid);
@@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
                ret_nodeid = error;
        rc->rc_result = ret_nodeid;
        rc->rc_id = rc_in->rc_id;
+        rc->rc_seq_reply = rc_in->rc_seq;
        send_rcom(ls, mh, rc);
 }
@@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
        rc->rc_id = rc_in->rc_id;
+        rc->rc_seq_reply = rc_in->rc_seq;
        send_rcom(ls, mh, rc);
 }
 static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
-        uint32_t status = dlm_recover_status(ls);
-        if (!(status & DLM_RS_DIR)) {
-                log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
-                          rc_in->rc_header.h_nodeid);
-                return;
-        }
        dlm_recover_process_copy(ls, rc_in);
 }
@@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
        rc->rc_type = DLM_RCOM_STATUS_REPLY;
        rc->rc_id = rc_in->rc_id;
+        rc->rc_seq_reply = rc_in->rc_seq;
        rc->rc_result = -ESRCH;
        rf = (struct rcom_config *) rc->rc_buf;
@@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
        return 0;
 }
+static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+        uint64_t seq;
+        int rv = 0;
+        switch (rc->rc_type) {
+        case DLM_RCOM_STATUS_REPLY:
+        case DLM_RCOM_NAMES_REPLY:
+        case DLM_RCOM_LOOKUP_REPLY:
+        case DLM_RCOM_LOCK_REPLY:
+                spin_lock(&ls->ls_recover_lock);
+                seq = ls->ls_recover_seq;
+                spin_unlock(&ls->ls_recover_lock);
+                if (rc->rc_seq_reply != seq) {
+                        log_debug(ls, "ignoring old reply %x from %d "
+                                      "seq_reply %llx expect %llx",
+                                      rc->rc_type, rc->rc_header.h_nodeid,
+                                      (unsigned long long)rc->rc_seq_reply,
+                                      (unsigned long long)seq);
+                        rv = 1;
+                }
+        }
+        return rv;
+}
 /* Called by dlm_recvd; corresponds to dlm_receive_message() but special
   recovery-only comms are sent through here. */
@@ -449,11 +469,14 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
        }
        if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
-                log_error(ls, "ignoring recovery message %x from %d",
+                log_debug(ls, "ignoring recovery message %x from %d",
                          rc->rc_type, nodeid);
                goto out;
        }
+        if (is_old_reply(ls, rc))
+                goto out;
        if (nodeid != rc->rc_header.h_nodeid) {
                log_error(ls, "bad rcom nodeid %d from %d",
                          rc->rc_header.h_nodeid, nodeid);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index cf9f6831bab5..c2cc7694cd16 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -44,7 +44,7 @@
 static void dlm_wait_timer_fn(unsigned long data)
 {
        struct dlm_ls *ls = (struct dlm_ls *) data;
-        mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+        mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ));
        wake_up(&ls->ls_wait_general);
 }
@@ -55,7 +55,7 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
        init_timer(&ls->ls_timer);
        ls->ls_timer.function = dlm_wait_timer_fn;
        ls->ls_timer.data = (long) ls;
-        ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+        ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ);
        add_timer(&ls->ls_timer);
        wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
@@ -397,7 +397,9 @@ int dlm_recover_masters(struct dlm_ls *ls)
                if (dlm_no_directory(ls))
                        count += recover_master_static(r);
-                else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
+                else if (!is_master(r) &&
+                         (dlm_is_removed(ls, r->res_nodeid) ||
+                          rsb_flag(r, RSB_NEW_MASTER))) {
                        recover_master(r);
                        count++;
                }
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 650536aa5139..3cb636d60249 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -77,7 +77,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        error = dlm_recover_members(ls, rv, &neg);
        if (error) {
-                log_error(ls, "recover_members failed %d", error);
+                log_debug(ls, "recover_members failed %d", error);
                goto fail;
        }
        start = jiffies;
@@ -89,7 +89,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        error = dlm_recover_directory(ls);
        if (error) {
-                log_error(ls, "recover_directory failed %d", error);
+                log_debug(ls, "recover_directory failed %d", error);
                goto fail;
        }
@@ -99,7 +99,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        error = dlm_recover_directory_wait(ls);
        if (error) {
-                log_error(ls, "recover_directory_wait failed %d", error);
+                log_debug(ls, "recover_directory_wait failed %d", error);
                goto fail;
        }
@@ -129,7 +129,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
                error = dlm_recover_masters(ls);
                if (error) {
-                        log_error(ls, "recover_masters failed %d", error);
+                        log_debug(ls, "recover_masters failed %d", error);
                        goto fail;
                }
@@ -139,13 +139,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
                error = dlm_recover_locks(ls);
                if (error) {
-                        log_error(ls, "recover_locks failed %d", error);
+                        log_debug(ls, "recover_locks failed %d", error);
                        goto fail;
                }
                error = dlm_recover_locks_wait(ls);
                if (error) {
-                        log_error(ls, "recover_locks_wait failed %d", error);
+                        log_debug(ls, "recover_locks_wait failed %d", error);
                        goto fail;
                }
@@ -166,7 +166,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
                error = dlm_recover_locks_wait(ls);
                if (error) {
-                        log_error(ls, "recover_locks_wait failed %d", error);
+                        log_debug(ls, "recover_locks_wait failed %d", error);
                        goto fail;
                }
        }
@@ -184,7 +184,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        dlm_set_recover_status(ls, DLM_RS_DONE);
        error = dlm_recover_done_wait(ls);
        if (error) {
-                log_error(ls, "recover_done_wait failed %d", error);
+                log_debug(ls, "recover_done_wait failed %d", error);
                goto fail;
        }
@@ -192,19 +192,19 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        error = enable_locking(ls, rv->seq);
        if (error) {
-                log_error(ls, "enable_locking failed %d", error);
+                log_debug(ls, "enable_locking failed %d", error);
                goto fail;
        }
        error = dlm_process_requestqueue(ls);
        if (error) {
-                log_error(ls, "process_requestqueue failed %d", error);
+                log_debug(ls, "process_requestqueue failed %d", error);
                goto fail;
        }
        error = dlm_recover_waiters_post(ls);
        if (error) {
-                log_error(ls, "recover_waiters_post failed %d", error);
+                log_debug(ls, "recover_waiters_post failed %d", error);
                goto fail;
        }
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index c37e93e4f2df..d378b7fe2a1e 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -180,6 +180,14 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
            ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
                remove_ownqueue = 1;
+        /* unlocks, and cancels of waiting requests, need to be removed from
+           the proc's unlocking list, again there must be a better way...  */
+        if (ua->lksb.sb_status == -DLM_EUNLOCK ||
+            (ua->lksb.sb_status == -DLM_ECANCEL &&
+             lkb->lkb_grmode == DLM_LOCK_IV))
+                remove_ownqueue = 1;
        /* We want to copy the lvb to userspace when the completion
           ast is read if the status is 0, the lock has an lvb and
           lvb_ops says we should.  We could probably have set_lvb_lock()
@@ -523,6 +531,7 @@ static int device_open(struct inode *inode, struct file *file)
        proc->lockspace = ls->ls_local_handle;
        INIT_LIST_HEAD(&proc->asts);
        INIT_LIST_HEAD(&proc->locks);
+        INIT_LIST_HEAD(&proc->unlocking);
        spin_lock_init(&proc->asts_spin);
        spin_lock_init(&proc->locks_spin);
        init_waitqueue_head(&proc->wait);
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 767197db9944..963889cf6740 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc)
        rc->rc_type             = cpu_to_le32(rc->rc_type);
        rc->rc_result           = cpu_to_le32(rc->rc_result);
        rc->rc_id               = cpu_to_le64(rc->rc_id);
+        rc->rc_seq              = cpu_to_le64(rc->rc_seq);
+        rc->rc_seq_reply        = cpu_to_le64(rc->rc_seq_reply);
        if (type == DLM_RCOM_LOCK)
                rcom_lock_out((struct rcom_lock *) rc->rc_buf);
@@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc)
        rc->rc_type             = le32_to_cpu(rc->rc_type);
        rc->rc_result           = le32_to_cpu(rc->rc_result);
        rc->rc_id               = le64_to_cpu(rc->rc_id);
+        rc->rc_seq              = le64_to_cpu(rc->rc_seq);
+        rc->rc_seq_reply        = le64_to_cpu(rc->rc_seq_reply);
        if (rc->rc_type == DLM_RCOM_LOCK)
                rcom_lock_in((struct rcom_lock *) rc->rc_buf);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 6a2ffa2db14f..de8e64c03f73 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,44 +4,43 @@ config GFS2_FS
        select FS_POSIX_ACL
        select CRC32
        help
-        A cluster filesystem.
+          A cluster filesystem.
-        Allows a cluster of computers to simultaneously use a block device
+          Allows a cluster of computers to simultaneously use a block device
-        that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
+          that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
-        and writes to the block device like a local filesystem, but also uses
+          and writes to the block device like a local filesystem, but also uses
-        a lock module to allow the computers coordinate their I/O so
+          a lock module to allow the computers to coordinate their I/O so
-        filesystem consistency is maintained.  One of the nifty features of
+          filesystem consistency is maintained.  One of the nifty features of
-        GFS is perfect consistency -- changes made to the filesystem on one
+          GFS is perfect consistency -- changes made to the filesystem on one
-        machine show up immediately on all other machines in the cluster.
+          machine show up immediately on all other machines in the cluster.
-        To use the GFS2 filesystem, you will need to enable one or more of
+          To use the GFS2 filesystem, you will need to enable one or more of
-        the below locking modules. Documentation and utilities for GFS2 can
+          the below locking modules. Documentation and utilities for GFS2 can
-        be found here: http://sources.redhat.com/cluster
+          be found here: http://sources.redhat.com/cluster
 config GFS2_FS_LOCKING_NOLOCK
        tristate "GFS2 \"nolock\" locking module"
        depends on GFS2_FS
        help
-        Single node locking module for GFS2.
+          Single node locking module for GFS2.
-        Use this module if you want to use GFS2 on a single node without
+          Use this module if you want to use GFS2 on a single node without
-        its clustering features. You can still take advantage of the
+          its clustering features. You can still take advantage of the
-        large file support, and upgrade to running a full cluster later on
+          large file support, and upgrade to running a full cluster later on
-        if required.
+          if required.
-        If you will only be using GFS2 in cluster mode, you do not need this
+          If you will only be using GFS2 in cluster mode, you do not need this
-        module.
+          module.
 config GFS2_FS_LOCKING_DLM
        tristate "GFS2 DLM locking module"
-        depends on GFS2_FS && NET && INET && (IPV6 || IPV6=n)
+        depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n)
        select IP_SCTP if DLM_SCTP
        select CONFIGFS_FS
        select DLM
        help
-        Multiple node locking module for GFS2
+          Multiple node locking module for GFS2
-        Most users of GFS2 will require this module. It provides the locking
-        interface between GFS2 and the DLM, which is required to use GFS2
-        in a cluster environment.
+          Most users of GFS2 will require this module. It provides the locking
+          interface between GFS2 and the DLM, which is required to use GFS2
+          in a cluster environment.
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8240c1ff94f4..113f6c9110c7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -773,7 +773,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
                        gfs2_free_data(ip, bstart, blen);
        }
-        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_dinode_out(ip, dibh->b_data);
@@ -848,7 +848,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
        }
        ip->i_di.di_size = size;
-        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (error)
@@ -963,7 +963,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
        if (gfs2_is_stuffed(ip)) {
                ip->i_di.di_size = size;
-                ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+                ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
@@ -975,7 +975,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
                if (!error) {
                        ip->i_di.di_size = size;
-                        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+                        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                        ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                        gfs2_dinode_out(ip, dibh->b_data);
@@ -1048,7 +1048,7 @@ static int trunc_end(struct gfs2_inode *ip)
                        ip->i_num.no_addr;
                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
        }
-        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 0fdcb7713cd9..c93ca8f361b5 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -131,7 +131,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
        memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
        if (ip->i_di.di_size < offset + size)
                ip->i_di.di_size = offset + size;
-        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
@@ -229,7 +229,7 @@ out:
        if (ip->i_di.di_size < offset + copied)
                ip->i_di.di_size = offset + copied;
-        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
@@ -1198,12 +1198,11 @@ static int compare_dents(const void *a, const void *b)
 */
 static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
-                           void *opaque, gfs2_filldir_t filldir,
+                           void *opaque, filldir_t filldir,
                           const struct gfs2_dirent **darr, u32 entries,
                           int *copied)
 {
        const struct gfs2_dirent *dent, *dent_next;
-        struct gfs2_inum_host inum;
        u64 off, off_next;
        unsigned int x, y;
        int run = 0;
@@ -1240,11 +1239,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
                        *offset = off;
                }
-                gfs2_inum_in(&inum, (char *)&dent->de_inum);
                error = filldir(opaque, (const char *)(dent + 1),
                                be16_to_cpu(dent->de_name_len),
-                                off, &inum,
+                                off, be64_to_cpu(dent->de_inum.no_addr),
                                be16_to_cpu(dent->de_type));
                if (error)
                        return 1;
@@ -1262,8 +1259,8 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 }
 static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
-                              gfs2_filldir_t filldir, int *copied,
+                              filldir_t filldir, int *copied, unsigned *depth,
-                              unsigned *depth, u64 leaf_no)
+                              u64 leaf_no)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct buffer_head *bh;
@@ -1343,7 +1340,7 @@ out:
 */
 static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
-                      gfs2_filldir_t filldir)
+                      filldir_t filldir)
 {
        struct gfs2_inode *dip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1402,7 +1399,7 @@ out:
 }
 int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-                  gfs2_filldir_t filldir)
+                  filldir_t filldir)
 {
        struct gfs2_inode *dip = GFS2_I(inode);
        struct dirent_gather g;
@@ -1568,7 +1565,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
                                break;
                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
                        ip->i_di.di_entries++;
-                        ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds();
+                        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                        gfs2_dinode_out(ip, bh->b_data);
                        brelse(bh);
                        error = 0;
@@ -1654,7 +1651,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
                gfs2_consist_inode(dip);
        gfs2_trans_add_bh(dip->i_gl, bh, 1);
        dip->i_di.di_entries--;
-        dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds();
+        dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_dinode_out(dip, bh->b_data);
        brelse(bh);
        mark_inode_dirty(&dip->i_inode);
@@ -1702,7 +1699,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
                gfs2_trans_add_bh(dip->i_gl, bh, 1);
        }
-        dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds();
+        dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_dinode_out(dip, bh->b_data);
        brelse(bh);
        return 0;
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index b21b33668a5b..48fe89046bba 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,30 +16,13 @@ struct inode;
 struct gfs2_inode;
 struct gfs2_inum;
-/**
- * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
- * @opaque: opaque data used by the function
- * @name: the name of the directory entry
- * @length: the length of the name
- * @offset: the entry's offset in the directory
- * @inum: the inode number the entry points to
- * @type: the type of inode the entry points to
- *
- * Returns: 0 on success, 1 if buffer full
- */
-typedef int (*gfs2_filldir_t) (void *opaque,
-                              const char *name, unsigned int length,
-                              u64 offset,
-                              struct gfs2_inum_host *inum, unsigned int type);
 int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
                    struct gfs2_inum_host *inum, unsigned int *type);
 int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
                 const struct gfs2_inum_host *inum, unsigned int type);
 int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
+int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-                  gfs2_filldir_t filldir);
+                  filldir_t filldir);
 int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
                   struct gfs2_inum_host *new_inum, unsigned int new_type);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index ebebbdcd7057..0c83c7f4dda8 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -301,7 +301,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (!error) {
-                ip->i_inode.i_ctime.tv_sec = get_seconds();
+                ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
                brelse(dibh);
@@ -718,7 +718,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                                            (er->er_mode & S_IFMT));
                        ip->i_inode.i_mode = er->er_mode;
                }
-                ip->i_inode.i_ctime.tv_sec = get_seconds();
+                ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
                brelse(dibh);
@@ -853,7 +853,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
                        (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
                ip->i_inode.i_mode = er->er_mode;
        }
-        ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
@@ -1134,7 +1134,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (!error) {
-                ip->i_inode.i_ctime.tv_sec = get_seconds();
+                ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
                brelse(dibh);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 438146904b58..6618c1190252 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,6 +19,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/list.h>
 #include <linux/lm_interface.h>
+#include <linux/wait.h>
+#include <linux/rwsem.h>
 #include <asm/uaccess.h>
 #include "gfs2.h"
@@ -33,11 +35,6 @@
 #include "super.h"
 #include "util.h"
-struct greedy {
-        struct gfs2_holder gr_gh;
-        struct delayed_work gr_work;
-};
 struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
 };
@@ -47,6 +44,9 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl);
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
 static int dump_glock(struct gfs2_glock *gl);
 static int dump_inode(struct gfs2_inode *ip);
+static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
+static void gfs2_glock_drop_th(struct gfs2_glock *gl);
+static DECLARE_RWSEM(gfs2_umount_flush_sem);
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -213,30 +213,6 @@ out:
 }
 /**
- * queue_empty - check to see if a glock's queue is empty
- * @gl: the glock
- * @head: the head of the queue to check
- *
- * This function protects the list in the event that a process already
- * has a holder on the list and is adding a second holder for itself.
- * The glmutex lock is what generally prevents processes from working
- * on the same glock at once, but the special case of adding a second
- * holder for yourself ("recursive" locking) doesn't involve locking
- * glmutex, making the spin lock necessary.
- *
- * Returns: 1 if the queue is empty
- */
-static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
-{
-        int empty;
-        spin_lock(&gl->gl_spin);
-        empty = list_empty(head);
-        spin_unlock(&gl->gl_spin);
-        return empty;
-}
-/**
 * search_bucket() - Find struct gfs2_glock by lock number
 * @bucket: the bucket to search
 * @name: The lock name
@@ -395,11 +371,6 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
        gh->gh_flags = flags;
        gh->gh_error = 0;
        gh->gh_iflags = 0;
-        init_completion(&gh->gh_wait);
-        if (gh->gh_state == LM_ST_EXCLUSIVE)
-                gh->gh_flags |= GL_LOCAL_EXCL;
        gfs2_glock_hold(gl);
 }
@@ -417,9 +388,6 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 {
        gh->gh_state = state;
        gh->gh_flags = flags;
-        if (gh->gh_state == LM_ST_EXCLUSIVE)
-                gh->gh_flags |= GL_LOCAL_EXCL;
        gh->gh_iflags &= 1 << HIF_ALLOCED;
        gh->gh_ip = (unsigned long)__builtin_return_address(0);
 }
@@ -479,6 +447,29 @@ static void gfs2_holder_put(struct gfs2_holder *gh)
        kfree(gh);
 }
+/**
+ * gfs2_holder_dispose_or_wake - finish with a holder whose request is done
+ * @gh: the glock holder
+ *
+ * A holder flagged HIF_DEALLOC is handed to gfs2_holder_put(). Any other
+ * holder has HIF_WAIT cleared and the waiters on that bit are woken.
+ */
+static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
+{
+        if (!test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
+                clear_bit(HIF_WAIT, &gh->gh_iflags);
+                /* full barrier between clearing the bit and the wake-up */
+                smp_mb();
+                wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+                return;
+        }
+        gfs2_holder_put(gh);
+}
+/*
+ * Action routine that wait_on_holder() passes to wait_on_bit(): it calls
+ * schedule() and returns 0. The @word argument is not used.
+ */
+static int holder_wait(void *word)
+{
+        schedule();
+        return 0;
+}
+/**
+ * wait_on_holder - wait on the HIF_WAIT bit of a holder
+ * @gh: the glock holder
+ *
+ * Waits in TASK_UNINTERRUPTIBLE state on HIF_WAIT in gh->gh_iflags, using
+ * holder_wait() as the wait action. May sleep.
+ */
+static void wait_on_holder(struct gfs2_holder *gh)
+{
+        might_sleep();
+        wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE);
+}
 /**
 * rq_mutex - process a mutex request in the queue
 * @gh: the glock holder
@@ -493,7 +484,9 @@ static int rq_mutex(struct gfs2_holder *gh)
        list_del_init(&gh->gh_list);
        /*  gh->gh_error never examined.  */
        set_bit(GLF_LOCK, &gl->gl_flags);
-        complete(&gh->gh_wait);
+        clear_bit(HIF_WAIT, &gh->gh_iflags);
+        smp_mb();
+        wake_up_bit(&gh->gh_iflags, HIF_WAIT);
        return 1;
 }
@@ -511,7 +504,6 @@ static int rq_promote(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
        if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
                if (list_empty(&gl->gl_holders)) {
@@ -526,7 +518,7 @@ static int rq_promote(struct gfs2_holder *gh)
                                gfs2_reclaim_glock(sdp);
                        }
-                        glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+                        gfs2_glock_xmote_th(gh);
                        spin_lock(&gl->gl_spin);
                }
                return 1;
@@ -537,11 +529,11 @@ static int rq_promote(struct gfs2_holder *gh)
                set_bit(GLF_LOCK, &gl->gl_flags);
        } else {
                struct gfs2_holder *next_gh;
-                if (gh->gh_flags & GL_LOCAL_EXCL)
+                if (gh->gh_state == LM_ST_EXCLUSIVE)
                        return 1;
                next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
                                     gh_list);
-                if (next_gh->gh_flags & GL_LOCAL_EXCL)
+                if (next_gh->gh_state == LM_ST_EXCLUSIVE)
                         return 1;
        }
@@ -549,7 +541,7 @@ static int rq_promote(struct gfs2_holder *gh)
        gh->gh_error = 0;
        set_bit(HIF_HOLDER, &gh->gh_iflags);
-        complete(&gh->gh_wait);
+        gfs2_holder_dispose_or_wake(gh);
        return 0;
 }
@@ -564,7 +556,6 @@ static int rq_promote(struct gfs2_holder *gh)
 static int rq_demote(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
        if (!list_empty(&gl->gl_holders))
                return 1;
@@ -573,10 +564,7 @@ static int rq_demote(struct gfs2_holder *gh)
                list_del_init(&gh->gh_list);
                gh->gh_error = 0;
                spin_unlock(&gl->gl_spin);
-                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                gfs2_holder_dispose_or_wake(gh);
-                        gfs2_holder_put(gh);
-                else
-                        complete(&gh->gh_wait);
                spin_lock(&gl->gl_spin);
        } else {
                gl->gl_req_gh = gh;
@@ -585,9 +573,9 @@ static int rq_demote(struct gfs2_holder *gh)
                if (gh->gh_state == LM_ST_UNLOCKED ||
                    gl->gl_state != LM_ST_EXCLUSIVE)
-                        glops->go_drop_th(gl);
+                        gfs2_glock_drop_th(gl);
                else
-                        glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+                        gfs2_glock_xmote_th(gh);
                spin_lock(&gl->gl_spin);
        }
@@ -596,30 +584,6 @@ static int rq_demote(struct gfs2_holder *gh)
 }
 /**
- * rq_greedy - process a queued request to drop greedy status
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-static int rq_greedy(struct gfs2_holder *gh)
-{
-        struct gfs2_glock *gl = gh->gh_gl;
-        list_del_init(&gh->gh_list);
-        /*  gh->gh_error never examined.  */
-        clear_bit(GLF_GREEDY, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        gfs2_holder_uninit(gh);
-        kfree(container_of(gh, struct greedy, gr_gh));
-        spin_lock(&gl->gl_spin);
-        return 0;
-}
-/**
 * run_queue - process holder structures on a glock
 * @gl: the glock
 *
@@ -649,8 +613,6 @@ static void run_queue(struct gfs2_glock *gl)
                        if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
                                blocked = rq_demote(gh);
-                        else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
-                                blocked = rq_greedy(gh);
                        else
                                gfs2_assert_warn(gl->gl_sbd, 0);
@@ -684,6 +646,8 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
        gfs2_holder_init(gl, 0, 0, &gh);
        set_bit(HIF_MUTEX, &gh.gh_iflags);
+        if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
+                BUG();
        spin_lock(&gl->gl_spin);
        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
@@ -691,11 +655,13 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
        } else {
                gl->gl_owner = current;
                gl->gl_ip = (unsigned long)__builtin_return_address(0);
-                complete(&gh.gh_wait);
+                clear_bit(HIF_WAIT, &gh.gh_iflags);
+                smp_mb();
+                wake_up_bit(&gh.gh_iflags, HIF_WAIT);
        }
        spin_unlock(&gl->gl_spin);
-        wait_for_completion(&gh.gh_wait);
+        wait_on_holder(&gh);
        gfs2_holder_uninit(&gh);
 }
@@ -774,6 +740,7 @@ restart:
                        return;
                set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
                set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+                set_bit(HIF_WAIT, &new_gh->gh_iflags);
                goto restart;
        }
@@ -825,7 +792,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
        int op_done = 1;
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
        state_change(gl, ret & LM_OUT_ST_MASK);
@@ -908,12 +875,8 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
        gfs2_glock_put(gl);
-        if (gh) {
+        if (gh)
-                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                gfs2_holder_dispose_or_wake(gh);
-                        gfs2_holder_put(gh);
-                else
-                        complete(&gh->gh_wait);
-        }
 }
 /**
@@ -924,23 +887,26 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 *
 */
-void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+void gfs2_glock_xmote_th(struct gfs2_holder *gh)
 {
+        struct gfs2_glock *gl = gh->gh_gl;
        struct gfs2_sbd *sdp = gl->gl_sbd;
+        int flags = gh->gh_flags;
+        unsigned state = gh->gh_state;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
                                 LM_FLAG_NOEXP | LM_FLAG_ANY |
                                 LM_FLAG_PRIORITY);
        unsigned int lck_ret;
+        if (glops->go_xmote_th)
+                glops->go_xmote_th(gl);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
        gfs2_assert_warn(sdp, state != gl->gl_state);
-        if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
-                glops->go_sync(gl);
        gfs2_glock_hold(gl);
        gl->gl_req_bh = xmote_bh;
@@ -971,10 +937,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh = gl->gl_req_gh;
-        clear_bit(GLF_PREFETCH, &gl->gl_flags);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, !ret);
        state_change(gl, LM_ST_UNLOCKED);
@@ -1001,12 +965,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
        gfs2_glock_put(gl);
-        if (gh) {
+        if (gh)
-                if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+                gfs2_holder_dispose_or_wake(gh);
-                        gfs2_holder_put(gh);
-                else
-                        complete(&gh->gh_wait);
-        }
 }
 /**
@@ -1015,19 +975,19 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 *
 */
-void gfs2_glock_drop_th(struct gfs2_glock *gl)
+static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        unsigned int ret;
+        if (glops->go_drop_th)
+                glops->go_drop_th(gl);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-        if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
-                glops->go_sync(gl);
        gfs2_glock_hold(gl);
        gl->gl_req_bh = drop_bh;
@@ -1107,8 +1067,7 @@ static int glock_wait_internal(struct gfs2_holder *gh)
        if (gh->gh_flags & LM_FLAG_PRIORITY)
                do_cancels(gh);
-        wait_for_completion(&gh->gh_wait);
+        wait_on_holder(gh);
        if (gh->gh_error)
                return gh->gh_error;
@@ -1164,6 +1123,8 @@ static void add_to_queue(struct gfs2_holder *gh)
        struct gfs2_holder *existing;
        BUG_ON(!gh->gh_owner);
+        if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
+                BUG();
        existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
        if (existing) {
@@ -1227,8 +1188,6 @@ restart:
                }
        }
-        clear_bit(GLF_PREFETCH, &gl->gl_flags);
        return error;
 }
@@ -1321,98 +1280,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 }
 /**
- * gfs2_glock_prefetch - Try to prefetch a glock
- * @gl: the glock
- * @state: the state to prefetch in
- * @flags: flags passed to go_xmote_th()
- *
- */
-static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
-                                int flags)
-{
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        spin_lock(&gl->gl_spin);
-        if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
-            !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
-            !list_empty(&gl->gl_waiters3) ||
-            relaxed_state_ok(gl->gl_state, state, flags)) {
-                spin_unlock(&gl->gl_spin);
-                return;
-        }
-        set_bit(GLF_PREFETCH, &gl->gl_flags);
-        set_bit(GLF_LOCK, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        glops->go_xmote_th(gl, state, flags);
-}
-static void greedy_work(struct work_struct *work)
-{
-        struct greedy *gr = container_of(work, struct greedy, gr_work.work);
-        struct gfs2_holder *gh = &gr->gr_gh;
-        struct gfs2_glock *gl = gh->gh_gl;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
-        if (glops->go_greedy)
-                glops->go_greedy(gl);
-        spin_lock(&gl->gl_spin);
-        if (list_empty(&gl->gl_waiters2)) {
-                clear_bit(GLF_GREEDY, &gl->gl_flags);
-                spin_unlock(&gl->gl_spin);
-                gfs2_holder_uninit(gh);
-                kfree(gr);
-        } else {
-                gfs2_glock_hold(gl);
-                list_add_tail(&gh->gh_list, &gl->gl_waiters2);
-                run_queue(gl);
-                spin_unlock(&gl->gl_spin);
-                gfs2_glock_put(gl);
-        }
-}
-/**
- * gfs2_glock_be_greedy -
- * @gl:
- * @time:
- *
- * Returns: 0 if go_greedy will be called, 1 otherwise
- */
-int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
-{
-        struct greedy *gr;
-        struct gfs2_holder *gh;
-        if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
-            test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
-                return 1;
-        gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
-        if (!gr) {
-                clear_bit(GLF_GREEDY, &gl->gl_flags);
-                return 1;
-        }
-        gh = &gr->gr_gh;
-        gfs2_holder_init(gl, 0, 0, gh);
-        set_bit(HIF_GREEDY, &gh->gh_iflags);
-        INIT_DELAYED_WORK(&gr->gr_work, greedy_work);
-        set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
-        schedule_delayed_work(&gr->gr_work, time);
-        return 0;
-}
-/**
 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
 * @gh: the holder structure
 *
@@ -1470,10 +1337,7 @@ static int glock_compare(const void *arg_a, const void *arg_b)
                return 1;
        if (a->ln_number < b->ln_number)
                return -1;
-        if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+        BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type);
-                return 1;
-        if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
-                return 1;
        return 0;
 }
@@ -1618,34 +1482,6 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 }
 /**
- * gfs2_glock_prefetch_num - prefetch a glock based on lock number
- * @sdp: the filesystem
- * @number: the lock number
- * @glops: the glock operations for the type of glock
- * @state: the state to acquire the glock in
- * @flags: modifier flags for the aquisition
- *
- * Returns: errno
- */
-void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
-                             const struct gfs2_glock_operations *glops,
-                             unsigned int state, int flags)
-{
-        struct gfs2_glock *gl;
-        int error;
-        if (atomic_read(&sdp->sd_reclaim_count) <
-            gfs2_tune_get(sdp, gt_reclaim_limit)) {
-                error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
-                if (!error) {
-                        gfs2_glock_prefetch(gl, state, flags);
-                        gfs2_glock_put(gl);
-                }
-        }
-}
-/**
 * gfs2_lvb_hold - attach a LVB from a glock
 * @gl: The glock in question
 *
@@ -1703,8 +1539,6 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
        if (!gl)
                return;
-        if (gl->gl_ops->go_callback)
-                gl->gl_ops->go_callback(gl, state);
        handle_callback(gl, state);
        spin_lock(&gl->gl_spin);
@@ -1746,12 +1580,14 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                struct lm_async_cb *async = data;
                struct gfs2_glock *gl;
+                down_read(&gfs2_umount_flush_sem);
                gl = gfs2_glock_find(sdp, &async->lc_name);
                if (gfs2_assert_warn(sdp, gl))
                        return;
                if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
                        gl->gl_req_bh(gl, async->lc_ret);
                gfs2_glock_put(gl);
+                up_read(&gfs2_umount_flush_sem);
                return;
        }
@@ -1781,15 +1617,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
 static int demote_ok(struct gfs2_glock *gl)
 {
-        struct gfs2_sbd *sdp = gl->gl_sbd;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        int demote = 1;
        if (test_bit(GLF_STICKY, &gl->gl_flags))
                demote = 0;
-        else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
-                demote = time_after_eq(jiffies, gl->gl_stamp +
-                                    gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
        else if (glops->go_demote_ok)
                demote = glops->go_demote_ok(gl);
@@ -1845,7 +1677,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
        atomic_inc(&sdp->sd_reclaimed);
        if (gfs2_glmutex_trylock(gl)) {
-                if (queue_empty(gl, &gl->gl_holders) &&
+                if (list_empty(&gl->gl_holders) &&
                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
                        handle_callback(gl, LM_ST_UNLOCKED);
                gfs2_glmutex_unlock(gl);
@@ -1909,7 +1741,7 @@ static void scan_glock(struct gfs2_glock *gl)
                return;
        if (gfs2_glmutex_trylock(gl)) {
-                if (queue_empty(gl, &gl->gl_holders) &&
+                if (list_empty(&gl->gl_holders) &&
                    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
                        goto out_schedule;
                gfs2_glmutex_unlock(gl);
@@ -1958,7 +1790,7 @@ static void clear_glock(struct gfs2_glock *gl)
        }
        if (gfs2_glmutex_trylock(gl)) {
-                if (queue_empty(gl, &gl->gl_holders) &&
+                if (list_empty(&gl->gl_holders) &&
                    gl->gl_state != LM_ST_UNLOCKED)
                        handle_callback(gl, LM_ST_UNLOCKED);
                gfs2_glmutex_unlock(gl);
@@ -2000,7 +1832,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
                        t = jiffies;
                }
+                down_write(&gfs2_umount_flush_sem);
                invalidate_inodes(sdp->sd_vfs);
+                up_write(&gfs2_umount_flush_sem);
                msleep(10);
        }
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index fb39108fc05c..f50e40ceca43 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -20,7 +20,6 @@
 #define LM_FLAG_ANY             0x00000008
 #define LM_FLAG_PRIORITY        0x00000010 */
-#define GL_LOCAL_EXCL           0x00000020
 #define GL_ASYNC                0x00000040
 #define GL_EXACT                0x00000080
 #define GL_SKIP                 0x00000100
@@ -83,17 +82,11 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 void gfs2_holder_reinit(unsigned int state, unsigned flags,
                        struct gfs2_holder *gh);
 void gfs2_holder_uninit(struct gfs2_holder *gh);
-void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
-void gfs2_glock_drop_th(struct gfs2_glock *gl);
 int gfs2_glock_nq(struct gfs2_holder *gh);
 int gfs2_glock_poll(struct gfs2_holder *gh);
 int gfs2_glock_wait(struct gfs2_holder *gh);
 void gfs2_glock_dq(struct gfs2_holder *gh);
-int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
 void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
 int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
                      u64 number, const struct gfs2_glock_operations *glops,
@@ -103,10 +96,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
-void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
-                             const struct gfs2_glock_operations *glops,
-                             unsigned int state, int flags);
 /**
 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
 * @gl: the glock
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b068d10bcb6e..c4b0391b7aa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -117,12 +117,14 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
 static void meta_go_sync(struct gfs2_glock *gl)
 {
+        /*
+         * Installed as go_xmote_th/go_drop_th, which the glock core calls
+         * without looking at gl_state first, so the exclusive-state test
+         * is made here.
+         */
+        if (gl->gl_state != LM_ST_EXCLUSIVE)
+                return;
         if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
                 gfs2_log_flush(gl->gl_sbd, gl);
                 gfs2_meta_sync(gl);
                 gfs2_ail_empty_gl(gl);
         }
 }
 /**
@@ -142,6 +144,37 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 }
 /**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ *
+ */
+static void inode_go_sync(struct gfs2_glock *gl)
+{
+        struct gfs2_inode *ip = gl->gl_object;
+        /* page cache data is only written back for regular files */
+        if (ip && !S_ISREG(ip->i_inode.i_mode))
+                ip = NULL;
+        if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
+                gfs2_log_flush(gl->gl_sbd, gl);
+                /* start data writeback; waited for after gfs2_meta_sync() */
+                if (ip)
+                        filemap_fdatawrite(ip->i_inode.i_mapping);
+                gfs2_meta_sync(gl);
+                if (ip) {
+                        struct address_space *mapping = ip->i_inode.i_mapping;
+                        int error = filemap_fdatawait(mapping);
+                        /* record a writeback failure in the mapping flags */
+                        if (error == -ENOSPC)
+                                set_bit(AS_ENOSPC, &mapping->flags);
+                        else if (error)
+                                set_bit(AS_EIO, &mapping->flags);
+                }
+                /* GLF_DIRTY is cleared only after the syncs above */
+                clear_bit(GLF_DIRTY, &gl->gl_flags);
+                gfs2_ail_empty_gl(gl);
+        }
+}
+/**
 * inode_go_xmote_th - promote/demote a glock
 * @gl: the glock
 * @state: the requested state
@@ -149,12 +182,12 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 *
 */
-static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+static void inode_go_xmote_th(struct gfs2_glock *gl)
-                              int flags)
 {
        if (gl->gl_state != LM_ST_UNLOCKED)
                gfs2_pte_inval(gl);
-        gfs2_glock_xmote_th(gl, state, flags);
+        if (gl->gl_state == LM_ST_EXCLUSIVE)
+                inode_go_sync(gl);
 }
 /**
@@ -189,38 +222,8 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
 static void inode_go_drop_th(struct gfs2_glock *gl)
 {
        gfs2_pte_inval(gl);
-        gfs2_glock_drop_th(gl);
+        if (gl->gl_state == LM_ST_EXCLUSIVE)
-}
+                inode_go_sync(gl);
-/**
- * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
- * @gl: the glock protecting the inode
- *
- */
-static void inode_go_sync(struct gfs2_glock *gl)
-{
-        struct gfs2_inode *ip = gl->gl_object;
-        if (ip && !S_ISREG(ip->i_inode.i_mode))
-                ip = NULL;
-        if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-                gfs2_log_flush(gl->gl_sbd, gl);
-                if (ip)
-                        filemap_fdatawrite(ip->i_inode.i_mapping);
-                gfs2_meta_sync(gl);
-                if (ip) {
-                        struct address_space *mapping = ip->i_inode.i_mapping;
-                        int error = filemap_fdatawait(mapping);
-                        if (error == -ENOSPC)
-                                set_bit(AS_ENOSPC, &mapping->flags);
-                        else if (error)
-                                set_bit(AS_EIO, &mapping->flags);
-                }
-                clear_bit(GLF_DIRTY, &gl->gl_flags);
-                gfs2_ail_empty_gl(gl);
-        }
 }
 /**
@@ -295,7 +298,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
        if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
            (gl->gl_state == LM_ST_EXCLUSIVE) &&
-            (gh->gh_flags & GL_LOCAL_EXCL))
+            (gh->gh_state == LM_ST_EXCLUSIVE))
                error = gfs2_truncatei_resume(ip);
        return error;
@@ -319,39 +322,6 @@ static void inode_go_unlock(struct gfs2_holder *gh)
 }
 /**
- * inode_greedy -
- * @gl: the glock
- *
- */
-static void inode_greedy(struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_inode *ip = gl->gl_object;
-        unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
-        unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
-        unsigned int new_time;
-        spin_lock(&ip->i_spin);
-        if (time_after(ip->i_last_pfault + quantum, jiffies)) {
-                new_time = ip->i_greedy + quantum;
-                if (new_time > max)
-                        new_time = max;
-        } else {
-                new_time = ip->i_greedy - quantum;
-                if (!new_time || new_time > max)
-                        new_time = 1;
-        }
-        ip->i_greedy = new_time;
-        spin_unlock(&ip->i_spin);
-        iput(&ip->i_inode);
-}
-/**
 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
 * @gl: the glock
 *
@@ -398,8 +368,7 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 *
 */
-static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+static void trans_go_xmote_th(struct gfs2_glock *gl)
-                              int flags)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -408,8 +377,6 @@ static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
-        gfs2_glock_xmote_th(gl, state, flags);
 }
 /**
@@ -461,8 +428,6 @@ static void trans_go_drop_th(struct gfs2_glock *gl)
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
-        gfs2_glock_drop_th(gl);
 }
 /**
@@ -478,8 +443,8 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
 }
 const struct gfs2_glock_operations gfs2_meta_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
+        .go_xmote_th = meta_go_sync,
-        .go_drop_th = gfs2_glock_drop_th,
+        .go_drop_th = meta_go_sync,
        .go_type = LM_TYPE_META,
 };
@@ -487,19 +452,14 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
        .go_xmote_th = inode_go_xmote_th,
        .go_xmote_bh = inode_go_xmote_bh,
        .go_drop_th = inode_go_drop_th,
-        .go_sync = inode_go_sync,
        .go_inval = inode_go_inval,
        .go_demote_ok = inode_go_demote_ok,
        .go_lock = inode_go_lock,
        .go_unlock = inode_go_unlock,
-        .go_greedy = inode_greedy,
        .go_type = LM_TYPE_INODE,
 };
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
-        .go_sync = meta_go_sync,
+        /* stands in for the removed go_sync hook: flush on demote/drop */
+        .go_xmote_th = meta_go_sync,
+        .go_drop_th = meta_go_sync,
         .go_inval = meta_go_inval,
         .go_demote_ok = rgrp_go_demote_ok,
         .go_lock = rgrp_go_lock,
@@ -515,33 +475,23 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
 };
 const struct gfs2_glock_operations gfs2_iopen_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
        .go_type = LM_TYPE_IOPEN,
 };
 const struct gfs2_glock_operations gfs2_flock_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
        .go_type = LM_TYPE_FLOCK,
 };
 const struct gfs2_glock_operations gfs2_nondisk_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
        .go_type = LM_TYPE_NONDISK,
 };
 const struct gfs2_glock_operations gfs2_quota_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
        .go_demote_ok = quota_go_demote_ok,
        .go_type = LM_TYPE_QUOTA,
 };
 const struct gfs2_glock_operations gfs2_journal_glops = {
-        .go_xmote_th = gfs2_glock_xmote_th,
-        .go_drop_th = gfs2_glock_drop_th,
        .go_type = LM_TYPE_JOURNAL,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 734421edae85..12c80fd28db5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -101,17 +101,14 @@ struct gfs2_bufdata {
 };
 struct gfs2_glock_operations {
-        void (*go_xmote_th) (struct gfs2_glock *gl, unsigned int state, int flags);
+        void (*go_xmote_th) (struct gfs2_glock *gl);
        void (*go_xmote_bh) (struct gfs2_glock *gl);
        void (*go_drop_th) (struct gfs2_glock *gl);
        void (*go_drop_bh) (struct gfs2_glock *gl);
-        void (*go_sync) (struct gfs2_glock *gl);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
-        void (*go_callback) (struct gfs2_glock *gl, unsigned int state);
-        void (*go_greedy) (struct gfs2_glock *gl);
        const int go_type;
 };
@@ -120,7 +117,6 @@ enum {
        HIF_MUTEX               = 0,
        HIF_PROMOTE             = 1,
        HIF_DEMOTE              = 2,
-        HIF_GREEDY              = 3,
        /* States */
        HIF_ALLOCED             = 4,
@@ -128,6 +124,7 @@ enum {
        HIF_HOLDER              = 6,
        HIF_FIRST               = 7,
        HIF_ABORTED             = 9,
+        HIF_WAIT                = 10,
 };
 struct gfs2_holder {
@@ -140,17 +137,14 @@ struct gfs2_holder {
        int gh_error;
        unsigned long gh_iflags;
-        struct completion gh_wait;
        unsigned long gh_ip;
 };
 enum {
        GLF_LOCK                = 1,
        GLF_STICKY              = 2,
-        GLF_PREFETCH            = 3,
        GLF_DIRTY               = 5,
        GLF_SKIP_WAITERS2       = 6,
-        GLF_GREEDY              = 7,
 };
 struct gfs2_glock {
@@ -167,7 +161,7 @@ struct gfs2_glock {
        unsigned long gl_ip;
        struct list_head gl_holders;
        struct list_head gl_waiters1;   /* HIF_MUTEX */
-        struct list_head gl_waiters2;   /* HIF_DEMOTE, HIF_GREEDY */
+        struct list_head gl_waiters2;   /* HIF_DEMOTE */
        struct list_head gl_waiters3;   /* HIF_PROMOTE */
        const struct gfs2_glock_operations *gl_ops;
@@ -236,7 +230,6 @@ struct gfs2_inode {
        spinlock_t i_spin;
        struct rw_semaphore i_rw_mutex;
-        unsigned int i_greedy;
        unsigned long i_last_pfault;
        struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
@@ -418,17 +411,12 @@ struct gfs2_tune {
        unsigned int gt_atime_quantum; /* Min secs between atime updates */
        unsigned int gt_new_files_jdata;
        unsigned int gt_new_files_directio;
-        unsigned int gt_max_atomic_write; /* Split big writes into this size */
        unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
        unsigned int gt_lockdump_size;
        unsigned int gt_stall_secs; /* Detects trouble! */
        unsigned int gt_complain_secs;
        unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
        unsigned int gt_entries_per_readdir;
-        unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
-        unsigned int gt_greedy_default;
-        unsigned int gt_greedy_quantum;
-        unsigned int gt_greedy_max;
        unsigned int gt_statfs_quantum;
        unsigned int gt_statfs_slow;
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d122074c45e1..0d6831a40565 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -287,10 +287,8 @@ out:
 *
 * Returns: errno
 */
 int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 {
-        struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
        struct buffer_head *dibh;
        u32 nlink;
        int error;
@@ -315,42 +313,34 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
        else
                drop_nlink(&ip->i_inode);
-        ip->i_inode.i_ctime.tv_sec = get_seconds();
+        ip->i_inode.i_ctime = CURRENT_TIME_SEC;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
        mark_inode_dirty(&ip->i_inode);
-        if (ip->i_inode.i_nlink == 0) {
+        if (ip->i_inode.i_nlink == 0)
-                struct gfs2_rgrpd *rgd;
-                struct gfs2_holder ri_gh, rg_gh;
-                error = gfs2_rindex_hold(sdp, &ri_gh);
-                if (error)
-                        goto out;
-                error = -EIO;
-                rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
-                if (!rgd)
-                        goto out_norgrp;
-                error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
-                if (error)
-                        goto out_norgrp;
                gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
-                gfs2_glock_dq_uninit(&rg_gh);
-out_norgrp:
-                gfs2_glock_dq_uninit(&ri_gh);
-        }
-out:
        return error;
 }
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
 {
        struct qstr qstr;
+        struct inode *inode;
        gfs2_str2qstr(&qstr, name);
-        return gfs2_lookupi(dip, &qstr, 1, NULL);
+        inode = gfs2_lookupi(dip, &qstr, 1, NULL);
+        /* gfs2_lookupi has inconsistent callers: VFS-related
+         * routines expect NULL when no entry is found, whereas
+         * gfs2_lookup_simple callers expect ERR_PTR(-ENOENT)
+         * and do not check for NULL.
+         */
+        if (inode == NULL)
+                return ERR_PTR(-ENOENT);
+        else
+                return inode;
 }
@@ -361,8 +351,10 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
 * @is_root: If 1, ignore the caller's permissions
 * @i_gh: An uninitialized holder for the new inode glock
 *
- * There will always be a vnode (Linux VFS inode) for the d_gh inode unless
+ * This can be called via the VFS filldir function when NFS is doing
- * @is_root is true.
+ * a readdirplus and the inode which it is intending to stat isn't
+ * already in cache. In this case we must not take the directory glock
+ * again, since the readdir call will have already taken that lock.
 *
 * Returns: errno
 */
@@ -375,8 +367,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        struct gfs2_holder d_gh;
        struct gfs2_inum_host inum;
        unsigned int type;
-        int error = 0;
+        int error;
        struct inode *inode = NULL;
+        int unlock = 0;
        if (!name->len || name->len > GFS2_FNAMESIZE)
                return ERR_PTR(-ENAMETOOLONG);
@@ -388,9 +381,12 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
                return dir;
        }
-        error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+        if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) {
-        if (error)
+                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-                return ERR_PTR(error);
+                if (error)
+                        return ERR_PTR(error);
+                unlock = 1;
+        }
        if (!is_root) {
                error = permission(dir, MAY_EXEC, NULL);
@@ -405,10 +401,11 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
        inode = gfs2_inode_lookup(sb, &inum, type);
 out:
-        gfs2_glock_dq_uninit(&d_gh);
+        if (unlock)
+                gfs2_glock_dq_uninit(&d_gh);
        if (error == -ENOENT)
                return NULL;
-        return inode;
+        return inode ? inode : ERR_PTR(error);
 }
 static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
index effe4a337c1d..e30673dd37e0 100644
--- a/fs/gfs2/lm.c
+++ b/fs/gfs2/lm.c
@@ -104,15 +104,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
        vprintk(fmt, args);
        va_end(args);
-        fs_err(sdp, "about to withdraw from the cluster\n");
+        fs_err(sdp, "about to withdraw this file system\n");
        BUG_ON(sdp->sd_args.ar_debug);
-        fs_err(sdp, "waiting for outstanding I/O\n");
-        /* FIXME: suspend dm device so oustanding bio's complete
-           and all further io requests fail */
        fs_err(sdp, "telling LM to withdraw\n");
        gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
        fs_err(sdp, "withdrawn\n");
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 33af707a4d3f..a87c7bf3c568 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
 #define GDLM_STRNAME_BYTES      24
 #define GDLM_LVB_SIZE           32
-#define GDLM_DROP_COUNT         50000
+#define GDLM_DROP_COUNT         200000
 #define GDLM_DROP_PERIOD        60
 #define GDLM_NAME_LEN           128
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index 2194b1d5b5ec..a0e7eda643ed 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,9 +11,6 @@
 #include "lock_dlm.h"
-extern int gdlm_drop_count;
-extern int gdlm_drop_period;
 extern struct lm_lockops gdlm_ops;
 static int __init init_lock_dlm(void)
@@ -40,9 +37,6 @@ static int __init init_lock_dlm(void)
                return error;
        }
-        gdlm_drop_count = GDLM_DROP_COUNT;
-        gdlm_drop_period = GDLM_DROP_PERIOD;
        printk(KERN_INFO
               "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
        return 0;
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index cdd1694e889b..1d8faa3da8af 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -9,8 +9,6 @@
 #include "lock_dlm.h"
-int gdlm_drop_count;
-int gdlm_drop_period;
 const struct lm_lockops gdlm_ops;
@@ -24,8 +22,8 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
        if (!ls)
                return NULL;
-        ls->drop_locks_count = gdlm_drop_count;
+        ls->drop_locks_count = GDLM_DROP_COUNT;
-        ls->drop_locks_period = gdlm_drop_period;
+        ls->drop_locks_period = GDLM_DROP_PERIOD;
        ls->fscb = cb;
        ls->sdp = sdp;
        ls->fsflags = flags;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 29ae06f94944..4746b884662d 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -116,6 +116,17 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
        return sprintf(buf, "%d\n", ls->recover_jid_status);
 }
+static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
+{       /* sysfs read handler: print ls->drop_locks_count in decimal */
+        return sprintf(buf, "%d\n", ls->drop_locks_count);
+}
+static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{       /* sysfs write handler: the parsed number is stored as-is, with
+         * no range or parse-error checking, and len is always returned */
+        ls->drop_locks_count = simple_strtol(buf, NULL, 0);
+        return len;
+}
 struct gdlm_attr {
        struct attribute attr;
        ssize_t (*show)(struct gdlm_ls *, char *);
@@ -135,6 +146,7 @@ GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
 GDLM_ATTR(recover,        0644, recover_show,        recover_store);
 GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
 GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(drop_count,     0644, drop_count_show,     drop_count_store);
 static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_proto_name.attr,
@@ -147,6 +159,7 @@ static struct attribute *gdlm_attrs[] = {
        &gdlm_attr_recover.attr,
        &gdlm_attr_recover_done.attr,
        &gdlm_attr_recover_status.attr,
+        &gdlm_attr_drop_count.attr,
        NULL,
 };
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4d7f94d8c7bd..16bb4b4561ae 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -69,13 +69,16 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
        struct gfs2_trans *tr;
-        if (!list_empty(&bd->bd_list_tr))
+        gfs2_log_lock(sdp);
+        if (!list_empty(&bd->bd_list_tr)) {
+                gfs2_log_unlock(sdp);
                return;
+        }
        tr = current->journal_info;
        tr->tr_touched = 1;
        tr->tr_num_buf++;
        list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+        gfs2_log_unlock(sdp);
        if (!list_empty(&le->le_list))
                return;
@@ -84,7 +87,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        gfs2_meta_check(sdp, bd->bd_bh);
        gfs2_pin(sdp, bd->bd_bh);
        gfs2_log_lock(sdp);
        sdp->sd_log_num_buf++;
        list_add(&le->le_list, &sdp->sd_log_le_buf);
@@ -98,11 +100,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        struct list_head *head = &tr->tr_list_buf;
        struct gfs2_bufdata *bd;
+        gfs2_log_lock(sdp);
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
                list_del_init(&bd->bd_list_tr);
                tr->tr_num_buf--;
        }
+        gfs2_log_unlock(sdp);
        gfs2_assert_warn(sdp, !tr->tr_num_buf);
 }
@@ -462,13 +466,17 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        struct address_space *mapping = bd->bd_bh->b_page->mapping;
        struct gfs2_inode *ip = GFS2_I(mapping->host);
+        gfs2_log_lock(sdp);
        tr->tr_touched = 1;
        if (list_empty(&bd->bd_list_tr) &&
            (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
                tr->tr_num_buf++;
                list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+                gfs2_log_unlock(sdp);
                gfs2_pin(sdp, bd->bd_bh);
                tr->tr_num_buf_new++;
+        } else {
+                gfs2_log_unlock(sdp);
        }
        gfs2_trans_add_gl(bd->bd_gl);
        gfs2_log_lock(sdp);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index d8d69a72a10d..56e33590b656 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -16,6 +16,7 @@
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/fs.h>
+#include <linux/writeback.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
@@ -157,6 +158,32 @@ out_ignore:
 }
 /**
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: Write-back control
+ *
+ * For journaled files and/or ordered writes this just falls back to the
+ * kernel's default writepages path for now. We will probably want to change
+ * that eventually (i.e. when we look at allocate on flush).
+ *
+ * For the data=writeback case though we can already ignore buffer heads
+ * and write whole extents at once. This is a big reduction in the
+ * number of I/O requests we send and the bmap calls we make in this case.
+ *
+ * Returns: the result of mpage_writepages() when the filesystem is in
+ *          writeback data mode and the inode is not jdata, otherwise
+ *          the result of generic_writepages()
+ */
+static int gfs2_writepages(struct address_space *mapping,
+                           struct writeback_control *wbc)
+{
+        struct inode *inode = mapping->host;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
+                return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+        return generic_writepages(mapping, wbc);
+}
+/**
 * stuffed_readpage - Fill in a Linux page with stuffed file data
 * @ip: the inode
 * @page: the page
@@ -256,7 +283,7 @@ out_unlock:
 *    the page lock and the glock) and return having done no I/O. Its
 *    obviously not something we'd want to do on too regular a basis.
 *    Any I/O we ignore at this time will be done via readpage later.
- * 2. We have to handle stuffed files here too.
+ * 2. We don't handle stuffed files here; we let readpage do the honours.
 * 3. mpage_readpages() does most of the heavy lifting in the common case.
 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
@@ -269,8 +296,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_holder gh;
-        unsigned page_idx;
+        int ret = 0;
-        int ret;
        int do_unlock = 0;
        if (likely(file != &gfs2_internal_file_sentinel)) {
@@ -289,29 +315,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
                        goto out_unlock;
        }
 skip_lock:
-        if (gfs2_is_stuffed(ip)) {
+        if (!gfs2_is_stuffed(ip))
-                struct pagevec lru_pvec;
-                pagevec_init(&lru_pvec, 0);
-                for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-                        struct page *page = list_entry(pages->prev, struct page, lru);
-                        prefetchw(&page->flags);
-                        list_del(&page->lru);
-                        if (!add_to_page_cache(page, mapping,
-                                               page->index, GFP_KERNEL)) {
-                                ret = stuffed_readpage(ip, page);
-                                unlock_page(page);
-                                if (!pagevec_add(&lru_pvec, page))
-                                         __pagevec_lru_add(&lru_pvec);
-                        } else {
-                                page_cache_release(page);
-                        }
-                }
-                pagevec_lru_add(&lru_pvec);
-                ret = 0;
-        } else {
-                /* What we really want to do .... */
                ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-        }
        if (do_unlock) {
                gfs2_glock_dq_m(1, &gh);
@@ -356,8 +361,10 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh);
        error = gfs2_glock_nq_atime(&ip->i_gh);
        if (unlikely(error)) {
-                if (error == GLR_TRYFAILED)
+                if (error == GLR_TRYFAILED) {
+                        unlock_page(page);
                        error = AOP_TRUNCATED_PAGE;
+                }
                goto out_uninit;
        }
@@ -594,6 +601,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset)
        return;
 }
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE (not consulted by any of the checks below)
+ * @offset: The offset at which we are reading or writing
+ *
+ * Direct I/O is declined for journaled-data inodes, for stuffed inodes,
+ * and when @offset is greater than the current inode size.
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+        /*
+         * Should we return an error here? I can't see that O_DIRECT for
+         * a journaled file makes any sense. For now we'll silently fall
+         * back to buffered I/O, likewise we do the same for stuffed
+         * files since they are (a) small and (b) unaligned.
+         * A request starting beyond the inode size gets the same
+         * treatment (see the last check below).
+         */
+        if (gfs2_is_jdata(ip))
+                return 0;
+        if (gfs2_is_stuffed(ip))
+                return 0;
+        if (offset > i_size_read(&ip->i_inode))
+                return 0;
+        return 1;
+}
 static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
@@ -604,42 +641,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
        struct gfs2_holder gh;
        int rv;
-        if (rw == READ)
-                mutex_lock(&inode->i_mutex);
        /*
-         * Shared lock, even if its a write, since we do no allocation
+         * Deferred lock, even if it's a write, since we do no allocation
-         * on this path. All we need change is atime.
+         * on this path. All we need change is atime, and this lock mode
+         * ensures that other nodes have flushed their buffered read caches
+         * (i.e. their page cache entries for this inode). We do not,
+         * unfortunately, have the option of only flushing a range like
+         * the VFS does.
         */
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
        rv = gfs2_glock_nq_atime(&gh);
        if (rv)
-                goto out;
+                return rv;
+        rv = gfs2_ok_for_dio(ip, rw, offset);
-        if (offset > i_size_read(inode))
+        if (rv != 1)
-                goto out;
+                goto out; /* dio not valid, fall back to buffered i/o */
-        /*
+        rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-         * Should we return an error here? I can't see that O_DIRECT for
+                                           iov, offset, nr_segs,
-         * a journaled file makes any sense. For now we'll silently fall
+                                           gfs2_get_block_direct, NULL);
-         * back to buffered I/O, likewise we do the same for stuffed
-         * files since they are (a) small and (b) unaligned.
-         */
-        if (gfs2_is_jdata(ip))
-                goto out;
-        if (gfs2_is_stuffed(ip))
-                goto out;
-        rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                                            inode->i_sb->s_bdev,
-                                            iov, offset, nr_segs,
-                                            gfs2_get_block_direct, NULL);
 out:
        gfs2_glock_dq_m(1, &gh);
        gfs2_holder_uninit(&gh);
-        if (rw == READ)
-                mutex_unlock(&inode->i_mutex);
        return rv;
 }
@@ -763,6 +786,7 @@ out:
 const struct address_space_operations gfs2_file_aops = {
        .writepage = gfs2_writepage,
+        .writepages = gfs2_writepages,
        .readpage = gfs2_readpage,
        .readpages = gfs2_readpages,
        .sync_page = block_sync_page,
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index d355899585d8..9187eb174b43 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -46,6 +46,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        struct gfs2_inum_host inum;
        unsigned int type;
        int error;
+        int had_lock=0;
        if (inode && is_bad_inode(inode))
                goto invalid;
@@ -53,9 +54,12 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        if (sdp->sd_args.ar_localcaching)
                goto valid;
-        error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+        had_lock = gfs2_glock_is_locked_by_me(dip->i_gl);
-        if (error)
+        if (!had_lock) {
-                goto fail;
+                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+                if (error)
+                        goto fail;
+        } 
        error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
        switch (error) {
@@ -82,13 +86,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        }
 valid_gunlock:
-        gfs2_glock_dq_uninit(&d_gh);
+        if (!had_lock)
+                gfs2_glock_dq_uninit(&d_gh);
 valid:
        dput(parent);
        return 1;
 invalid_gunlock:
-        gfs2_glock_dq_uninit(&d_gh);
+        if (!had_lock)
+                gfs2_glock_dq_uninit(&d_gh);
 invalid:
        if (inode && S_ISDIR(inode->i_mode)) {
                if (have_submounts(dentry))
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index b4e7b8775315..4855e8cca622 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,6 +22,7 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
+#include "ops_dentry.h"
 #include "ops_export.h"
 #include "rgrp.h"
 #include "util.h"
@@ -112,13 +113,12 @@ struct get_name_filldir {
        char *name;
 };
-static int get_name_filldir(void *opaque, const char *name, unsigned int length,
+static int get_name_filldir(void *opaque, const char *name, int length,
-                            u64 offset, struct gfs2_inum_host *inum,
+                            loff_t offset, u64 inum, unsigned int type)
-                            unsigned int type)
 {
-        struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
+        struct get_name_filldir *gnfd = opaque;
-        if (!gfs2_inum_equal(inum, &gnfd->inum))
+        if (inum != gnfd->inum.no_addr)
                return 0;
        memcpy(gnfd->name, name, length);
@@ -189,6 +189,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
                return ERR_PTR(-ENOMEM);
        }
+        dentry->d_op = &gfs2_dops;
        return dentry;
 }
@@ -215,8 +216,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
        }
        error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
-                                  LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
+                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-                                  &i_gh);
        if (error)
                return ERR_PTR(error);
@@ -269,6 +269,7 @@ out_inode:
                return ERR_PTR(-ENOMEM);
        }
+        dentry->d_op = &gfs2_dops;
        return dentry;
 fail_rgd:
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index faa07e4b97d0..c996aa739a05 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -43,15 +43,6 @@
 #include "util.h"
 #include "eaops.h"
-/* For regular, non-NFS */
-struct filldir_reg {
-        struct gfs2_sbd *fdr_sbd;
-        int fdr_prefetch;
-        filldir_t fdr_filldir;
-        void *fdr_opaque;
-};
 /*
 * Most fields left uninitialised to catch anybody who tries to
 * use them. f_flags set to prevent file_accessed() from touching
@@ -128,41 +119,6 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
 }
 /**
- * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
- * @opaque: opaque data used by the function
- * @name: the name of the directory entry
- * @length: the length of the name
- * @offset: the entry's offset in the directory
- * @inum: the inode number the entry points to
- * @type: the type of inode the entry points to
- *
- * Returns: 0 on success, 1 if buffer full
- */
-static int filldir_func(void *opaque, const char *name, unsigned int length,
-                        u64 offset, struct gfs2_inum_host *inum,
-                        unsigned int type)
-{
-        struct filldir_reg *fdr = (struct filldir_reg *)opaque;
-        struct gfs2_sbd *sdp = fdr->fdr_sbd;
-        int error;
-        error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
-                                 inum->no_addr, type);
-        if (error)
-                return 1;
-        if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
-                gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
-                                       LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
-                gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
-                                       LM_ST_SHARED, LM_FLAG_TRY);
-        }
-        return 0;
-}
-/**
 * gfs2_readdir - Read directory entries from a directory
 * @file: The directory to read from
 * @dirent: Buffer for dirents
@@ -175,16 +131,10 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
 {
        struct inode *dir = file->f_mapping->host;
        struct gfs2_inode *dip = GFS2_I(dir);
-        struct filldir_reg fdr;
        struct gfs2_holder d_gh;
        u64 offset = file->f_pos;
        int error;
-        fdr.fdr_sbd = GFS2_SB(dir);
-        fdr.fdr_prefetch = 1;
-        fdr.fdr_filldir = filldir;
-        fdr.fdr_opaque = dirent;
        gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
        error = gfs2_glock_nq_atime(&d_gh);
        if (error) {
@@ -192,7 +142,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
                return error;
        }
-        error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
+        error = gfs2_dir_read(dir, &offset, dirent, filldir);
        gfs2_glock_dq_uninit(&d_gh);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 636dda4c7d38..f40a84807d75 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -264,13 +264,23 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
        struct gfs2_inode *dip = GFS2_I(dir);
        struct gfs2_sbd *sdp = GFS2_SB(dir);
        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-        struct gfs2_holder ghs[2];
+        struct gfs2_holder ghs[3];
+        struct gfs2_rgrpd *rgd;
+        struct gfs2_holder ri_gh;
        int error;
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                return error;
        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
-        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+        gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1);
-        error = gfs2_glock_nq_m(2, ghs);
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+        gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
+        error = gfs2_glock_nq_m(3, ghs);
        if (error)
                goto out;
@@ -291,10 +301,12 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 out_end_trans:
        gfs2_trans_end(sdp);
 out_gunlock:
-        gfs2_glock_dq_m(2, ghs);
+        gfs2_glock_dq_m(3, ghs);
 out:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
+        gfs2_holder_uninit(ghs + 2);
+        gfs2_glock_dq_uninit(&ri_gh);
        return error;
 }
@@ -449,13 +461,22 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        struct gfs2_inode *dip = GFS2_I(dir);
        struct gfs2_sbd *sdp = GFS2_SB(dir);
        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-        struct gfs2_holder ghs[2];
+        struct gfs2_holder ghs[3];
+        struct gfs2_rgrpd *rgd;
+        struct gfs2_holder ri_gh;
        int error;
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                return error;
        gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-        error = gfs2_glock_nq_m(2, ghs);
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+        gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
+        error = gfs2_glock_nq_m(3, ghs);
        if (error)
                goto out;
@@ -483,10 +504,12 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        gfs2_trans_end(sdp);
 out_gunlock:
-        gfs2_glock_dq_m(2, ghs);
+        gfs2_glock_dq_m(3, ghs);
 out:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
+        gfs2_holder_uninit(ghs + 2);
+        gfs2_glock_dq_uninit(&ri_gh);
        return error;
 }
@@ -547,7 +570,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
        struct gfs2_inode *nip = NULL;
        struct gfs2_sbd *sdp = GFS2_SB(odir);
-        struct gfs2_holder ghs[4], r_gh;
+        struct gfs2_holder ghs[5], r_gh;
+        struct gfs2_rgrpd *nrgd;
        unsigned int num_gh;
        int dir_rename = 0;
        int alloc_required;
@@ -587,6 +611,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        if (nip) {
                gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
                num_gh++;
+                /* Grab the resource group lock for unlink flag twiddling.
+                 * This is the case of the target file already existing,
+                 * so we unlink before doing the rename.
+                 */
+                nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr);
+                if (nrgd)
+                        gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
        }
        error = gfs2_glock_nq_m(num_gh, ghs);
@@ -684,12 +715,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
                                         al->al_rgd->rd_ri.ri_length +
                                         4 * RES_DINODE + 4 * RES_LEAF +
-                                         RES_STATFS + RES_QUOTA, 0);
+                                         RES_STATFS + RES_QUOTA + 4, 0);
                if (error)
                        goto out_ipreserv;
        } else {
                error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
-                                         5 * RES_LEAF, 0);
+                                         5 * RES_LEAF + 4, 0);
                if (error)
                        goto out_gunlock;
        }
@@ -728,7 +759,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                error = gfs2_meta_inode_buffer(ip, &dibh);
                if (error)
                        goto out_end_trans;
-                ip->i_inode.i_ctime.tv_sec = get_seconds();
+                ip->i_inode.i_ctime = CURRENT_TIME_SEC;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
                brelse(dibh);
@@ -1018,7 +1049,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
        }
        generic_fillattr(inode, stat);
-        if (unlock);
+        if (unlock)
                gfs2_glock_dq_uninit(&gh);
        return 0;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 7685b46f934b..47369d011214 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
        struct gfs2_sbd *sdp = sb->s_fs_info;
        int error;
+        if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return;
        for (;;) {
                error = gfs2_freeze_fs(sdp);
                if (!error)
@@ -426,6 +429,12 @@ static void gfs2_delete_inode(struct inode *inode)
        }
        error = gfs2_dinode_dealloc(ip);
+        /*
+         * Must do this before unlock to avoid trying to write back
+         * potentially dirty data now that inode no longer exists
+         * on disk.
+         */
+        truncate_inode_pages(&inode->i_data, 0);
 out_unlock:
        gfs2_glock_dq(&ip->i_iopen_gh);
@@ -443,14 +452,12 @@ out:
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
 {
-        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_inode *ip;
        ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
        if (ip) {
                ip->i_flags = 0;
                ip->i_gl = NULL;
-                ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
                ip->i_last_pfault = jiffies;
        }
        return &ip->i_inode;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index 45a5f11fc39a..14b380fb0602 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -28,34 +28,13 @@
 #include "trans.h"
 #include "util.h"
-static void pfault_be_greedy(struct gfs2_inode *ip)
-{
-        unsigned int time;
-        spin_lock(&ip->i_spin);
-        time = ip->i_greedy;
-        ip->i_last_pfault = jiffies;
-        spin_unlock(&ip->i_spin);
-        igrab(&ip->i_inode);
-        if (gfs2_glock_be_greedy(ip->i_gl, time))
-                iput(&ip->i_inode);
-}
 static struct page *gfs2_private_nopage(struct vm_area_struct *area,
                                        unsigned long address, int *type)
 {
        struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
-        struct page *result;
        set_bit(GIF_PAGED, &ip->i_flags);
+        return filemap_nopage(area, address, type);
-        result = filemap_nopage(area, address, type);
-        if (result && result != NOPAGE_OOM)
-                pfault_be_greedy(ip);
-        return result;
 }
 static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -167,7 +146,6 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
                set_page_dirty(result);
        }
-        pfault_be_greedy(ip);
 out:
        gfs2_glock_dq_uninit(&i_gh);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 43a24f2e5905..70f424fcf1cd 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -71,17 +71,12 @@ void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_atime_quantum = 3600;
        gt->gt_new_files_jdata = 0;
        gt->gt_new_files_directio = 0;
-        gt->gt_max_atomic_write = 4 << 20;
        gt->gt_max_readahead = 1 << 18;
        gt->gt_lockdump_size = 131072;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
        gt->gt_reclaim_limit = 5000;
        gt->gt_entries_per_readdir = 32;
-        gt->gt_prefetch_secs = 10;
-        gt->gt_greedy_default = HZ / 10;
-        gt->gt_greedy_quantum = HZ / 40;
-        gt->gt_greedy_max = HZ / 4;
        gt->gt_statfs_quantum = 30;
        gt->gt_statfs_slow = 0;
 }
@@ -359,8 +354,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
        mutex_lock(&sdp->sd_jindex_mutex);
        for (;;) {
-                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
+                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
-                                           GL_LOCAL_EXCL, ji_gh);
                if (error)
                        break;
@@ -529,8 +523,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
        struct gfs2_log_header_host head;
        int error;
-        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
-                                   GL_LOCAL_EXCL, &t_gh);
        if (error)
                return error;
@@ -583,9 +576,8 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
        gfs2_quota_sync(sdp);
        gfs2_statfs_sync(sdp);
-        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-                                GL_LOCAL_EXCL | GL_NOCACHE,
+                                   &t_gh);
-                                &t_gh);
        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
                return error;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 983eaf1e06be..d01f9f0fda26 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -436,17 +436,12 @@ TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(reclaim_limit, 0);
-TUNE_ATTR(prefetch_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(new_files_directio, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(quota_cache_secs, 1);
-TUNE_ATTR(max_atomic_write, 1);
 TUNE_ATTR(stall_secs, 1);
-TUNE_ATTR(greedy_default, 1);
-TUNE_ATTR(greedy_quantum, 1);
-TUNE_ATTR(greedy_max, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(scand_secs, scand_process);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -465,15 +460,10 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_max_readahead.attr,
        &tune_attr_complain_secs.attr,
        &tune_attr_reclaim_limit.attr,
-        &tune_attr_prefetch_secs.attr,
        &tune_attr_statfs_slow.attr,
        &tune_attr_quota_simul_sync.attr,
        &tune_attr_quota_cache_secs.attr,
-        &tune_attr_max_atomic_write.attr,
        &tune_attr_stall_secs.attr,
-        &tune_attr_greedy_default.attr,
-        &tune_attr_greedy_quantum.attr,
-        &tune_attr_greedy_max.attr,
        &tune_attr_statfs_quantum.attr,
        &tune_attr_scand_secs.attr,
        &tune_attr_recoverd_secs.attr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f5719117edfe..e285022f006c 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
         * Take appropriate lock on inode
         */
        if (create)
-                IWRITE_LOCK(ip);
+                IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
        else
-                IREAD_LOCK(ip);
+                IREAD_LOCK(ip, RDWRLOCK_NORMAL);
        if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
            (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
@@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip)
        nobh_truncate_page(ip->i_mapping, ip->i_size);
-        IWRITE_LOCK(ip);
+        IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
        jfs_truncate_nolock(ip, ip->i_size);
        IWRITE_UNLOCK(ip);
 }
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index ddffbbd4d955..7378798f0b21 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -39,10 +39,6 @@ extern void jfs_proc_clean(void);
 /*
 *      assert with traditional printf/panic
 */
-#ifdef CONFIG_KERNEL_ASSERTS
-/* kgdb stuff */
-#define assert(p) KERNEL_ASSERT(#p, p)
-#else
 #define assert(p) do {  \
        if (!(p)) {     \
                printk(KERN_CRIT "BUG at %s:%d assert(%s)\n",   \
@@ -50,7 +46,6 @@ extern void jfs_proc_clean(void);
                BUG();  \
        }               \
 } while (0)
-#endif
 /*
 *      debug ON
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 23546c8fd48b..82b0544bd76d 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
-        IREAD_LOCK(ipbmap);
+        IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
        /* block to be freed better be within the mapsize. */
        if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
@@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
         * allocation group size, try to allocate anywhere.
         */
        if (l2nb > bmp->db_agl2size) {
-                IWRITE_LOCK(ipbmap);
+                IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
                rc = dbAllocAny(bmp, nblocks, l2nb, results);
@@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
         * the hint using a tiered strategy.
         */
        if (nblocks <= BPERDMAP) {
-                IREAD_LOCK(ipbmap);
+                IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
                /* get the buffer for the dmap containing the hint.
                 */
@@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
        /* try to satisfy the allocation request with blocks within
         * the same allocation group as the hint.
         */
-        IWRITE_LOCK(ipbmap);
+        IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
        if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
                goto write_unlock;
@@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
         * Let dbNextAG recommend a preferred allocation group
         */
        agno = dbNextAG(ipbmap);
-        IWRITE_LOCK(ipbmap);
+        IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
        /* Try to allocate within this allocation group.  if that fails, try to
         * allocate anywhere in the map.
@@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
        s64 lblkno;
        struct metapage *mp;
-        IREAD_LOCK(ipbmap);
+        IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
        /*
         * validate extent request:
@@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
         */
        extblkno = lastblkno + 1;
-        IREAD_LOCK(ipbmap);
+        IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
        /* better be within the file system */
        bmp = sbi->bmap;
@@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
        struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
        struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
-        IREAD_LOCK(ipbmap);
+        IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
        /* block to be allocated better be within the mapsize. */
        ASSERT(nblocks <= bmp->db_mapsize - blkno);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 53f63b47a6d3..aa5124b643b1 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -331,7 +331,7 @@ int diRead(struct inode *ip)
        /* read the iag */
        imap = JFS_IP(ipimap)->i_imap;
-        IREAD_LOCK(ipimap);
+        IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        rc = diIAGRead(imap, iagno, &mp);
        IREAD_UNLOCK(ipimap);
        if (rc) {
@@ -920,7 +920,7 @@ int diFree(struct inode *ip)
        /* Obtain read lock in imap inode.  Don't release it until we have
         * read all of the IAG's that we are going to.
         */
-        IREAD_LOCK(ipimap);
+        IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        /* read the iag.
         */
@@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
        AG_LOCK(imap, agno);
        /* Get read lock on imap inode */
-        IREAD_LOCK(ipimap);
+        IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        /* get the iag number and read the iag */
        iagno = INOTOIAG(inum);
@@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
                return -ENOSPC;
        /* obtain read lock on imap inode */
-        IREAD_LOCK(imap->im_ipimap);
+        IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
        /* read the iag at the head of the list.
         */
@@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
        } else {
                /* read the iag.
                 */
-                IREAD_LOCK(imap->im_ipimap);
+                IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
                if ((rc = diIAGRead(imap, iagno, &mp))) {
                        IREAD_UNLOCK(imap->im_ipimap);
                        jfs_error(ip->i_sb, "diAllocExt: error reading iag");
@@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
                 */
                /* acquire inode map lock */
-                IWRITE_LOCK(ipimap);
+                IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
                if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
                        IWRITE_UNLOCK(ipimap);
@@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
        }
        /* obtain read lock on map */
-        IREAD_LOCK(ipimap);
+        IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        /* read the iag */
        if ((rc = diIAGRead(imap, iagno, &mp))) {
@@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap,
                return -EIO;
        }
        /* read the iag */
-        IREAD_LOCK(ipimap);
+        IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
        rc = diIAGRead(imap, iagno, &mp);
        IREAD_UNLOCK(ipimap);
        if (rc)
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 94005584445a..8f453eff3c83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -109,9 +109,11 @@ struct jfs_inode_info {
 #define JFS_ACL_NOT_CACHED ((void *)-1)
-#define IREAD_LOCK(ip)          down_read(&JFS_IP(ip)->rdwrlock)
+#define IREAD_LOCK(ip, subclass) \
+        down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip)        up_read(&JFS_IP(ip)->rdwrlock)
-#define IWRITE_LOCK(ip)         down_write(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip, subclass) \
+        down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IWRITE_UNLOCK(ip)       up_write(&JFS_IP(ip)->rdwrlock)
 /*
@@ -127,6 +129,29 @@ enum cflags {
        COMMIT_Synclist,        /* metadata pages on group commit synclist */
 };
+/*
+ * commit_mutex nesting subclasses:
+ */
+enum commit_mutex_class
+{
+        COMMIT_MUTEX_PARENT,
+        COMMIT_MUTEX_CHILD,
+        COMMIT_MUTEX_SECOND_PARENT,     /* Renaming */
+        COMMIT_MUTEX_VICTIM             /* Inode being unlinked due to rename */
+};
+/*
+ * rdwrlock subclasses:
+ * The dmap inode may be locked while a normal inode or the imap inode is
+ * locked.
+ */
+enum rdwrlock_class
+{
+        RDWRLOCK_NORMAL,
+        RDWRLOCK_IMAP,
+        RDWRLOCK_DMAP
+};
 #define set_cflag(flag, ip)     set_bit(flag, &(JFS_IP(ip)->cflag))
 #define clear_cflag(flag, ip)   clear_bit(flag, &(JFS_IP(ip)->cflag))
 #define test_cflag(flag, ip)    test_bit(flag, &(JFS_IP(ip)->cflag))
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index 7d78e83d7c40..df48ece4b7a3 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -42,7 +42,7 @@ do {							\
                if (cond)                               \
                        break;                          \
                unlock_cmd;                             \
-                schedule();                             \
+                io_schedule();                          \
                lock_cmd;                               \
        }                                               \
        current->state = TASK_RUNNING;                  \
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index ceaf03b94935..58deae007507 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp)
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (metapage_locked(mp)) {
                        unlock_page(mp->page);
-                        schedule();
+                        io_schedule();
                        lock_page(mp->page);
                }
        } while (trylock_metapage(mp));
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d558e51b0df8..6988a1082f58 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
        add_wait_queue(event, &wait);
        set_current_state(TASK_UNINTERRUPTIBLE);
        TXN_UNLOCK();
-        schedule();
+        io_schedule();
        current->state = TASK_RUNNING;
        remove_wait_queue(event, &wait);
 }
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index e98eb03e5310..acc97c46d8a4 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff,	s64 *nextp,
                        nsplit = 0;
                /* push (bn, index) of the parent page/entry */
+                if (BT_STACK_FULL(btstack)) {
+                        jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+                        XT_PUTPAGE(mp);
+                        return -EIO;
+                }
                BT_PUSH(btstack, bn, index);
                /* get the child page block number */
@@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
         */
      getChild:
        /* save current parent entry for the child page */
+        if (BT_STACK_FULL(&btstack)) {
+                jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+                XT_PUTPAGE(mp);
+                return -EIO;
+        }
        BT_PUSH(&btstack, bn, index);
        /* get child page */
@@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
         */
      getChild:
        /* save current parent entry for the child page */
+        if (BT_STACK_FULL(&btstack)) {
+                jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+                XT_PUTPAGE(mp);
+                return -EIO;
+        }
        BT_PUSH(&btstack, bn, index);
        /* get child page */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a6a8c16c872c..7ab47561b68d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
        tid = txBegin(dip->i_sb, 0);
-        mutex_lock(&JFS_IP(dip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        rc = jfs_init_acl(tid, ip, dip);
        if (rc)
@@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
        tid = txBegin(dip->i_sb, 0);
-        mutex_lock(&JFS_IP(dip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        rc = jfs_init_acl(tid, ip, dip);
        if (rc)
@@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
        tid = txBegin(dip->i_sb, 0);
-        mutex_lock(&JFS_IP(dip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        iplist[0] = dip;
        iplist[1] = ip;
@@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
        if ((rc = get_UCSname(&dname, dentry)))
                goto out;
-        IWRITE_LOCK(ip);
+        IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
        tid = txBegin(dip->i_sb, 0);
-        mutex_lock(&JFS_IP(dip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        iplist[0] = dip;
        iplist[1] = ip;
@@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry,
        tid = txBegin(ip->i_sb, 0);
-        mutex_lock(&JFS_IP(dir)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        /*
         * scan parent directory for entry/freespace
@@ -913,8 +913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
        tid = txBegin(dip->i_sb, 0);
-        mutex_lock(&JFS_IP(dip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        rc = jfs_init_security(tid, ip, dip);
        if (rc)
@@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        goto out3;
                }
        } else if (new_ip) {
-                IWRITE_LOCK(new_ip);
+                IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
                /* Init inode for quota operations. */
                DQUOT_INIT(new_ip);
        }
@@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         */
        tid = txBegin(new_dir->i_sb, 0);
-        mutex_lock(&JFS_IP(new_dir)->commit_mutex);
+        /*
-        mutex_lock(&JFS_IP(old_ip)->commit_mutex);
+         * How do we know the locking is safe from deadlocks?
+         * The vfs does the hard part for us.  Any time we are taking nested
+         * commit_mutexes, the vfs already has i_mutex held on the parent.
+         * Here, the vfs has already taken i_mutex on both old_dir and new_dir.
+         */
+        mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+        mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        if (old_dir != new_dir)
-                mutex_lock(&JFS_IP(old_dir)->commit_mutex);
+                mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex,
+                                  COMMIT_MUTEX_SECOND_PARENT);
        if (new_ip) {
-                mutex_lock(&JFS_IP(new_ip)->commit_mutex);
+                mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex,
+                                  COMMIT_MUTEX_VICTIM);
                /*
                 * Change existing directory entry to new inode number
                 */
@@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
        tid = txBegin(dir->i_sb, 0);
-        mutex_lock(&JFS_IP(dir)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
-        mutex_lock(&JFS_IP(ip)->commit_mutex);
+        mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
        rc = jfs_init_acl(tid, ip, dir);
        if (rc)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 277ca67a2ad6..5a9779bb9236 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
        flush_scheduled_work();
 }
-static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
-                                      unsigned int num_ios)
 {
-        atomic_set(&wc->wc_num_reqs, num_ios);
+        atomic_set(&wc->wc_num_reqs, 1);
        init_completion(&wc->wc_io_complete);
        wc->wc_error = 0;
 }
@@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
        struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
        blk_run_address_space(mapping);
+        o2hb_bio_wait_dec(wc, 1);
        wait_for_completion(&wc->wc_io_complete);
 }
@@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio,
                return 1;
        o2hb_bio_wait_dec(wc, 1);
+        bio_put(bio);
        return 0;
 }
@@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio,
 * start_slot. */
 static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
                                      struct o2hb_bio_wait_ctxt *wc,
-                                      unsigned int start_slot,
+                                      unsigned int *current_slot,
-                                      unsigned int num_slots)
+                                      unsigned int max_slots)
 {
-        int i, nr_vecs, len, first_page, last_page;
+        int len, current_page;
        unsigned int vec_len, vec_start;
        unsigned int bits = reg->hr_block_bits;
        unsigned int spp = reg->hr_slots_per_page;
+        unsigned int cs = *current_slot;
        struct bio *bio;
        struct page *page;
-        nr_vecs = (num_slots + spp - 1) / spp;
        /* Testing has shown this allocation to take long enough under
         * GFP_KERNEL that the local node can get fenced. It would be
         * nicest if we could pre-allocate these bios and avoid this
         * all together. */
-        bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+        bio = bio_alloc(GFP_ATOMIC, 16);
        if (!bio) {
                mlog(ML_ERROR, "Could not alloc slots BIO!\n");
                bio = ERR_PTR(-ENOMEM);
@@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
        }
        /* Must put everything in 512 byte sectors for the bio... */
-        bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+        bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
        bio->bi_bdev = reg->hr_bdev;
        bio->bi_private = wc;
        bio->bi_end_io = o2hb_bio_end_io;
-        first_page = start_slot / spp;
+        vec_start = (cs << bits) % PAGE_CACHE_SIZE;
-        last_page = first_page + nr_vecs;
+        while(cs < max_slots) {
-        vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
+                current_page = cs / spp;
-        for(i = first_page; i < last_page; i++) {
+                page = reg->hr_slot_data[current_page];
-                page = reg->hr_slot_data[i];
-                vec_len = PAGE_CACHE_SIZE;
+                vec_len = min(PAGE_CACHE_SIZE,
-                /* last page might be short */
+                              (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
-                if (((i + 1) * spp) > (start_slot + num_slots))
-                        vec_len = ((num_slots + start_slot) % spp) << bits;
-                vec_len -=  vec_start;
                mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
-                     i, vec_len, vec_start);
+                     current_page, vec_len, vec_start);
                len = bio_add_page(bio, page, vec_len, vec_start);
-                if (len != vec_len) {
+                if (len != vec_len) break;
-                        bio_put(bio);
-                        bio = ERR_PTR(-EIO);
-                        mlog(ML_ERROR, "Error adding page to bio i = %d, "
-                             "vec_len = %u, len = %d\n, start = %u\n",
-                             i, vec_len, len, vec_start);
-                        goto bail;
-                }
+                cs += vec_len / (PAGE_CACHE_SIZE/spp);
                vec_start = 0;
        }
 bail:
+        *current_slot = cs;
        return bio;
 }
-/*
- * Compute the maximum number of sectors the bdev can handle in one bio,
- * as a power of two.
- *
- * Stolen from oracleasm, thanks Joel!
- */
-static int compute_max_sectors(struct block_device *bdev)
-{
-        int max_pages, max_sectors, pow_two_sectors;
-        struct request_queue *q;
-        q = bdev_get_queue(bdev);
-        max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
-        if (max_pages > BIO_MAX_PAGES)
-                max_pages = BIO_MAX_PAGES;
-        if (max_pages > q->max_phys_segments)
-                max_pages = q->max_phys_segments;
-        if (max_pages > q->max_hw_segments)
-                max_pages = q->max_hw_segments;
-        max_pages--; /* Handle I/Os that straddle a page */
-        if (max_pages) {
-                max_sectors = max_pages << (PAGE_SHIFT - 9);
-        } else {
-                /* If BIO contains 1 or less than 1 page. */
-                max_sectors = q->max_sectors;
-        }
-        /* Why is fls() 1-based???? */
-        pow_two_sectors = 1 << (fls(max_sectors) - 1);
-        return pow_two_sectors;
-}
-static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
-                                               unsigned int num_slots,
-                                               unsigned int *num_bios,
-                                               unsigned int *slots_per_bio)
-{
-        unsigned int max_sectors, io_sectors;
-        max_sectors = compute_max_sectors(reg->hr_bdev);
-        io_sectors = num_slots << (reg->hr_block_bits - 9);
-        *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
-        *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
-        mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
-             "device can handle %u sectors of I/O\n", io_sectors, num_slots,
-             max_sectors);
-        mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
-             *num_bios, *slots_per_bio);
-}
 static int o2hb_read_slots(struct o2hb_region *reg,
                           unsigned int max_slots)
 {
-        unsigned int num_bios, slots_per_bio, start_slot, num_slots;
+        unsigned int current_slot=0;
-        int i, status;
+        int status;
        struct o2hb_bio_wait_ctxt wc;
-        struct bio **bios;
        struct bio *bio;
-        o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+        o2hb_bio_wait_init(&wc);
-        bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
+        while(current_slot < max_slots) {
-        if (!bios) {
+                bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
-                status = -ENOMEM;
-                mlog_errno(status);
-                return status;
-        }
-        o2hb_bio_wait_init(&wc, num_bios);
-        num_slots = slots_per_bio;
-        for(i = 0; i < num_bios; i++) {
-                start_slot = i * slots_per_bio;
-                /* adjust num_slots at last bio */
-                if (max_slots < (start_slot + num_slots))
-                        num_slots = max_slots - start_slot;
-                bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
                if (IS_ERR(bio)) {
-                        o2hb_bio_wait_dec(&wc, num_bios - i);
                        status = PTR_ERR(bio);
                        mlog_errno(status);
                        goto bail_and_wait;
                }
-                bios[i] = bio;
+                atomic_inc(&wc.wc_num_reqs);
                submit_bio(READ, bio);
        }
@@ -403,38 +319,30 @@ bail_and_wait:
        if (wc.wc_error && !status)
                status = wc.wc_error;
-        if (bios) {
-                for(i = 0; i < num_bios; i++)
-                        if (bios[i])
-                                bio_put(bios[i]);
-                kfree(bios);
-        }
        return status;
 }
 static int o2hb_issue_node_write(struct o2hb_region *reg,
-                                 struct bio **write_bio,
                                 struct o2hb_bio_wait_ctxt *write_wc)
 {
        int status;
        unsigned int slot;
        struct bio *bio;
-        o2hb_bio_wait_init(write_wc, 1);
+        o2hb_bio_wait_init(write_wc);
        slot = o2nm_this_node();
-        bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+        bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
        if (IS_ERR(bio)) {
                status = PTR_ERR(bio);
                mlog_errno(status);
                goto bail;
        }
+        atomic_inc(&write_wc->wc_num_reqs);
        submit_bio(WRITE, bio);
-        *write_bio = bio;
        status = 0;
 bail:
        return status;
@@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
        int i, ret, highest_node, change = 0;
        unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
-        struct bio *write_bio;
        struct o2hb_bio_wait_ctxt write_wc;
        ret = o2nm_configured_node_map(configured_nodes,
@@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
        /* And fire off the write. Note that we don't wait on this I/O
         * until later. */
-        ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+        ret = o2hb_issue_node_write(reg, &write_wc);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
         * people we find in our steady state have seen us.
         */
        o2hb_wait_on_io(reg, &write_wc);
-        bio_put(write_bio);
        if (write_wc.wc_error) {
                /* Do not re-arm the write timeout on I/O error - we
                 * can't be sure that the new block ever made it to
@@ -943,7 +849,6 @@ static int o2hb_thread(void *data)
 {
        int i, ret;
        struct o2hb_region *reg = data;
-        struct bio *write_bio;
        struct o2hb_bio_wait_ctxt write_wc;
        struct timeval before_hb, after_hb;
        unsigned int elapsed_msec;
@@ -993,10 +898,9 @@ static int o2hb_thread(void *data)
         *
         * XXX: Should we skip this on unclean_stop? */
        o2hb_prepare_block(reg, 0);
-        ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+        ret = o2hb_issue_node_write(reg, &write_wc);
        if (ret == 0) {
                o2hb_wait_on_io(reg, &write_wc);
-                bio_put(write_bio);
        } else {
                mlog_errno(ret);
        }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ae4ff4a6636b..1718215fc018 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -556,6 +556,8 @@ static void o2net_register_callbacks(struct sock *sk,
        sk->sk_data_ready = o2net_data_ready;
        sk->sk_state_change = o2net_state_change;
+        mutex_init(&sc->sc_send_lock);
        write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -688,6 +690,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh)
 * be given to the handler if their payload is longer than the max. */
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
                           o2net_msg_handler_func *func, void *data,
+                           o2net_post_msg_handler_func *post_func,
                           struct list_head *unreg_list)
 {
        struct o2net_msg_handler *nmh = NULL;
@@ -722,6 +725,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
        nmh->nh_func = func;
        nmh->nh_func_data = data;
+        nmh->nh_post_func = post_func;
        nmh->nh_msg_type = msg_type;
        nmh->nh_max_len = max_len;
        nmh->nh_key = key;
@@ -856,10 +860,12 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
        ssize_t ret;
+        mutex_lock(&sc->sc_send_lock);
        ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
                                         virt_to_page(kmalloced_virt),
                                         (long)kmalloced_virt & ~PAGE_MASK,
                                         size, MSG_DONTWAIT);
+        mutex_unlock(&sc->sc_send_lock);
        if (ret != size) {
                mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
                     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
@@ -974,8 +980,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        /* finally, convert the message header to network byte-order
         * and send */
+        mutex_lock(&sc->sc_send_lock);
        ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
                                 sizeof(struct o2net_msg) + caller_bytes);
+        mutex_unlock(&sc->sc_send_lock);
        msglog(msg, "sending returned %d\n", ret);
        if (ret < 0) {
                mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
@@ -1049,6 +1057,7 @@ static int o2net_process_message(struct o2net_sock_container *sc,
        int ret = 0, handler_status;
        enum  o2net_system_error syserr;
        struct o2net_msg_handler *nmh = NULL;
+        void *ret_data = NULL;
        msglog(hdr, "processing message\n");
@@ -1101,17 +1110,26 @@ static int o2net_process_message(struct o2net_sock_container *sc,
        sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
        handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
                                             be16_to_cpu(hdr->data_len),
-                                        nmh->nh_func_data);
+                                        nmh->nh_func_data, &ret_data);
        do_gettimeofday(&sc->sc_tv_func_stop);
 out_respond:
        /* this destroys the hdr, so don't use it after this */
+        mutex_lock(&sc->sc_send_lock);
        ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
                                      handler_status);
+        mutex_unlock(&sc->sc_send_lock);
        hdr = NULL;
        mlog(0, "sending handler status %d, syserr %d returned %d\n",
             handler_status, syserr, ret);
+        if (nmh) {
+                BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
+                if (nmh->nh_post_func)
+                        (nmh->nh_post_func)(handler_status, nmh->nh_func_data,
+                                            ret_data);
+        }
 out:
        if (nmh)
                o2net_handler_put(nmh);
@@ -1795,13 +1813,13 @@ out:
        ready(sk, bytes);
 }
-static int o2net_open_listening_sock(__be16 port)
+static int o2net_open_listening_sock(__be32 addr, __be16 port)
 {
        struct socket *sock = NULL;
        int ret;
        struct sockaddr_in sin = {
                .sin_family = PF_INET,
-                .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
+                .sin_addr = { .s_addr = (__force u32)addr },
                .sin_port = (__force u16)port,
        };
@@ -1824,15 +1842,15 @@ static int o2net_open_listening_sock(__be16 port)
        sock->sk->sk_reuse = 1;
        ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
        if (ret < 0) {
-                mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
+                mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, "
-                     ntohs(port), ret);
+                     "ret=%d\n", NIPQUAD(addr), ntohs(port), ret);
                goto out;
        }
        ret = sock->ops->listen(sock, 64);
        if (ret < 0) {
-                mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
+                mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n",
-                     ntohs(port), ret);
+                     NIPQUAD(addr), ntohs(port), ret);
        }
 out:
@@ -1865,7 +1883,8 @@ int o2net_start_listening(struct o2nm_node *node)
                return -ENOMEM; /* ? */
        }
-        ret = o2net_open_listening_sock(node->nd_ipv4_port);
+        ret = o2net_open_listening_sock(node->nd_ipv4_address,
+                                        node->nd_ipv4_port);
        if (ret) {
                destroy_workqueue(o2net_wq);
                o2net_wq = NULL;
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 21a4e43df836..da880fc215f0 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -50,7 +50,10 @@ struct o2net_msg
        __u8  buf[0];
 };
-typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
+                                     void **ret_data); /* message handler; may hand a pointer back through *ret_data */
+typedef void (o2net_post_msg_handler_func)(int status, void *data,
+                                           void *ret_data); /* post handler; receives the handler's status, the registered data and ret_data */
 #define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
@@ -99,6 +102,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
                           o2net_msg_handler_func *func, void *data,
+                           o2net_post_msg_handler_func *post_func,
                           struct list_head *unreg_list);
 void o2net_unregister_handler_list(struct list_head *list);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b700dc9624d1..4dae5df5e467 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
 * locking semantics of the file system using the protocol.  It should 
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 7:
+ *      - DLM join domain includes the live nodemap
+ *
+ * New in version 6:
+ *      - DLM lockres remote refcount fixes.
+ *
 * New in version 5:
 *      - Network timeout checking protocol
 *
@@ -51,7 +57,7 @@
 *      - full 64 bit i_size in the metadata lock lvbs
 *      - introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 5ULL
+#define O2NET_PROTOCOL_VERSION 7ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
@@ -149,6 +155,8 @@ struct o2net_sock_container {
        struct timeval          sc_tv_func_stop;
        u32                     sc_msg_key;
        u16                     sc_msg_type;
+        struct mutex            sc_send_lock;
 };
 struct o2net_msg_handler {
@@ -158,6 +166,8 @@ struct o2net_msg_handler {
        u32                     nh_key;
        o2net_msg_handler_func  *nh_func;
        o2net_msg_handler_func  *nh_func_data;
+        o2net_post_msg_handler_func
+                                *nh_post_func;
        struct kref             nh_kref;
        struct list_head        nh_unregister_item;
 };
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 681046d51393..241cad342a48 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -263,7 +263,8 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
-int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+                          void **ret_data)
 {
        int ret;
        unsigned int locklen;
@@ -311,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
            past->type != DLM_BAST) {
                mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
                     "name=%.*s\n", past->type, 
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
                     locklen, name);
                ret = DLM_IVLOCKID;
                goto leave;
@@ -323,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
                mlog(0, "got %sast for unknown lockres! "
                     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
                     past->type == DLM_AST ? "" : "b",
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
                     locklen, name, locklen);
                ret = DLM_IVLOCKID;
                goto leave;
@@ -369,7 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
        mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
             "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-             dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie),
+             dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
             locklen, name, locklen);
        ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 6b6ff76538c5..e90b92f9ece1 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -180,6 +180,11 @@ struct dlm_assert_master_priv
        unsigned ignore_higher:1;
 };
+struct dlm_deref_lockres_priv
+{       /* type of the "dl" member of the dlm_work_item union */
+        struct dlm_lock_resource *deref_res; /* presumably the lockres being dereferenced -- confirm against users */
+        u8 deref_node; /* presumably the node whose reference is dropped -- confirm against users */
+};
 struct dlm_work_item
 {
@@ -191,6 +196,7 @@ struct dlm_work_item
                struct dlm_request_all_locks_priv ral;
                struct dlm_mig_lockres_priv ml;
                struct dlm_assert_master_priv am;
+                struct dlm_deref_lockres_priv dl;
        } u;
 };
@@ -222,6 +228,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
 #define DLM_LOCK_RES_DIRTY                0x00000008
 #define DLM_LOCK_RES_IN_PROGRESS          0x00000010
 #define DLM_LOCK_RES_MIGRATING            0x00000020
+#define DLM_LOCK_RES_DROPPING_REF         0x00000040
+#define DLM_LOCK_RES_BLOCK_DIRTY          0x00001000
+#define DLM_LOCK_RES_SETREF_INPROG        0x00002000
 /* max milliseconds to wait to sync up a network failure with a node death */
 #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -265,6 +274,8 @@ struct dlm_lock_resource
        u8  owner;              //node which owns the lock resource, or unknown
        u16 state;
        char lvb[DLM_LVB_LEN];
+        unsigned int inflight_locks;
+        unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 };
 struct dlm_migratable_lock
@@ -367,7 +378,7 @@ enum {
        DLM_CONVERT_LOCK_MSG,    /* 504 */
        DLM_PROXY_AST_MSG,       /* 505 */
        DLM_UNLOCK_LOCK_MSG,     /* 506 */
-        DLM_UNUSED_MSG2,         /* 507 */
+        DLM_DEREF_LOCKRES_MSG,   /* 507 */
        DLM_MIGRATE_REQUEST_MSG, /* 508 */
        DLM_MIG_LOCKRES_MSG,     /* 509 */
        DLM_QUERY_JOIN_MSG,      /* 510 */
@@ -417,6 +428,9 @@ struct dlm_master_request
        u8 name[O2NM_MAX_NAME_LEN];
 };
+#define DLM_ASSERT_RESPONSE_REASSERT       0x00000001
+#define DLM_ASSERT_RESPONSE_MASTERY_REF    0x00000002
 #define DLM_ASSERT_MASTER_MLE_CLEANUP      0x00000001
 #define DLM_ASSERT_MASTER_REQUERY          0x00000002
 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
@@ -430,6 +444,8 @@ struct dlm_assert_master
        u8 name[O2NM_MAX_NAME_LEN];
 };
+#define DLM_MIGRATE_RESPONSE_MASTERY_REF   0x00000001
 struct dlm_migrate_request
 {
        u8 master;
@@ -609,12 +625,16 @@ struct dlm_begin_reco
 };
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) /* bytes needed to hold 'bits' bits, rounded up */
 struct dlm_query_join_request
 {
         u8 node_idx;
         u8 pad1[2];
         u8 name_len;
         u8 domain[O2NM_MAX_NAME_LEN];
+        u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; /* byte-array bitmap with room for O2NM_MAX_NODES bits; presumably the sender's live nodemap -- confirm */
 };
 struct dlm_assert_joined
@@ -648,6 +668,16 @@ struct dlm_finalize_reco
        __be32 pad2;
 };
+struct dlm_deref_lockres
+{       /* presumably the payload of DLM_DEREF_LOCKRES_MSG -- confirm against the sender */
+        u32 pad1;
+        u16 pad2;
+        u8 node_idx;
+        u8 namelen; /* presumably the number of valid bytes in name[] -- confirm */
+        u8 name[O2NM_MAX_NAME_LEN];
+};
 static inline enum dlm_status
 __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
 {
@@ -688,16 +718,20 @@ void dlm_lock_put(struct dlm_lock *lock);
 void dlm_lock_attach_lockres(struct dlm_lock *lock,
                             struct dlm_lock_resource *res);
-int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
-int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+                            void **ret_data);
-int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+                             void **ret_data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+                          void **ret_data);
 void dlm_revert_pending_convert(struct dlm_lock_resource *res,
                                struct dlm_lock *lock);
 void dlm_revert_pending_lock(struct dlm_lock_resource *res,
                             struct dlm_lock *lock);
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+                            void **ret_data);
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
                               struct dlm_lock *lock);
 void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
@@ -721,8 +755,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                              struct dlm_lock_resource *res);
 void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                            struct dlm_lock_resource *res);
-void dlm_purge_lockres(struct dlm_ctxt *dlm,
-                       struct dlm_lock_resource *lockres);
 static inline void dlm_lockres_get(struct dlm_lock_resource *res)
 {
        /* This is called on every lookup, so it might be worth
@@ -733,6 +765,10 @@ void dlm_lockres_put(struct dlm_lock_resource *res);
 void __dlm_unhash_lockres(struct dlm_lock_resource *res);
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
                          struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+                                                     const char *name,
+                                                     unsigned int len,
+                                                     unsigned int hash);
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                                const char *name,
                                                unsigned int len,
@@ -753,6 +789,47 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
                                          const char *name,
                                          unsigned int namelen);
+#define dlm_lockres_set_refmap_bit(bit,res)  \
+        __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) /* forwards the call site's file and line */
+#define dlm_lockres_clear_refmap_bit(bit,res)  \
+        __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) /* forwards the call site's file and line */
+static inline void __dlm_lockres_set_refmap_bit(int bit,
+                                                struct dlm_lock_resource *res,
+                                                const char *file,
+                                                int line)
+{       /* set 'bit' in res->refmap; file/line are used only by the disabled printk below */
+        //printk("%s:%d:%.*s: setting bit %d\n", file, line,
+        //     res->lockname.len, res->lockname.name, bit);
+        set_bit(bit, res->refmap);
+}
+static inline void __dlm_lockres_clear_refmap_bit(int bit,
+                                                  struct dlm_lock_resource *res,
+                                                  const char *file,
+                                                  int line)
+{       /* clear 'bit' in res->refmap; file/line are used only by the disabled printk below */
+        //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
+        //     res->lockname.len, res->lockname.name, bit);
+        clear_bit(bit, res->refmap);
+}
+void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+                                   struct dlm_lock_resource *res,
+                                   const char *file,
+                                   int line);
+void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+                                   struct dlm_lock_resource *res,
+                                   int new_lockres,
+                                   const char *file,
+                                   int line);
+#define dlm_lockres_drop_inflight_ref(d,r)  \
+        __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) /* forwards the call site's file and line */
+#define dlm_lockres_grab_inflight_ref(d,r)  \
+        __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) /* new_lockres = 0 */
+#define dlm_lockres_grab_inflight_ref_new(d,r)  \
+        __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) /* new_lockres = 1 */
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
@@ -801,10 +878,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm);
 void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
 void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
-int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-                        struct dlm_lock_resource *res,
-                        u8 target);
 int dlm_finish_migration(struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res,
                         u8 old_master);
@@ -812,15 +886,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
                             struct dlm_lock_resource *res);
 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
-int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
-int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
+                               void **ret_data);
-int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
-int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
+                              void **ret_data);
-int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
-int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
-int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
+                              void **ret_data);
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+                                void **ret_data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+                            void **ret_data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+                                  void **ret_data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+                           void **ret_data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+                              void **ret_data);
 int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                          u8 nodenum, u8 *real_master);
@@ -856,10 +942,12 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 int dlm_init_mle_cache(void);
 void dlm_destroy_mle_cache(void);
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
+                         struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
                           u8 dead_node);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
 static inline const char * dlm_lock_mode_name(int mode)
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index c764dc8e40a2..ecb4d997221e 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -286,8 +286,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
                __dlm_print_one_lock_resource(res);
                mlog(ML_ERROR, "converting a remote lock that is already "
                     "converting! (cookie=%u:%llu, conv=%d)\n",
-                     dlm_get_lock_cookie_node(lock->ml.cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                     dlm_get_lock_cookie_seq(lock->ml.cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                     lock->ml.convert_type);
                status = DLM_DENIED;
                goto bail;
@@ -418,7 +418,8 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
 *          status from __dlmconvert_master
 */
-int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+                             void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
@@ -428,7 +429,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_lockstatus *lksb;
        enum dlm_status status = DLM_NORMAL;
        u32 flags;
-        int call_ast = 0, kick_thread = 0, ast_reserved = 0;
+        int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
        if (!dlm_grab(dlm)) {
                dlm_error(DLM_REJECTED);
@@ -479,25 +480,14 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
                }
                lock = NULL;
        }
-        if (!lock) {
-                __dlm_print_one_lock_resource(res);
-                list_for_each(iter, &res->granted) {
-                        lock = list_entry(iter, struct dlm_lock, list);
-                        if (lock->ml.node == cnv->node_idx) {
-                                mlog(ML_ERROR, "There is something here "
-                                     "for node %u, lock->ml.cookie=%llu, "
-                                     "cnv->cookie=%llu\n", cnv->node_idx,
-                                     (unsigned long long)lock->ml.cookie,
-                                     (unsigned long long)cnv->cookie);
-                                break;
-                        }
-                }
-                lock = NULL;
-        }
        spin_unlock(&res->spinlock);
        if (!lock) {
                status = DLM_IVLOCKID;
-                dlm_error(status);
+                mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+                               "cookie=%u:%llu\n",
+                     dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
+                __dlm_print_one_lock_resource(res);
                goto leave;
        }
@@ -524,8 +514,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
                                             cnv->requested_type,
                                             &call_ast, &kick_thread);
                res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+                wake = 1;
        }
        spin_unlock(&res->spinlock);
+        if (wake)
+                wake_up(&res->wq);
        if (status != DLM_NORMAL) {
                if (status != DLM_NOTQUEUED)
@@ -534,12 +527,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
        }
 leave:
-        if (!lock)
+        if (lock)
-                mlog(ML_ERROR, "did not find lock to convert on grant queue! "
-                               "cookie=%u:%llu\n",
-                               dlm_get_lock_cookie_node(cnv->cookie),
-                               dlm_get_lock_cookie_seq(cnv->cookie));
-        else
                dlm_lock_put(lock);
        /* either queue the ast or release it, if reserved */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 3f6c8d88f7af..64239b37e5d4 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
        spin_unlock(&res->spinlock);
 }
+static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
+{       /* log every bit set in res->refmap, followed by res->inflight_locks */
+        int bit;
+        assert_spin_locked(&res->spinlock); /* caller must already hold res->spinlock */
+        mlog(ML_NOTICE, "  refmap nodes: [ ");
+        bit = 0;
+        while (1) {
+                bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); /* next set bit at or after 'bit' */
+                if (bit >= O2NM_MAX_NODES) /* no set bit left below O2NM_MAX_NODES */
+                        break;
+                printk("%u ", bit); /* NOTE(review): %u is paired with an int argument */
+                bit++; /* resume the search just past the bit printed */
+        }
+        printk("], inflight=%u\n", res->inflight_locks);
+}
 void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
        struct list_head *iter2;
@@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
               res->owner, res->state);
        mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
             res->last_used, list_empty(&res->purge) ? "no" : "yes");
+        dlm_print_lockres_refmap(res);
        mlog(ML_NOTICE, "  granted queue: \n");
        list_for_each(iter2, &res->granted) {
                lock = list_entry(iter2, struct dlm_lock, list);
@@ -72,8 +90,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                       dlm_get_lock_cookie_node(lock->ml.cookie), 
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                       dlm_get_lock_cookie_seq(lock->ml.cookie), 
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                       list_empty(&lock->ast_list) ? 'y' : 'n',
                       lock->ast_pending ? 'y' : 'n',
                       list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -87,8 +105,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                       dlm_get_lock_cookie_node(lock->ml.cookie), 
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                       dlm_get_lock_cookie_seq(lock->ml.cookie), 
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                       list_empty(&lock->ast_list) ? 'y' : 'n',
                       lock->ast_pending ? 'y' : 'n',
                       list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -102,8 +120,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                       dlm_get_lock_cookie_node(lock->ml.cookie), 
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                       dlm_get_lock_cookie_seq(lock->ml.cookie), 
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                       list_empty(&lock->ast_list) ? 'y' : 'n',
                       lock->ast_pending ? 'y' : 'n',
                       list_empty(&lock->bast_list) ? 'y' : 'n',
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index f0b25f2dd205..6087c4749fee 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -48,6 +48,36 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
+/*
+ * ocfs2 node maps are arrays of long int, which cannot be sent freely
+ * across the wire because of endianness issues. To work around this, we
+ * convert the long ints to byte arrays. The following 3 routines are
+ * helper functions to set/test/copy bits within those arrays of bytes.
+ */
+static inline void byte_set_bit(u8 nr, u8 map[])
+{       /* set bit nr of the byte-array bitmap: byte nr/8, bit nr%8 within that byte */
+        map[nr >> 3] |= (1UL << (nr & 7));
+}
+static inline int byte_test_bit(u8 nr, u8 map[])
+{       /* return 1 if bit nr (byte nr/8, bit nr%8) is set in map, otherwise 0 */
+        return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0;
+}
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+                        unsigned int sz)
+{       /* copy the first sz bits of the long-array bitmap smap into the byte-array bitmap dmap */
+        unsigned int nn;
+        if (!sz)
+                return; /* nothing to copy; dmap is left untouched */
+        memset(dmap, 0, ((sz + 7) >> 3)); /* zero the sz/8 (rounded up) destination bytes first */
+        for (nn = 0 ; nn < sz; nn++)
+                if (test_bit(nn, smap))
+                        byte_set_bit(nn, dmap); /* NOTE(review): nn narrows to u8 here, so sz above 256 would wrap -- confirm callers */
+}
 static void dlm_free_pagevec(void **vec, int pages)
 {
        while (pages--)
@@ -95,10 +125,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
 #define DLM_DOMAIN_BACKOFF_MS 200
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
+                                  void **ret_data);
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+                                     void **ret_data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+                                   void **ret_data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+                                   void **ret_data);
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
@@ -125,10 +159,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
        hlist_add_head(&res->hash_node, bucket);
 }
-struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
-                                                const char *name,
+                                                     const char *name,
-                                                unsigned int len,
+                                                     unsigned int len,
-                                                unsigned int hash)
+                                                     unsigned int hash)
 {
        struct hlist_head *bucket;
        struct hlist_node *list;
@@ -154,6 +188,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
        return NULL;
 }
+/* intended to be called by functions which do not care about lock
+ * resources which are being purged (most net _handler functions).
+ * this will return NULL for any lock resource which is found but
+ * currently in the process of dropping its mastery reference.
+ * use __dlm_lookup_lockres_full when you need the lock resource
+ * regardless (e.g. dlm_get_lock_resource) */
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+                                                const char *name,
+                                                unsigned int len,
+                                                unsigned int hash)
+{
+        struct dlm_lock_resource *res = NULL;
+        mlog_entry("%.*s\n", len, name);
+        assert_spin_locked(&dlm->spinlock);
+        res = __dlm_lookup_lockres_full(dlm, name, len, hash);
+        if (res) {
+                spin_lock(&res->spinlock);
+                if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+                        spin_unlock(&res->spinlock);
+                        dlm_lockres_put(res);
+                        return NULL;
+                }
+                spin_unlock(&res->spinlock);
+        }
+        return res;
+}
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                    const char *name,
                                    unsigned int len)
@@ -330,43 +395,60 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
        wake_up(&dlm_domain_events);
 }
-static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 {
-        int i;
+        int i, num, n, ret = 0;
        struct dlm_lock_resource *res;
+        struct hlist_node *iter;
+        struct hlist_head *bucket;
+        int dropped;
        mlog(0, "Migrating locks from domain %s\n", dlm->name);
-restart:
+        num = 0;
        spin_lock(&dlm->spinlock);
        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
+redo_bucket:
-                        res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
+                n = 0;
-                                          struct dlm_lock_resource, hash_node);
+                bucket = dlm_lockres_hash(dlm, i);
-                        /* need reference when manually grabbing lockres */
+                iter = bucket->first;
+                while (iter) {
+                        n++;
+                        res = hlist_entry(iter, struct dlm_lock_resource,
+                                          hash_node);
                        dlm_lockres_get(res);
-                        /* this should unhash the lockres
+                        /* migrate, if necessary.  this will drop the dlm
-                         * and exit with dlm->spinlock */
+                         * spinlock and retake it if it does migration. */
-                        mlog(0, "purging res=%p\n", res);
+                        dropped = dlm_empty_lockres(dlm, res);
-                        if (dlm_lockres_is_dirty(dlm, res)) {
-                                /* HACK!  this should absolutely go.
+                        spin_lock(&res->spinlock);
-                                 * need to figure out why some empty
+                        __dlm_lockres_calc_usage(dlm, res);
-                                 * lockreses are still marked dirty */
+                        iter = res->hash_node.next;
-                                mlog(ML_ERROR, "lockres %.*s dirty!\n",
+                        spin_unlock(&res->spinlock);
-                                     res->lockname.len, res->lockname.name);
-                                spin_unlock(&dlm->spinlock);
-                                dlm_kick_thread(dlm, res);
-                                wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
-                                dlm_lockres_put(res);
-                                goto restart;
-                        }
-                        dlm_purge_lockres(dlm, res);
                        dlm_lockres_put(res);
+                        cond_resched_lock(&dlm->spinlock);
+                        if (dropped)
+                                goto redo_bucket;
                }
+                num += n;
+                mlog(0, "%s: touched %d lockreses in bucket %d "
+                     "(tot=%d)\n", dlm->name, n, i, num);
        }
        spin_unlock(&dlm->spinlock);
+        wake_up(&dlm->dlm_thread_wq);
+        /* let the dlm thread take care of purging, keep scanning until
+         * nothing remains in the hash */
+        if (num) {
+                mlog(0, "%s: %d lock resources in hash last pass\n",
+                     dlm->name, num);
+                ret = -EAGAIN;
+        }
        mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+        return ret;
 }
 static int dlm_no_joining_node(struct dlm_ctxt *dlm)
@@ -418,7 +500,8 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
        printk("\n");
 }
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+                                   void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        unsigned int node;
@@ -571,7 +654,9 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                /* We changed dlm state, notify the thread */
                dlm_kick_thread(dlm, NULL);
-                dlm_migrate_all_locks(dlm);
+                while (dlm_migrate_all_locks(dlm)) {
+                        mlog(0, "%s: more migration to do\n", dlm->name);
+                }
                dlm_mark_domain_leaving(dlm);
                dlm_leave_domain(dlm);
                dlm_complete_dlm_shutdown(dlm);
@@ -580,11 +665,13 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 }
 EXPORT_SYMBOL_GPL(dlm_unregister_domain);
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+                                  void **ret_data)
 {
        struct dlm_query_join_request *query;
        enum dlm_query_join_response response;
        struct dlm_ctxt *dlm = NULL;
+        u8 nodenum;
        query = (struct dlm_query_join_request *) msg->buf;
@@ -608,6 +695,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm_domain_lock);
        dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+        if (!dlm)
+                goto unlock_respond;
+        /*
+         * There is a small window where the joining node may not see the
+         * node(s) that just left but are still part of the cluster. DISALLOW
+         * the join request if the joining node has a different node map.
+         */
+        nodenum=0;
+        while (nodenum < O2NM_MAX_NODES) {
+                if (test_bit(nodenum, dlm->domain_map)) {
+                        if (!byte_test_bit(nodenum, query->node_map)) {
+                                mlog(0, "disallow join as node %u does not "
+                                     "have node %u in its nodemap\n",
+                                     query->node_idx, nodenum);
+                                response = JOIN_DISALLOW;
+                                goto unlock_respond;
+                        }
+                }
+                nodenum++;
+        }
        /* Once the dlm ctxt is marked as leaving then we don't want
         * to be put in someone's domain map. 
         * Also, explicitly disallow joining at certain troublesome
@@ -626,15 +735,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
                        /* Disallow parallel joins. */
                        response = JOIN_DISALLOW;
                } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
-                        mlog(ML_NOTICE, "node %u trying to join, but recovery "
+                        mlog(0, "node %u trying to join, but recovery "
                             "is ongoing.\n", bit);
                        response = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->recovery_map)) {
-                        mlog(ML_NOTICE, "node %u trying to join, but it "
+                        mlog(0, "node %u trying to join, but it "
                             "still needs recovery.\n", bit);
                        response = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->domain_map)) {
-                        mlog(ML_NOTICE, "node %u trying to join, but it "
+                        mlog(0, "node %u trying to join, but it "
                             "is still in the domain! needs recovery?\n",
                             bit);
                        response = JOIN_DISALLOW;
@@ -649,6 +758,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
                spin_unlock(&dlm->spinlock);
        }
+unlock_respond:
        spin_unlock(&dlm_domain_lock);
 respond:
@@ -657,7 +767,8 @@ respond:
        return response;
 }
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+                                     void **ret_data)
 {
        struct dlm_assert_joined *assert;
        struct dlm_ctxt *dlm = NULL;
@@ -694,7 +805,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
        return 0;
 }
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+                                   void **ret_data)
 {
        struct dlm_cancel_join *cancel;
        struct dlm_ctxt *dlm = NULL;
@@ -796,6 +908,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        join_msg.name_len = strlen(dlm->name);
        memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+        /* copy live node map to join message */
+        byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
        status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
                                    sizeof(join_msg), node, &retval);
        if (status < 0 && status != -ENOPROTOOPT) {
@@ -1036,98 +1151,106 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
                                        sizeof(struct dlm_master_request),
                                        dlm_master_request_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
                                        sizeof(struct dlm_assert_master),
                                        dlm_assert_master_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, dlm_assert_master_post_handler,
+                                        &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
                                        sizeof(struct dlm_create_lock),
                                        dlm_create_lock_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
                                        DLM_CONVERT_LOCK_MAX_LEN,
                                        dlm_convert_lock_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
                                        DLM_UNLOCK_LOCK_MAX_LEN,
                                        dlm_unlock_lock_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
                                        DLM_PROXY_AST_MAX_LEN,
                                        dlm_proxy_ast_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
                                        sizeof(struct dlm_exit_domain),
                                        dlm_exit_domain_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
+        if (status)
+                goto bail;
+        status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key,
+                                        sizeof(struct dlm_deref_lockres),
+                                        dlm_deref_lockres_handler,
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
                                        sizeof(struct dlm_migrate_request),
                                        dlm_migrate_request_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
                                        DLM_MIG_LOCKRES_MAX_LEN,
                                        dlm_mig_lockres_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
                                        sizeof(struct dlm_master_requery),
                                        dlm_master_requery_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
                                        sizeof(struct dlm_lock_request),
                                        dlm_request_all_locks_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
                                        sizeof(struct dlm_reco_data_done),
                                        dlm_reco_data_done_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
                                        sizeof(struct dlm_begin_reco),
                                        dlm_begin_reco_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
                                        sizeof(struct dlm_finalize_reco),
                                        dlm_finalize_reco_handler,
-                                        dlm, &dlm->dlm_domain_handlers);
+                                        dlm, NULL, &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
@@ -1141,6 +1264,8 @@ bail:
 static int dlm_join_domain(struct dlm_ctxt *dlm)
 {
        int status;
+        unsigned int backoff;
+        unsigned int total_backoff = 0;
        BUG_ON(!dlm);
@@ -1172,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
        }
        do {
-                unsigned int backoff;
                status = dlm_try_to_join_domain(dlm);
                /* If we're racing another node to the join, then we
                 * need to back off temporarily and let them
                 * complete. */
+#define DLM_JOIN_TIMEOUT_MSECS  90000
                if (status == -EAGAIN) {
                        if (signal_pending(current)) {
                                status = -ERESTARTSYS;
                                goto bail;
                        }
+                        if (total_backoff >
+                            msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+                                status = -ERESTARTSYS;
+                                mlog(ML_NOTICE, "Timed out joining dlm domain "
+                                     "%s after %u msecs\n", dlm->name,
+                                     jiffies_to_msecs(total_backoff));
+                                goto bail;
+                        }
                        /*
                         * <chip> After you!
                         * <dale> No, after you!
@@ -1193,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                         */
                        backoff = (unsigned int)(jiffies & 0x3);
                        backoff *= DLM_DOMAIN_BACKOFF_MS;
+                        total_backoff += backoff;
                        mlog(0, "backoff %d\n", backoff);
                        msleep(backoff);
                }
@@ -1421,21 +1556,21 @@ static int dlm_register_net_handlers(void)
        status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
                                        sizeof(struct dlm_query_join_request),
                                        dlm_query_join_handler,
-                                        NULL, &dlm_join_handlers);
+                                        NULL, NULL, &dlm_join_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
                                        sizeof(struct dlm_assert_joined),
                                        dlm_assert_joined_handler,
-                                        NULL, &dlm_join_handlers);
+                                        NULL, NULL, &dlm_join_handlers);
        if (status)
                goto bail;
        status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
                                        sizeof(struct dlm_cancel_join),
                                        dlm_cancel_join_handler,
-                                        NULL, &dlm_join_handlers);
+                                        NULL, NULL, &dlm_join_handlers);
 bail:
        if (status < 0)
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index e5ca3db197f6..52578d907d9a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
                        kick_thread = 1;
                }
        }
+        /* reduce the inflight count, this may result in the lockres
+         * being purged below during calc_usage */
+        if (lock->ml.node == dlm->node_num)
+                dlm_lockres_drop_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);
        wake_up(&res->wq);
@@ -437,7 +441,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 *   held on exit:  none
 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
 */
-int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+                            void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0ad872055cb3..77e4e6169a0d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
                            int idx);
 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
-static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
-                                unsigned int namelen, void *nodemap,
+                                struct dlm_lock_resource *res,
-                                u32 flags);
+                                void *nodemap, u32 flags);
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
 static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
                                struct dlm_master_list_entry *mle,
@@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
                        struct dlm_master_list_entry **mle,
                        char *name, unsigned int namelen);
-static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+                                 struct dlm_master_list_entry *mle, int to);
 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
@@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        INIT_LIST_HEAD(&res->purge);
        atomic_set(&res->asts_reserved, 0);
        res->migration_pending = 0;
+        res->inflight_locks = 0;
        kref_init(&res->refs);
@@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->last_used = 0;
        memset(res->lvb, 0, DLM_LVB_LEN);
+        memset(res->refmap, 0, sizeof(res->refmap));
 }
 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
        return res;
 }
+void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+                                   struct dlm_lock_resource *res,
+                                   int new_lockres,
+                                   const char *file,
+                                   int line)
+{
+        if (!new_lockres)
+                assert_spin_locked(&res->spinlock);
+        if (!test_bit(dlm->node_num, res->refmap)) {
+                BUG_ON(res->inflight_locks != 0);
+                dlm_lockres_set_refmap_bit(dlm->node_num, res);
+        }
+        res->inflight_locks++;
+        mlog(0, "%s:%.*s: inflight++: now %u\n",
+             dlm->name, res->lockname.len, res->lockname.name,
+             res->inflight_locks);
+}
+void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+                                   struct dlm_lock_resource *res,
+                                   const char *file,
+                                   int line)
+{
+        assert_spin_locked(&res->spinlock);
+        BUG_ON(res->inflight_locks == 0);
+        res->inflight_locks--;
+        mlog(0, "%s:%.*s: inflight--: now %u\n",
+             dlm->name, res->lockname.len, res->lockname.name,
+             res->inflight_locks);
+        if (res->inflight_locks == 0)
+                dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+        wake_up(&res->wq);
+}
 /*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
@@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
        unsigned int hash;
        int tries = 0;
        int bit, wait_on_recovery = 0;
+        int drop_inflight_if_nonlocal = 0;
        BUG_ON(!lockid);
@@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 lookup:
        spin_lock(&dlm->spinlock);
-        tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
+        tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
        if (tmpres) {
+                int dropping_ref = 0;
+                spin_lock(&tmpres->spinlock);
+                if (tmpres->owner == dlm->node_num) {
+                        BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
+                        dlm_lockres_grab_inflight_ref(dlm, tmpres);
+                } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
+                        dropping_ref = 1;
+                spin_unlock(&tmpres->spinlock);
                spin_unlock(&dlm->spinlock);
+                /* wait until done messaging the master, drop our ref to allow
+                 * the lockres to be purged, start over. */
+                if (dropping_ref) {
+                        spin_lock(&tmpres->spinlock);
+                        __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+                        spin_unlock(&tmpres->spinlock);
+                        dlm_lockres_put(tmpres);
+                        tmpres = NULL;
+                        goto lookup;
+                }
                mlog(0, "found in hash!\n");
                if (res)
                        dlm_lockres_put(res);
@@ -793,6 +855,7 @@ lookup:
                spin_lock(&res->spinlock);
                dlm_change_lockres_owner(dlm, res, dlm->node_num);
                __dlm_insert_lockres(dlm, res);
+                dlm_lockres_grab_inflight_ref(dlm, res);
                spin_unlock(&res->spinlock);
                spin_unlock(&dlm->spinlock);
                /* lockres still marked IN_PROGRESS */
@@ -805,29 +868,40 @@ lookup:
        /* if we found a block, wait for lock to be mastered by another node */
        blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
        if (blocked) {
+                int mig;
                if (mle->type == DLM_MLE_MASTER) {
                        mlog(ML_ERROR, "master entry for nonexistent lock!\n");
                        BUG();
-                } else if (mle->type == DLM_MLE_MIGRATION) {
+                }
-                        /* migration is in progress! */
+                mig = (mle->type == DLM_MLE_MIGRATION);
-                        /* the good news is that we now know the
+                /* if there is a migration in progress, let the migration
-                         * "current" master (mle->master). */
+                 * finish before continuing.  we can wait for the absence
+                 * of the MIGRATION mle: either the migrate finished or
+                 * one of the nodes died and the mle was cleaned up.
+                 * if there is a BLOCK here, but it already has a master
+                 * set, we are too late.  the master does not have a ref
+                 * for us in the refmap.  detach the mle and drop it.
+                 * either way, go back to the top and start over. */
+                if (mig || mle->master != O2NM_MAX_NODES) {
+                        BUG_ON(mig && mle->master == dlm->node_num);
+                        /* we arrived too late.  the master does not
+                         * have a ref for us. retry. */
+                        mlog(0, "%s:%.*s: late on %s\n",
+                             dlm->name, namelen, lockid,
+                             mig ?  "MIGRATION" : "BLOCK");
                        spin_unlock(&dlm->master_lock);
-                        assert_spin_locked(&dlm->spinlock);
-                        /* set the lockres owner and hash it */
-                        spin_lock(&res->spinlock);
-                        dlm_set_lockres_owner(dlm, res, mle->master);
-                        __dlm_insert_lockres(dlm, res);
-                        spin_unlock(&res->spinlock);
                        spin_unlock(&dlm->spinlock);
                        /* master is known, detach */
-                        dlm_mle_detach_hb_events(dlm, mle);
+                        if (!mig)
+                                dlm_mle_detach_hb_events(dlm, mle);
                        dlm_put_mle(mle);
                        mle = NULL;
-                        goto wake_waiters;
+                        /* this is lame, but we can't wait on either
+                         * the mle or lockres waitqueue here */
+                        if (mig)
+                                msleep(100);
+                        goto lookup;
                }
        } else {
                /* go ahead and try to master lock on this node */
@@ -858,6 +932,13 @@ lookup:
        /* finally add the lockres to its hash bucket */
        __dlm_insert_lockres(dlm, res);
+        /* since this lockres is new it does not require the spinlock */
+        dlm_lockres_grab_inflight_ref_new(dlm, res);
+        /* if this node does not become the master make sure to drop
+         * this inflight reference below */
+        drop_inflight_if_nonlocal = 1;
        /* get an extra ref on the mle in case this is a BLOCK
         * if so, the creator of the BLOCK may try to put the last
         * ref at this time in the assert master handler, so we
@@ -910,7 +991,7 @@ redo_request:
        ret = -EINVAL;
        dlm_node_iter_init(mle->vote_map, &iter);
        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
-                ret = dlm_do_master_request(mle, nodenum);
+                ret = dlm_do_master_request(res, mle, nodenum);
                if (ret < 0)
                        mlog_errno(ret);
                if (mle->master != O2NM_MAX_NODES) {
@@ -960,6 +1041,8 @@ wait:
 wake_waiters:
        spin_lock(&res->spinlock);
+        if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
+                dlm_lockres_drop_inflight_ref(dlm, res);
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
        spin_unlock(&res->spinlock);
        wake_up(&res->wq);
@@ -998,7 +1081,7 @@ recheck:
                /* this will cause the master to re-assert across
                 * the whole cluster, freeing up mles */
                if (res->owner != dlm->node_num) {
-                        ret = dlm_do_master_request(mle, res->owner);
+                        ret = dlm_do_master_request(res, mle, res->owner);
                        if (ret < 0) {
                                /* give recovery a chance to run */
                                mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
@@ -1062,6 +1145,8 @@ recheck:
                                 * now tell other nodes that I am
                                 * mastering this. */
                                mle->master = dlm->node_num;
+                                /* ref was grabbed in get_lock_resource
+                                 * will be dropped in dlmlock_master */
                                assert = 1;
                                sleep = 0;
                        }
@@ -1087,7 +1172,8 @@ recheck:
                                         (atomic_read(&mle->woken) == 1),
                                         timeo);
                if (res->owner == O2NM_MAX_NODES) {
-                        mlog(0, "waiting again\n");
+                        mlog(0, "%s:%.*s: waiting again\n", dlm->name,
+                             res->lockname.len, res->lockname.name);
                        goto recheck;
                }
                mlog(0, "done waiting, master is %u\n", res->owner);
@@ -1100,8 +1186,7 @@ recheck:
                m = dlm->node_num;
                mlog(0, "about to master %.*s here, this=%u\n",
                     res->lockname.len, res->lockname.name, m);
-                ret = dlm_do_assert_master(dlm, res->lockname.name,
+                ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
-                                           res->lockname.len, mle->vote_map, 0);
                if (ret) {
                        /* This is a failure in the network path,
                         * not in the response to the assert_master
@@ -1117,6 +1202,8 @@ recheck:
        /* set the lockres owner */
        spin_lock(&res->spinlock);
+        /* mastery reference obtained either during
+         * assert_master_handler or in get_lock_resource */
        dlm_change_lockres_owner(dlm, res, m);
        spin_unlock(&res->spinlock);
@@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 *
 */
-static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+                                 struct dlm_master_list_entry *mle, int to)
 {
        struct dlm_ctxt *dlm = mle->dlm;
        struct dlm_master_request request;
@@ -1339,6 +1427,9 @@ again:
                case DLM_MASTER_RESP_YES:
                        set_bit(to, mle->response_map);
                        mlog(0, "node %u is the master, response=YES\n", to);
+                        mlog(0, "%s:%.*s: master node %u now knows I have a "
+                             "reference\n", dlm->name, res->lockname.len,
+                             res->lockname.name, to);
                        mle->master = to;
                        break;
                case DLM_MASTER_RESP_NO:
@@ -1379,7 +1470,8 @@ out:
 *
 * if possible, TRIM THIS DOWN!!!
 */
-int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data)
 {
        u8 response = DLM_MASTER_RESP_MAYBE;
        struct dlm_ctxt *dlm = data;
@@ -1417,10 +1509,11 @@ way_up_top:
                /* take care of the easy cases up front */
                spin_lock(&res->spinlock);
-                if (res->state & DLM_LOCK_RES_RECOVERING) {
+                if (res->state & (DLM_LOCK_RES_RECOVERING|
+                                  DLM_LOCK_RES_MIGRATING)) {
                        spin_unlock(&res->spinlock);
                        mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
-                             "being recovered\n");
+                             "being recovered/migrated\n");
                        response = DLM_MASTER_RESP_ERROR;
                        if (mle)
                                kmem_cache_free(dlm_mle_cache, mle);
@@ -1428,8 +1521,10 @@ way_up_top:
                }
                if (res->owner == dlm->node_num) {
+                        mlog(0, "%s:%.*s: setting bit %u in refmap\n",
+                             dlm->name, namelen, name, request->node_idx);
+                        dlm_lockres_set_refmap_bit(request->node_idx, res);
                        spin_unlock(&res->spinlock);
-                        // mlog(0, "this node is the master\n");
                        response = DLM_MASTER_RESP_YES;
                        if (mle)
                                kmem_cache_free(dlm_mle_cache, mle);
@@ -1477,7 +1572,6 @@ way_up_top:
                        mlog(0, "node %u is master, but trying to migrate to "
                             "node %u.\n", tmpmle->master, tmpmle->new_master);
                        if (tmpmle->master == dlm->node_num) {
-                                response = DLM_MASTER_RESP_YES;
                                mlog(ML_ERROR, "no owner on lockres, but this "
                                     "node is trying to migrate it to %u?!\n",
                                     tmpmle->new_master);
@@ -1494,6 +1588,10 @@ way_up_top:
                                 * go back and clean the mles on any
                                 * other nodes */
                                dispatch_assert = 1;
+                                dlm_lockres_set_refmap_bit(request->node_idx, res);
+                                mlog(0, "%s:%.*s: setting bit %u in refmap\n",
+                                     dlm->name, namelen, name,
+                                     request->node_idx);
                        } else
                                response = DLM_MASTER_RESP_NO;
                } else {
@@ -1607,17 +1705,24 @@ send_response:
 * can periodically run all locks owned by this node
 * and re-assert across the cluster...
 */
-static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+int dlm_do_assert_master(struct dlm_ctxt *dlm,
-                                unsigned int namelen, void *nodemap,
+                         struct dlm_lock_resource *res,
-                                u32 flags)
+                         void *nodemap, u32 flags)
 {
        struct dlm_assert_master assert;
        int to, tmpret;
        struct dlm_node_iter iter;
        int ret = 0;
        int reassert;
+        const char *lockname = res->lockname.name;
+        unsigned int namelen = res->lockname.len;
        BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+        spin_lock(&res->spinlock);
+        res->state |= DLM_LOCK_RES_SETREF_INPROG;
+        spin_unlock(&res->spinlock);
 again:
        reassert = 0;
@@ -1647,6 +1752,7 @@ again:
                        mlog(0, "link to %d went down!\n", to);
                        /* any nonzero status return will do */
                        ret = tmpret;
+                        r = 0;
                } else if (r < 0) {
                        /* ok, something horribly messed.  kill thyself. */
                        mlog(ML_ERROR,"during assert master of %.*s to %u, "
@@ -1661,17 +1767,39 @@ again:
                        spin_unlock(&dlm->master_lock);
                        spin_unlock(&dlm->spinlock);
                        BUG();
-                } else if (r == EAGAIN) {
+                }
+                if (r & DLM_ASSERT_RESPONSE_REASSERT &&
+                    !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
+                                mlog(ML_ERROR, "%.*s: very strange, "
+                                     "master MLE but no lockres on %u\n",
+                                     namelen, lockname, to);
+                }
+                if (r & DLM_ASSERT_RESPONSE_REASSERT) {
                        mlog(0, "%.*s: node %u create mles on other "
                             "nodes and requests a re-assert\n", 
                             namelen, lockname, to);
                        reassert = 1;
                }
+                if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
+                        mlog(0, "%.*s: node %u has a reference to this "
+                             "lockres, set the bit in the refmap\n",
+                             namelen, lockname, to);
+                        spin_lock(&res->spinlock);
+                        dlm_lockres_set_refmap_bit(to, res);
+                        spin_unlock(&res->spinlock);
+                }
        }
        if (reassert)
                goto again;
+        spin_lock(&res->spinlock);
+        res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+        spin_unlock(&res->spinlock);
+        wake_up(&res->wq);
        return ret;
 }
@@ -1684,7 +1812,8 @@ again:
 *
 * if possible, TRIM THIS DOWN!!!
 */
-int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+                              void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_master_list_entry *mle = NULL;
@@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
        char *name;
        unsigned int namelen, hash;
        u32 flags;
-        int master_request = 0;
+        int master_request = 0, have_lockres_ref = 0;
        int ret = 0;
        if (!dlm_grab(dlm))
@@ -1851,6 +1980,7 @@ ok:
                spin_unlock(&mle->spinlock);
                if (res) {
+                        int wake = 0;
                        spin_lock(&res->spinlock);
                        if (mle->type == DLM_MLE_MIGRATION) {
                                mlog(0, "finishing off migration of lockres %.*s, "
@@ -1858,12 +1988,16 @@ ok:
                                        res->lockname.len, res->lockname.name,
                                        dlm->node_num, mle->new_master);
                                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                                wake = 1;
                                dlm_change_lockres_owner(dlm, res, mle->new_master);
                                BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
                        } else {
                                dlm_change_lockres_owner(dlm, res, mle->master);
                        }
                        spin_unlock(&res->spinlock);
+                        have_lockres_ref = 1;
+                        if (wake)
+                                wake_up(&res->wq);
                }
                /* master is known, detach if not already detached.
@@ -1913,12 +2047,28 @@ ok:
 done:
        ret = 0;
-        if (res)
+        if (res) {
-                dlm_lockres_put(res);
+                spin_lock(&res->spinlock);
+                res->state |= DLM_LOCK_RES_SETREF_INPROG;
+                spin_unlock(&res->spinlock);
+                *ret_data = (void *)res;
+        }
        dlm_put(dlm);
        if (master_request) {
                mlog(0, "need to tell master to reassert\n");
-                ret = EAGAIN;  // positive. negative would shoot down the node.
+                /* positive. negative would shoot down the node. */
+                ret |= DLM_ASSERT_RESPONSE_REASSERT;
+                if (!have_lockres_ref) {
+                        mlog(ML_ERROR, "strange, got assert from %u, MASTER "
+                             "mle present here for %s:%.*s, but no lockres!\n",
+                             assert->node_idx, dlm->name, namelen, name);
+                }
+        }
+        if (have_lockres_ref) {
+                /* let the master know we have a reference to the lockres */
+                ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
+                mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
+                     dlm->name, namelen, name, assert->node_idx);
        }
        return ret;
@@ -1929,11 +2079,25 @@ kill:
        __dlm_print_one_lock_resource(res);
        spin_unlock(&res->spinlock);
        spin_unlock(&dlm->spinlock);
-        dlm_lockres_put(res);
+        *ret_data = (void *)res; 
        dlm_put(dlm);
        return -EINVAL;
 }
+/*
+ * Post-handler counterpart of dlm_assert_master_handler().
+ *
+ * ret_data is expected to be the lockres that the handler published
+ * through *ret_data instead of putting it itself; it is NULL when the
+ * handler had no lockres.  For a non-NULL lockres this clears
+ * DLM_LOCK_RES_SETREF_INPROG under res->spinlock, wakes anything sleeping
+ * on res->wq and finally drops the lockres reference.
+ *
+ * status and data are not used.
+ */
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
+{
+        struct dlm_lock_resource *lockres = ret_data;
+        /* the handler did not hand a lockres over: nothing to undo */
+        if (!lockres)
+                return;
+        spin_lock(&lockres->spinlock);
+        lockres->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+        spin_unlock(&lockres->spinlock);
+        /* wake only after the flag is visibly cleared */
+        wake_up(&lockres->wq);
+        dlm_lockres_put(lockres);
+}
 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
                               struct dlm_lock_resource *res,
                               int ignore_higher, u8 request_from, u32 flags)
@@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
         * even if one or more nodes die */
        mlog(0, "worker about to master %.*s here, this=%u\n",
                     res->lockname.len, res->lockname.name, dlm->node_num);
-        ret = dlm_do_assert_master(dlm, res->lockname.name,
+        ret = dlm_do_assert_master(dlm, res, nodemap, flags);
-                                   res->lockname.len,
-                                   nodemap, flags);
        if (ret < 0) {
                /* no need to restart, we are done */
                if (!dlm_is_host_down(ret))
@@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
        return ret;
 }
+/*
+ * DLM_DEREF_LOCKRES_MSG
+ */
+/*
+ * Send a DLM_DEREF_LOCKRES_MSG for res to res->owner on behalf of this
+ * node (dlm->node_num).
+ *
+ * Returns the o2net_send_message() result: negative (and logged) when the
+ * message could not be sent.  If the message was sent but the remote
+ * handler answered with a negative status, the lockres is dumped and the
+ * node BUG()s.
+ */
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+        const char *lockname = res->lockname.name;
+        unsigned int namelen = res->lockname.len;
+        struct dlm_deref_lockres deref;
+        int remote_status;
+        int ret;
+        BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+        mlog(0, "%s:%.*s: sending deref to %d\n",
+             dlm->name, namelen, lockname, res->owner);
+        /* build the request: who is dropping the ref, and on which name */
+        memset(&deref, 0, sizeof(deref));
+        deref.node_idx = dlm->node_num;
+        deref.namelen = namelen;
+        memcpy(deref.name, lockname, namelen);
+        ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
+                                 &deref, sizeof(deref), res->owner,
+                                 &remote_status);
+        if (ret < 0) {
+                /* transport failure; remote_status is not meaningful */
+                mlog_errno(ret);
+                return ret;
+        }
+        if (remote_status >= 0)
+                return ret;
+        /* BAD.  other node says I did not have a ref. */
+        mlog(ML_ERROR,"while dropping ref on %s:%.*s "
+            "(master=%u) got %d.\n", dlm->name, namelen,
+            lockname, res->owner, remote_status);
+        dlm_print_one_lock_resource(res);
+        BUG();
+        return ret;
+}
+/*
+ * Network handler for DLM_DEREF_LOCKRES_MSG: the node named in the message
+ * is giving up its reference on a lockres that is looked up by name here.
+ *
+ * If DLM_LOCK_RES_SETREF_INPROG is not set on the lockres, the sender's
+ * refmap bit is cleared inline.  Otherwise the clear is deferred to
+ * dlm_deref_lockres_worker() through the dlm work list.
+ *
+ * Returns 0 when the domain could not be grabbed, when the request was
+ * handled inline, or when it was queued for the worker; -EINVAL for an
+ * over-long name, an out-of-range node index or an unknown lockres name;
+ * -ENOMEM when the work item cannot be allocated.
+ *
+ * len and ret_data are not used.
+ */
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+                              void **ret_data)
+{
+        struct dlm_ctxt *dlm = data;
+        struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
+        struct dlm_lock_resource *res = NULL;
+        char *name;
+        unsigned int namelen;
+        int ret = -EINVAL;
+        u8 node;
+        unsigned int hash;
+        struct dlm_work_item *item;
+        int cleared = 0;
+        int dispatch = 0;
+        if (!dlm_grab(dlm))
+                return 0;
+        name = deref->name;
+        namelen = deref->namelen;
+        node = deref->node_idx;
+        /* validate the wire data before it is used for lookup/bit ops */
+        if (namelen > DLM_LOCKID_NAME_MAX) {
+                mlog(ML_ERROR, "Invalid name length!\n");
+                goto done;
+        }
+        if (deref->node_idx >= O2NM_MAX_NODES) {
+                mlog(ML_ERROR, "Invalid node number: %u\n", node);
+                goto done;
+        }
+        hash = dlm_lockid_hash(name, namelen);
+        spin_lock(&dlm->spinlock);
+        res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+        if (!res) {
+                spin_unlock(&dlm->spinlock);
+                mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+                     dlm->name, namelen, name);
+                goto done;
+        }
+        spin_unlock(&dlm->spinlock);
+        spin_lock(&res->spinlock);
+        if (res->state & DLM_LOCK_RES_SETREF_INPROG)
+                /* a set-ref is still in flight; defer to the worker */
+                dispatch = 1;
+        else {
+                BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+                if (test_bit(node, res->refmap)) {
+                        dlm_lockres_clear_refmap_bit(node, res);
+                        cleared = 1;
+                }
+        }
+        spin_unlock(&res->spinlock);
+        if (!dispatch) {
+                if (cleared)
+                        dlm_lockres_calc_usage(dlm, res);
+                else {
+                        mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+                             "but it is already dropped!\n", dlm->name,
+                             res->lockname.len, res->lockname.name, node);
+                        /* res->spinlock was released above, so use the
+                         * variant that is called without it held (as
+                         * dlm_drop_lockres_ref() does) rather than the
+                         * __ variant, which other callers in this file
+                         * only invoke with res->spinlock held */
+                        dlm_print_one_lock_resource(res);
+                }
+                ret = 0;
+                goto done;
+        }
+        item = kzalloc(sizeof(*item), GFP_NOFS);
+        if (!item) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto done;
+        }
+        dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
+        item->u.dl.deref_res = res;
+        item->u.dl.deref_node = node;
+        spin_lock(&dlm->work_lock);
+        list_add_tail(&item->list, &dlm->work_list);
+        spin_unlock(&dlm->work_lock);
+        queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+        /* the lockres reference travels with the work item and is put by
+         * dlm_deref_lockres_worker(); the dlm reference taken by
+         * dlm_grab() is presumably released by the work dispatcher --
+         * TODO confirm against dlm_dispatch_work() */
+        return 0;
+done:
+        if (res)
+                dlm_lockres_put(res);
+        dlm_put(dlm);
+        return ret;
+}
+/*
+ * Deferred half of dlm_deref_lockres_handler(), run from the dlm work
+ * list when the handler found DLM_LOCK_RES_SETREF_INPROG set.
+ *
+ * item carries the lockres (u.dl.deref_res) and the node whose reference
+ * is being dropped (u.dl.deref_node).  If that node's refmap bit is set,
+ * waits for SETREF_INPROG to clear, clears the bit and recalculates
+ * lockres usage; otherwise logs the unexpected double drop.  Always puts
+ * the lockres reference handed over by the handler.  data is not used.
+ */
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
+{
+        struct dlm_ctxt *dlm;
+        struct dlm_lock_resource *res;
+        u8 node;
+        u8 cleared = 0;
+        dlm = item->dlm;
+        res = item->u.dl.deref_res;
+        node = item->u.dl.deref_node;
+        spin_lock(&res->spinlock);
+        BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+        if (test_bit(node, res->refmap)) {
+                /* let any in-flight set-ref finish before clearing */
+                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+                dlm_lockres_clear_refmap_bit(node, res);
+                cleared = 1;
+        }
+        spin_unlock(&res->spinlock);
+        if (cleared) {
+                mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
+                     dlm->name, res->lockname.len, res->lockname.name, node);
+                dlm_lockres_calc_usage(dlm, res);
+        } else {
+                mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+                     "but it is already dropped!\n", dlm->name,
+                     res->lockname.len, res->lockname.name, node);
+                /* res->spinlock is no longer held here, so use the
+                 * variant that is called without it held (as
+                 * dlm_drop_lockres_ref() does) instead of the __ variant */
+                dlm_print_one_lock_resource(res);
+        }
+        dlm_lockres_put(res);
+}
 /*
 * DLM_MIGRATE_LOCKRES
 */
-int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-                        u8 target)
+                               struct dlm_lock_resource *res,
+                               u8 target)
 {
        struct dlm_master_list_entry *mle = NULL;
        struct dlm_master_list_entry *oldmle = NULL;
@@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        struct list_head *queue, *iter;
        int i;
        struct dlm_lock *lock;
-        int empty = 1;
+        int empty = 1, wake = 0;
        if (!dlm_grab(dlm))
                return -EINVAL;
@@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                     res->lockname.name, target);
                spin_lock(&res->spinlock);
                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                wake = 1;
                spin_unlock(&res->spinlock);
                ret = -EINVAL;
        }
@@ -2268,6 +2597,9 @@ fail:
         * the lockres
         */
+        /* now that remote nodes are spinning on the MIGRATING flag,
+         * ensure that all assert_master work is flushed. */
+        flush_workqueue(dlm->dlm_worker);
        /* get an extra reference on the mle.
         * otherwise the assert_master from the new
@@ -2296,6 +2628,7 @@ fail:
                dlm_put_mle_inuse(mle);
                spin_lock(&res->spinlock);
                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                wake = 1;
                spin_unlock(&res->spinlock);
                goto leave;
        }
@@ -2322,7 +2655,8 @@ fail:
                            res->owner == target)
                                break;
-                        mlog(0, "timed out during migration\n");
+                        mlog(0, "%s:%.*s: timed out during migration\n",
+                             dlm->name, res->lockname.len, res->lockname.name);
                        /* avoid hang during shutdown when migrating lockres 
                         * to a node which also goes down */
                        if (dlm_is_node_dead(dlm, target)) {
@@ -2330,20 +2664,20 @@ fail:
                                     "target %u is no longer up, restarting\n",
                                     dlm->name, res->lockname.len,
                                     res->lockname.name, target);
-                                ret = -ERESTARTSYS;
+                                ret = -EINVAL;
+                                /* migration failed, detach and clean up mle */
+                                dlm_mle_detach_hb_events(dlm, mle);
+                                dlm_put_mle(mle);
+                                dlm_put_mle_inuse(mle);
+                                spin_lock(&res->spinlock);
+                                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                                wake = 1;
+                                spin_unlock(&res->spinlock);
+                                goto leave;
                        }
-                }
+                } else
-                if (ret == -ERESTARTSYS) {
+                        mlog(0, "%s:%.*s: caught signal during migration\n",
-                        /* migration failed, detach and clean up mle */
+                             dlm->name, res->lockname.len, res->lockname.name);
-                        dlm_mle_detach_hb_events(dlm, mle);
-                        dlm_put_mle(mle);
-                        dlm_put_mle_inuse(mle);
-                        spin_lock(&res->spinlock);
-                        res->state &= ~DLM_LOCK_RES_MIGRATING;
-                        spin_unlock(&res->spinlock);
-                        goto leave;
-                }
-                /* TODO: if node died: stop, clean up, return error */
        }
        /* all done, set the owner, clear the flag */
@@ -2366,6 +2700,11 @@ leave:
        if (ret < 0)
                dlm_kick_thread(dlm, res);
+        /* wake up waiters if the MIGRATING flag got set
+         * but migration failed */
+        if (wake)
+                wake_up(&res->wq);
        /* TODO: cleanup */
        if (mres)
                free_page((unsigned long)mres);
@@ -2376,6 +2715,53 @@ leave:
        return ret;
 }
+#define DLM_MIGRATION_RETRY_MS  100
+/*
+ * Empty out one lock resource while this node is leaving the domain.
+ *
+ * Only to be called once the domain leave process has begun.  At that
+ * point no locks should remain on lock resources mastered elsewhere, and
+ * no local locks should remain on resources mastered here.
+ *
+ * Locking: entered with dlm->spinlock held.  A resource owned by another
+ * node is only checked (and complained about if still in use).  A
+ * resource owned by this node is migrated away, which sleeps, so the
+ * spinlock is dropped around the migration and re-taken before return.
+ * Failed migrations are retried every DLM_MIGRATION_RETRY_MS; -ENOTEMPTY
+ * (local locks still present) is fatal.
+ *
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+        int status;
+        if (res->owner != dlm->node_num) {
+                /* not ours to migrate; just flag leftovers */
+                if (!__dlm_lockres_unused(res)) {
+                        mlog(ML_ERROR, "%s:%.*s: this node is not master, "
+                             "trying to free this but locks remain\n",
+                             dlm->name, res->lockname.len, res->lockname.name);
+                }
+                return 0;
+        }
+        /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
+        spin_unlock(&dlm->spinlock);
+        status = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
+        while (status < 0) {
+                if (status == -ENOTEMPTY) {
+                        mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+                                res->lockname.len, res->lockname.name);
+                        BUG();
+                }
+                mlog(0, "lockres %.*s: migrate failed, "
+                     "retrying\n", res->lockname.len,
+                     res->lockname.name);
+                msleep(DLM_MIGRATION_RETRY_MS);
+                status = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
+        }
+        spin_lock(&dlm->spinlock);
+        return 1;
+}
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
        int ret;
@@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
        return can_proceed;
 }
-int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
+                                struct dlm_lock_resource *res)
 {
        int ret;
        spin_lock(&res->spinlock);
@@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
        __dlm_lockres_reserve_ast(res);
        spin_unlock(&res->spinlock);
-        /* now flush all the pending asts.. hang out for a bit */
+        /* now flush all the pending asts */
        dlm_kick_thread(dlm, res);
+        /* before waiting on DIRTY, block processes which may
+         * try to dirty the lockres before MIGRATING is set */
+        spin_lock(&res->spinlock);
+        BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
+        res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
+        spin_unlock(&res->spinlock);
+        /* now wait on any pending asts and the DIRTY state */
        wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
        dlm_lockres_release_ast(dlm, res);
@@ -2461,6 +2855,13 @@ again:
                mlog(0, "trying again...\n");
                goto again;
        }
+        /* now that we are sure the MIGRATING state is there, drop
+         * the unneeded state which blocked threads trying to DIRTY */
+        spin_lock(&res->spinlock);
+        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+        BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+        res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+        spin_unlock(&res->spinlock);
        /* did the target go down or die? */
        spin_lock(&dlm->spinlock);
@@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 {
        struct list_head *iter, *iter2;
        struct list_head *queue = &res->granted;
-        int i;
+        int i, bit;
        struct dlm_lock *lock;
        assert_spin_locked(&res->spinlock);
@@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                                BUG_ON(!list_empty(&lock->bast_list));
                                BUG_ON(lock->ast_pending);
                                BUG_ON(lock->bast_pending);
+                                dlm_lockres_clear_refmap_bit(lock->ml.node, res);
                                list_del_init(&lock->list);
                                dlm_lock_put(lock);
                        }
                }
                queue++;
        }
+        bit = 0;
+        while (1) {
+                bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
+                if (bit >= O2NM_MAX_NODES)
+                        break;
+                /* do not clear the local node reference, if there is a
+                 * process holding this, let it drop the ref itself */
+                if (bit != dlm->node_num) {
+                        mlog(0, "%s:%.*s: node %u had a ref to this "
+                             "migrating lockres, clearing\n", dlm->name,
+                             res->lockname.len, res->lockname.name, bit);
+                        dlm_lockres_clear_refmap_bit(bit, res);
+                }
+                bit++;
+        }
 }
 /* for now this is not too intelligent.  we will
@@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                        mlog(0, "migrate request (node %u) returned %d!\n",
                             nodenum, status);
                        ret = status;
+                } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
+                        /* during the migration request we short-circuited
+                         * the mastery of the lockres.  make sure we have
+                         * a mastery ref for nodenum */
+                        mlog(0, "%s:%.*s: need ref for node %u\n",
+                             dlm->name, res->lockname.len, res->lockname.name,
+                             nodenum);
+                        spin_lock(&res->spinlock);
+                        dlm_lockres_set_refmap_bit(nodenum, res);
+                        spin_unlock(&res->spinlock);
                }
        }
@@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 * we will have no mle in the list to start with.  now we can add an mle for
 * the migration and this should be the only one found for those scanning the
 * list.  */
-int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+                                void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_lock_resource *res = NULL;
@@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                        /* remove it from the list so that only one
                         * mle will be found */
                        list_del_init(&tmp->list);
-                        __dlm_mle_detach_hb_events(dlm, mle);
+                        /* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+                        __dlm_mle_detach_hb_events(dlm, tmp);
+                        ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+                        mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+                            "telling master to get ref for cleared out mle "
+                            "during migration\n", dlm->name, namelen, name,
+                            master, new_master);
                }
                spin_unlock(&tmp->spinlock);
        }
@@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
        /* now add a migration mle to the tail of the list */
        dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
        mle->new_master = new_master;
+        /* the new master will be sending an assert master for this.
+         * at that point we will get the refmap reference */
        mle->master = master;
        /* do this for consistency with other mle types */
        set_bit(new_master, mle->maybe_map);
@@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        clear_bit(dlm->node_num, iter.node_map);
        spin_unlock(&dlm->spinlock);
+        /* ownership of the lockres is changing.  account for the
+         * mastery reference here since old_master will briefly have
+         * a reference after the migration completes */
+        spin_lock(&res->spinlock);
+        dlm_lockres_set_refmap_bit(old_master, res);
+        spin_unlock(&res->spinlock);
        mlog(0, "now time to do a migrate request to other nodes\n");
        ret = dlm_do_migrate_request(dlm, res, old_master,
                                     dlm->node_num, &iter);
@@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
             res->lockname.len, res->lockname.name);
        /* this call now finishes out the nodemap
         * even if one or more nodes die */
-        ret = dlm_do_assert_master(dlm, res->lockname.name,
+        ret = dlm_do_assert_master(dlm, res, iter.node_map,
-                                   res->lockname.len, iter.node_map,
                                   DLM_ASSERT_MASTER_FINISH_MIGRATION);
        if (ret < 0) {
                /* no longer need to retry.  all living nodes contacted. */
@@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
        set_bit(old_master, iter.node_map);
        mlog(0, "doing assert master of %.*s back to %u\n",
             res->lockname.len, res->lockname.name, old_master);
-        ret = dlm_do_assert_master(dlm, res->lockname.name,
+        ret = dlm_do_assert_master(dlm, res, iter.node_map,
-                                   res->lockname.len, iter.node_map,
                                   DLM_ASSERT_MASTER_FINISH_MIGRATION);
        if (ret < 0) {
                mlog(0, "assert master to original master failed "
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 367a11e9e2ed..6d4a83d50152 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -163,9 +163,6 @@ void dlm_dispatch_work(struct work_struct *work)
        dlm_workfunc_t *workfunc;
        int tot=0;
-        if (!dlm_joined(dlm))
-                return;
        spin_lock(&dlm->work_lock);
        list_splice_init(&dlm->work_list, &tmp_list);
        spin_unlock(&dlm->work_lock);
@@ -821,7 +818,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 }
-int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+                                  void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
@@ -978,7 +976,8 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 }
-int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
@@ -1129,6 +1128,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
        if (total_locks == mres_total_locks)
                mres->flags |= DLM_MRES_ALL_DONE;
+        mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
+             dlm->name, res->lockname.len, res->lockname.name,
+             orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+             send_to);
        /* send it */
        ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
                                 sz, send_to, &status);
@@ -1213,6 +1217,34 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
        return 0;
 }
+static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
+                               struct dlm_migratable_lockres *mres)
+{       /* add a placeholder lock to mres; the sender uses it when a lockres has no locks to send */
+        struct dlm_lock dummy;  /* stack temporary, only handed to dlm_add_lock_to_array() below */
+        memset(&dummy, 0, sizeof(dummy));
+        dummy.ml.cookie = 0;  /* already zero after the memset; spelled out as part of the dummy pattern */
+        dummy.ml.type = LKM_IVMODE;
+        dummy.ml.convert_type = LKM_IVMODE;
+        dummy.ml.highest_blocked = LKM_IVMODE;
+        dummy.lksb = NULL;  /* already zero after the memset */
+        dummy.ml.node = dlm->node_num;  /* dlm_is_dummy_lock() hands this node number back to the receiver */
+        dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);  /* presumably records the list number in ml.list, which dlm_is_dummy_lock() checks -- confirm */
+}
+static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
+                                    struct dlm_migratable_lock *ml,
+                                    u8 *nodenum)  /* out: ml->node, written only on a match */
+{       /* returns 1 if ml matches the dummy-lock pattern, 0 otherwise; dlm is not used here */
+        if (unlikely(ml->cookie == 0 &&
+            ml->type == LKM_IVMODE &&
+            ml->convert_type == LKM_IVMODE &&
+            ml->highest_blocked == LKM_IVMODE &&
+            ml->list == DLM_BLOCKED_LIST)) {  /* same field values dlm_add_dummy_lock() fills in */
+                *nodenum = ml->node;  /* tell the caller which node the placeholder names */
+                return 1;
+        }
+        return 0;  /* not the placeholder; *nodenum is left untouched */
+}
 int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                         struct dlm_migratable_lockres *mres,
@@ -1260,6 +1292,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                                goto error;
                }
        }
+        if (total_locks == 0) {
+                /* send a dummy lock to indicate a mastery reference only */
+                mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
+                     dlm->name, res->lockname.len, res->lockname.name,
+                     send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
+                     "migration");
+                dlm_add_dummy_lock(dlm, mres);
+        }
        /* flush any remaining locks */
        ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
        if (ret < 0)
@@ -1293,7 +1333,8 @@ error:
 * do we spin?  returning an error only delays the problem really
 */
-int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+                            void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_migratable_lockres *mres =
@@ -1382,17 +1423,21 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
                spin_lock(&res->spinlock);
                res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
                spin_unlock(&res->spinlock);
+                wake_up(&res->wq);
                /* add an extra ref for just-allocated lockres 
                 * otherwise the lockres will be purged immediately */
                dlm_lockres_get(res);
        }
        /* at this point we have allocated everything we need,
         * and we have a hashed lockres with an extra ref and
         * the proper res->state flags. */
        ret = 0;
+        spin_lock(&res->spinlock);
+        /* drop this either when master requery finds a different master
+         * or when a lock is added by the recovery worker */
+        dlm_lockres_grab_inflight_ref(dlm, res);
        if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
                /* migration cannot have an unknown master */
                BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
@@ -1400,10 +1445,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
                          "unknown owner.. will need to requery: "
                          "%.*s\n", mres->lockname_len, mres->lockname);
        } else {
-                spin_lock(&res->spinlock);
+                /* the inflight reference grabbed above pins the lockres;
+                 * it is dropped when locks are added in the worker */
                dlm_change_lockres_owner(dlm, res, dlm->node_num);
-                spin_unlock(&res->spinlock);
        }
+        spin_unlock(&res->spinlock);
        /* queue up work for dlm_mig_lockres_worker */
        dlm_grab(dlm);  /* get an extra ref for the work item */
@@ -1459,6 +1505,9 @@ again:
                                   "this node will take it.\n",
                                   res->lockname.len, res->lockname.name);
                } else {
+                        spin_lock(&res->spinlock);
+                        dlm_lockres_drop_inflight_ref(dlm, res);
+                        spin_unlock(&res->spinlock);
                        mlog(0, "master needs to respond to sender "
                                  "that node %u still owns %.*s\n",
                                  real_master, res->lockname.len,
@@ -1578,7 +1627,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 /* this function cannot error, so unless the sending
 * or receiving of the message failed, the owner can
 * be trusted */
-int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
@@ -1660,21 +1710,38 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 {
        struct dlm_migratable_lock *ml;
        struct list_head *queue;
+        struct list_head *tmpq = NULL;
        struct dlm_lock *newlock = NULL;
        struct dlm_lockstatus *lksb = NULL;
        int ret = 0;
-        int i, bad;
+        int i, j, bad;
        struct list_head *iter;
        struct dlm_lock *lock = NULL;
+        u8 from = O2NM_MAX_NODES;
+        unsigned int added = 0;
        mlog(0, "running %d locks for this lockres\n", mres->num_locks);
        for (i=0; i<mres->num_locks; i++) {
                ml = &(mres->ml[i]);
+                if (dlm_is_dummy_lock(dlm, ml, &from)) {
+                        /* placeholder, just need to set the refmap bit */
+                        BUG_ON(mres->num_locks != 1);
+                        mlog(0, "%s:%.*s: dummy lock for %u\n",
+                             dlm->name, mres->lockname_len, mres->lockname,
+                             from);
+                        spin_lock(&res->spinlock);
+                        dlm_lockres_set_refmap_bit(from, res);
+                        spin_unlock(&res->spinlock);
+                        added++;
+                        break;
+                }
                BUG_ON(ml->highest_blocked != LKM_IVMODE);
                newlock = NULL;
                lksb = NULL;
                queue = dlm_list_num_to_pointer(res, ml->list);
+                tmpq = NULL;
                /* if the lock is for the local node it needs to
                 * be moved to the proper location within the queue.
@@ -1684,11 +1751,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
                        spin_lock(&res->spinlock);
-                        list_for_each(iter, queue) {
+                        for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
-                                lock = list_entry (iter, struct dlm_lock, list);
+                                tmpq = dlm_list_idx_to_ptr(res, j);
-                                if (lock->ml.cookie != ml->cookie)
+                                list_for_each(iter, tmpq) {
-                                        lock = NULL;
+                                        lock = list_entry (iter, struct dlm_lock, list);
-                                else
+                                        if (lock->ml.cookie != ml->cookie)
+                                                lock = NULL;
+                                        else
+                                                break;
+                                }
+                                if (lock)
                                        break;
                        }
@@ -1698,12 +1770,20 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                                u64 c = ml->cookie;
                                mlog(ML_ERROR, "could not find local lock "
                                               "with cookie %u:%llu!\n",
-                                               dlm_get_lock_cookie_node(c),
+                                     dlm_get_lock_cookie_node(be64_to_cpu(c)),
-                                               dlm_get_lock_cookie_seq(c));
+                                     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
+                                __dlm_print_one_lock_resource(res);
                                BUG();
                        }
                        BUG_ON(lock->ml.node != ml->node);
+                        if (tmpq != queue) {
+                                mlog(0, "lock was on %u instead of %u for %.*s\n",
+                                     j, ml->list, res->lockname.len, res->lockname.name);
+                                spin_unlock(&res->spinlock);
+                                continue;
+                        }
                        /* see NOTE above about why we do not update
                         * to match the master here */
@@ -1711,6 +1791,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        /* do not alter lock refcount.  switching lists. */
                        list_move_tail(&lock->list, queue);
                        spin_unlock(&res->spinlock);
+                        added++;
                        mlog(0, "just reordered a local lock!\n");
                        continue;
@@ -1799,14 +1880,14 @@ skip_lvb:
                                mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
                                     "exists on this lockres!\n", dlm->name,
                                     res->lockname.len, res->lockname.name,
-                                     dlm_get_lock_cookie_node(c),
+                                     dlm_get_lock_cookie_node(be64_to_cpu(c)),
-                                     dlm_get_lock_cookie_seq(c));
+                                     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
                                mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
                                     "node=%u, cookie=%u:%llu, queue=%d\n",
                                     ml->type, ml->convert_type, ml->node,
-                                     dlm_get_lock_cookie_node(ml->cookie),
+                                     dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
-                                     dlm_get_lock_cookie_seq(ml->cookie),
+                                     dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
                                     ml->list);
                                __dlm_print_one_lock_resource(res);
@@ -1817,12 +1898,22 @@ skip_lvb:
                if (!bad) {
                        dlm_lock_get(newlock);
                        list_add_tail(&newlock->list, queue);
+                        mlog(0, "%s:%.*s: added lock for node %u, "
+                             "setting refmap bit\n", dlm->name,
+                             res->lockname.len, res->lockname.name, ml->node);
+                        dlm_lockres_set_refmap_bit(ml->node, res);
+                        added++;
                }
                spin_unlock(&res->spinlock);
        }
        mlog(0, "done running all the locks\n");
 leave:
+        /* balance the ref taken when the work was queued */
+        spin_lock(&res->spinlock);
+        dlm_lockres_drop_inflight_ref(dlm, res);
+        spin_unlock(&res->spinlock);
        if (ret < 0) {
                mlog_errno(ret);
                if (newlock)
@@ -1935,9 +2026,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                if (res->owner == dead_node) {
                        list_del_init(&res->recovering);
                        spin_lock(&res->spinlock);
+                        /* new_master has our reference from
+                         * the lock state sent during recovery */
                        dlm_change_lockres_owner(dlm, res, new_master);
                        res->state &= ~DLM_LOCK_RES_RECOVERING;
-                        if (!__dlm_lockres_unused(res))
+                        if (__dlm_lockres_has_locks(res))
                                __dlm_dirty_lockres(dlm, res);
                        spin_unlock(&res->spinlock);
                        wake_up(&res->wq);
@@ -1977,9 +2070,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                                        dlm_lockres_put(res);
                                }
                                spin_lock(&res->spinlock);
+                                /* new_master has our reference from
+                                 * the lock state sent during recovery */
                                dlm_change_lockres_owner(dlm, res, new_master);
                                res->state &= ~DLM_LOCK_RES_RECOVERING;
-                                if (!__dlm_lockres_unused(res))
+                                if (__dlm_lockres_has_locks(res))
                                        __dlm_dirty_lockres(dlm, res);
                                spin_unlock(&res->spinlock);
                                wake_up(&res->wq);
@@ -2048,6 +2143,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 {
        struct list_head *iter, *tmpiter;
        struct dlm_lock *lock;
+        unsigned int freed = 0;
        /* this node is the lockres master:
         * 1) remove any stale locks for the dead node
@@ -2062,6 +2158,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        freed++;
                }
        }
        list_for_each_safe(iter, tmpiter, &res->converting) {
@@ -2069,6 +2166,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        freed++;
                }
        }
        list_for_each_safe(iter, tmpiter, &res->blocked) {
@@ -2076,9 +2174,23 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        freed++;
                }
        }
+        if (freed) {
+                mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
+                     "dropping ref from lockres\n", dlm->name,
+                     res->lockname.len, res->lockname.name, freed, dead_node);
+                BUG_ON(!test_bit(dead_node, res->refmap));
+                dlm_lockres_clear_refmap_bit(dead_node, res);
+        } else if (test_bit(dead_node, res->refmap)) {
+                mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+                     "no locks and had not purged before dying\n", dlm->name,
+                     res->lockname.len, res->lockname.name, dead_node);
+                dlm_lockres_clear_refmap_bit(dead_node, res);
+        }
        /* do not kick thread yet */
        __dlm_dirty_lockres(dlm, res);
 }
@@ -2141,9 +2253,21 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                        spin_lock(&res->spinlock);
                        /* zero the lvb if necessary */
                        dlm_revalidate_lvb(dlm, res, dead_node);
-                        if (res->owner == dead_node)
+                        if (res->owner == dead_node) {
+                                if (res->state & DLM_LOCK_RES_DROPPING_REF)
+                                        mlog(0, "%s:%.*s: owned by "
+                                             "dead node %u, this node was "
+                                             "dropping its ref when it died. "
+                                             "continue, dropping the flag.\n",
+                                             dlm->name, res->lockname.len,
+                                             res->lockname.name, dead_node);
+                                /* the wake_up for this will happen when the
+                                 * RECOVERING flag is dropped later */
+                                res->state &= ~DLM_LOCK_RES_DROPPING_REF;
                                dlm_move_lockres_to_recovery_list(dlm, res);
-                        else if (res->owner == dlm->node_num) {
+                        } else if (res->owner == dlm->node_num) {
                                dlm_free_dead_locks(dlm, res, dead_node);
                                __dlm_lockres_calc_usage(dlm, res);
                        }
@@ -2480,7 +2604,8 @@ retry:
        return ret;
 }
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+                           void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
@@ -2608,7 +2733,8 @@ stage2:
        return ret;
 }
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+                              void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 0c822f3ffb05..8ffa0916eb86 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -54,9 +54,6 @@
 #include "cluster/masklog.h"
 static int dlm_thread(void *data);
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *lockres);
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
 #define dlm_lock_is_remote(dlm, lock)     ((lock)->ml.node != (dlm)->node_num)
@@ -82,14 +79,33 @@ repeat:
        current->state = TASK_RUNNING;
 }
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
-int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
        if (list_empty(&res->granted) &&
            list_empty(&res->converting) &&
-            list_empty(&res->blocked) &&
+            list_empty(&res->blocked))
-            list_empty(&res->dirty))
+                return 0;
-                return 1;
+        return 1;
+}
+/* "unused": the lockres has no locks, is not on the dirty list,
+ * has no inflight locks (in the gap between mastery and acquiring
+ * the first lock), and has no bits in its refmap.
+ * truly ready to be freed. */
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+        if (!__dlm_lockres_has_locks(res) &&
+            (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
+                /* try not to scan the bitmap unless the first two
+                 * conditions are already true */
+                int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+                if (bit >= O2NM_MAX_NODES) {
+                        /* since the bit for dlm->node_num is not
+                         * set, inflight_locks better be zero */
+                        BUG_ON(res->inflight_locks != 0);
+                        return 1;
+                }
+        }
        return 0;
 }
@@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
        assert_spin_locked(&res->spinlock);
        if (__dlm_lockres_unused(res)){
-                /* For now, just keep any resource we master */
-                if (res->owner == dlm->node_num)
-                {
-                        if (!list_empty(&res->purge)) {
-                                mlog(0, "we master %s:%.*s, but it is on "
-                                     "the purge list.  Removing\n",
-                                     dlm->name, res->lockname.len,
-                                     res->lockname.name);
-                                list_del_init(&res->purge);
-                                dlm->purge_count--;
-                        }
-                        return;
-                }
                if (list_empty(&res->purge)) {
-                        mlog(0, "putting lockres %.*s from purge list\n",
+                        mlog(0, "putting lockres %.*s:%p onto purge list\n",
-                             res->lockname.len, res->lockname.name);
+                             res->lockname.len, res->lockname.name, res);
                        res->last_used = jiffies;
+                        dlm_lockres_get(res);
                        list_add_tail(&res->purge, &dlm->purge_list);
                        dlm->purge_count++;
-                        /* if this node is not the owner, there is
-                         * no way to keep track of who the owner could be.
-                         * unhash it to avoid serious problems. */
-                        if (res->owner != dlm->node_num) {
-                                mlog(0, "%s:%.*s: doing immediate "
-                                     "purge of lockres owned by %u\n",
-                                     dlm->name, res->lockname.len,
-                                     res->lockname.name, res->owner);
-                                dlm_purge_lockres_now(dlm, res);
-                        }
                }
        } else if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s from purge list, "
+                mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
-                     "owner=%u\n", res->lockname.len, res->lockname.name,
+                     res->lockname.len, res->lockname.name, res, res->owner);
-                     res->owner);
                list_del_init(&res->purge);
+                dlm_lockres_put(res);
                dlm->purge_count--;
        }
 }
@@ -163,68 +154,65 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
        spin_unlock(&dlm->spinlock);
 }
-/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
+static int dlm_purge_lockres(struct dlm_ctxt *dlm,
- * to do migration, but will re-acquire before exit. */
+                             struct dlm_lock_resource *res)
-void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
 {
        int master;
-        int ret;
+        int ret = 0;
-        spin_lock(&lockres->spinlock);
-        master = lockres->owner == dlm->node_num;
-        spin_unlock(&lockres->spinlock);
-        mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
+        spin_lock(&res->spinlock);
-             lockres->lockname.name, master);
+        if (!__dlm_lockres_unused(res)) {
+                spin_unlock(&res->spinlock);
-        /* Non master is the easy case -- no migration required, just
+                mlog(0, "%s:%.*s: tried to purge but not unused\n",
-         * quit. */
+                     dlm->name, res->lockname.len, res->lockname.name);
+                return -ENOTEMPTY;
+        }
+        master = (res->owner == dlm->node_num);
        if (!master)
-                goto finish;
+                res->state |= DLM_LOCK_RES_DROPPING_REF;
+        spin_unlock(&res->spinlock);
-        /* Wheee! Migrate lockres here! */
-        spin_unlock(&dlm->spinlock);
-again:
-        ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+        mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
-        if (ret == -ENOTEMPTY) {
+             res->lockname.name, master);
-                mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
-                     lockres->lockname.len, lockres->lockname.name);
-                BUG();
+        if (!master) {
-        } else if (ret < 0) {
+                spin_lock(&res->spinlock);
-                mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+                /* This ensures that clear refmap is sent after the set */
-                     lockres->lockname.len, lockres->lockname.name);
+                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
-                msleep(100);
+                spin_unlock(&res->spinlock);
-                goto again;
+                /* drop spinlock to do messaging, retake below */
+                spin_unlock(&dlm->spinlock);
+                /* clear our bit from the master's refmap; only a host-down error is tolerated */
+                ret = dlm_drop_lockres_ref(dlm, res);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        if (!dlm_is_host_down(ret))
+                                BUG();
+                }
+                mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
+                     dlm->name, res->lockname.len, res->lockname.name, ret);
+                spin_lock(&dlm->spinlock);
        }
-        spin_lock(&dlm->spinlock);
+        if (!list_empty(&res->purge)) {
+                mlog(0, "removing lockres %.*s:%p from purgelist, "
-finish:
+                     "master = %d\n", res->lockname.len, res->lockname.name,
-        if (!list_empty(&lockres->purge)) {
+                     res, master);
-                list_del_init(&lockres->purge);
+                list_del_init(&res->purge);
+                dlm_lockres_put(res);
                dlm->purge_count--;
        }
-        __dlm_unhash_lockres(lockres);
+        __dlm_unhash_lockres(res);
-}
-/* make an unused lockres go away immediately.
- * as soon as the dlm spinlock is dropped, this lockres
- * will not be found. kfree still happens on last put. */
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *lockres)
-{
-        assert_spin_locked(&dlm->spinlock);
-        assert_spin_locked(&lockres->spinlock);
-        BUG_ON(!__dlm_lockres_unused(lockres));
+        /* lockres is not in the hash now.  drop the flag and wake up
+         * any processes waiting in dlm_get_lock_resource. */
-        if (!list_empty(&lockres->purge)) {
+        if (!master) {
-                list_del_init(&lockres->purge);
+                spin_lock(&res->spinlock);
-                dlm->purge_count--;
+                res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+                spin_unlock(&res->spinlock);
+                wake_up(&res->wq);
        }
-        __dlm_unhash_lockres(lockres);
+        return 0;
 }
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -268,13 +256,17 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                        break;
                }
+                mlog(0, "removing lockres %.*s:%p from purgelist\n",
+                     lockres->lockname.len, lockres->lockname.name, lockres);
                list_del_init(&lockres->purge);
+                dlm_lockres_put(lockres);
                dlm->purge_count--;
                /* This may drop and reacquire the dlm spinlock if it
                 * has to do migration. */
                mlog(0, "calling dlm_purge_lockres!\n");
-                dlm_purge_lockres(dlm, lockres);
+                if (dlm_purge_lockres(dlm, lockres))
+                        BUG();
                mlog(0, "DONE calling dlm_purge_lockres!\n");
                /* Avoid adding any scheduling latencies */
@@ -467,12 +459,17 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        assert_spin_locked(&res->spinlock);
        /* don't shuffle secondary queues */
-        if ((res->owner == dlm->node_num) &&
+        if ((res->owner == dlm->node_num)) {
-            !(res->state & DLM_LOCK_RES_DIRTY)) {
+                if (res->state & (DLM_LOCK_RES_MIGRATING |
-                /* ref for dirty_list */
+                                  DLM_LOCK_RES_BLOCK_DIRTY))
-                dlm_lockres_get(res);
+                    return;
-                list_add_tail(&res->dirty, &dlm->dirty_list);
-                res->state |= DLM_LOCK_RES_DIRTY;
+                if (list_empty(&res->dirty)) {
+                        /* ref for dirty_list */
+                        dlm_lockres_get(res);
+                        list_add_tail(&res->dirty, &dlm->dirty_list);
+                        res->state |= DLM_LOCK_RES_DIRTY;
+                }
        }
 }
@@ -651,7 +648,7 @@ static int dlm_thread(void *data)
                        dlm_lockres_get(res);
                        spin_lock(&res->spinlock);
-                        res->state &= ~DLM_LOCK_RES_DIRTY;
+                        /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
                        list_del_init(&res->dirty);
                        spin_unlock(&res->spinlock);
                        spin_unlock(&dlm->spinlock);
@@ -675,10 +672,11 @@ static int dlm_thread(void *data)
                        /* it is now ok to move lockreses in these states
                         * to the dirty list, assuming that they will only be
                         * dirty for a short while. */
+                        BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
                        if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
-                                          DLM_LOCK_RES_MIGRATING |
                                          DLM_LOCK_RES_RECOVERING)) {
                                /* move it to the tail and keep going */
+                                res->state &= ~DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);
                                mlog(0, "delaying list shuffling for in-"
                                     "progress lockres %.*s, state=%d\n",
@@ -699,6 +697,7 @@ static int dlm_thread(void *data)
                        /* called while holding lockres lock */
                        dlm_shuffle_lists(dlm, res);
+                        res->state &= ~DLM_LOCK_RES_DIRTY;
                        spin_unlock(&res->spinlock);
                        dlm_lockres_calc_usage(dlm, res);
@@ -709,11 +708,8 @@ in_progress:
                        /* if the lock was in-progress, stick
                         * it on the back of the list */
                        if (delay) {
-                                /* ref for dirty_list */
-                                dlm_lockres_get(res);
                                spin_lock(&res->spinlock);
-                                list_add_tail(&res->dirty, &dlm->dirty_list);
+                                __dlm_dirty_lockres(dlm, res);
-                                res->state |= DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);
                        }
                        dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 37be4b2e0d4a..86ca085ef324 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -147,6 +147,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
                goto leave;
        }
+        if (res->state & DLM_LOCK_RES_MIGRATING) {
+                status = DLM_MIGRATING;
+                goto leave;
+        }
        /* see above for what the spec says about
         * LKM_CANCEL and the lock queue state */
@@ -244,8 +248,8 @@ leave:
                /* this should always be coupled with list removal */
                BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
                mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
-                     dlm_get_lock_cookie_node(lock->ml.cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                     dlm_get_lock_cookie_seq(lock->ml.cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
                     atomic_read(&lock->lock_refs.refcount)-1);
                dlm_lock_put(lock);
        }
@@ -379,7 +383,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
 *          return value from dlmunlock_master
 */
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+                            void **ret_data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
@@ -502,8 +507,8 @@ not_found:
        if (!found)
                mlog(ML_ERROR, "failed to find lock to unlock! "
                               "cookie=%u:%llu\n",
-                               dlm_get_lock_cookie_node(unlock->cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
-                               dlm_get_lock_cookie_seq(unlock->cookie));
+                     dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
        else
                dlm_lock_put(lock);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index e1216364d191..d026b4f27757 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -306,8 +306,8 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + dir entry block */
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 * dir inode link */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 0afd8b9af70f..f30e63b9910c 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -887,7 +887,7 @@ static inline int ocfs2_translate_response(int response)
 static int ocfs2_handle_response_message(struct o2net_msg *msg,
                                         u32 len,
-                                         void *data)
+                                         void *data, void **ret_data)
 {
        unsigned int response_id, node_num;
        int response_status;
@@ -943,7 +943,7 @@ bail:
 static int ocfs2_handle_vote_message(struct o2net_msg *msg,
                                     u32 len,
-                                     void *data)
+                                     void *data, void **ret_data)
 {
        int status;
        struct ocfs2_super *osb = data;
@@ -1007,7 +1007,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
                                        osb->net_key,
                                        sizeof(struct ocfs2_response_msg),
                                        ocfs2_handle_response_message,
-                                        osb, &osb->osb_net_handlers);
+                                        osb, NULL, &osb->osb_net_handlers);
        if (status) {
                mlog_errno(status);
                goto bail;
@@ -1017,7 +1017,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
                                        osb->net_key,
                                        sizeof(struct ocfs2_vote_msg),
                                        ocfs2_handle_vote_message,
-                                        osb, &osb->osb_net_handlers);
+                                        osb, NULL, &osb->osb_net_handlers);
        if (status) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e8f540d38d48..d3b9f5f07db1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
@@ -146,7 +147,7 @@ static int open(struct inode * inode, struct file * file)
 Error:
        module_put(attr->attr.owner);
 Done:
-        if (error && kobj)
+        if (error)
                kobject_put(kobj);
        return error;
 }
@@ -157,8 +158,7 @@ static int release(struct inode * inode, struct file * file)
        struct bin_attribute * attr = to_bin_attr(file->f_path.dentry);
        u8 * buffer = file->private_data;
-        if (kobj) 
+        kobject_put(kobj);
-                kobject_put(kobj);
        module_put(attr->attr.owner);
        kfree(buffer);
        return 0;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 511edef8b321..9dcdf556c99c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
 DECLARE_RWSEM(sysfs_rename_sem);
@@ -32,8 +33,7 @@ static struct dentry_operations sysfs_dentry_ops = {
 /*
 * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent
 */
-static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
+static struct sysfs_dirent * __sysfs_new_dirent(void * element)
-                                                void * element)
 {
        struct sysfs_dirent * sd;
@@ -45,12 +45,28 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
        atomic_set(&sd->s_count, 1);
        atomic_set(&sd->s_event, 1);
        INIT_LIST_HEAD(&sd->s_children);
-        list_add(&sd->s_sibling, &parent_sd->s_children);
+        INIT_LIST_HEAD(&sd->s_sibling);
        sd->s_element = element;
        return sd;
 }
+/*
+ * Hooks @sd into @parent_sd's list of children.  A NULL @sd is silently
+ * ignored, so callers may pass a dirent lookup/creation result unchecked.
+ */
+static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd,
+                              struct sysfs_dirent *sd)
+{
+        if (!sd)
+                return;
+        list_add(&sd->s_sibling, &parent_sd->s_children);
+}
+/*
+ * Creates a dirent for @element with __sysfs_new_dirent() and links it
+ * under @parent_sd.  Returns whatever __sysfs_new_dirent() returned, which
+ * may be NULL.
+ */
+static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd,
+                                                void * element)
+{
+        struct sysfs_dirent *child = __sysfs_new_dirent(element);
+        /* the list helper copes with a NULL child by doing nothing */
+        __sysfs_list_dirent(parent_sd, child);
+        return child;
+}
 /*
 *
 * Return -EEXIST if there is already a sysfs element with the same name for
@@ -77,14 +93,14 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd,
 }
-int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry,
+static struct sysfs_dirent *
-                        void * element, umode_t mode, int type)
+__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type)
 {
        struct sysfs_dirent * sd;
-        sd = sysfs_new_dirent(parent_sd, element);
+        sd = __sysfs_new_dirent(element);
        if (!sd)
-                return -ENOMEM;
+                goto out;
        sd->s_mode = mode;
        sd->s_type = type;
@@ -94,7 +110,19 @@ int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry,
                dentry->d_op = &sysfs_dentry_ops;
        }
-        return 0;
+out:
+        return sd;
+}
+/*
+ * Builds a dirent for @dentry through __sysfs_make_dirent() and links it
+ * under @parent_sd.  Returns 0 on success or -ENOMEM when no dirent could
+ * be created.
+ */
+int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry,
+                        void * element, umode_t mode, int type)
+{
+        struct sysfs_dirent *new_sd;
+        new_sd = __sysfs_make_dirent(dentry, element, mode, type);
+        if (!new_sd)
+                return -ENOMEM;
+        __sysfs_list_dirent(parent_sd, new_sd);
+        return 0;
 }
 static int init_dir(struct inode * inode)
@@ -165,11 +193,11 @@ int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d)
 /**
 *      sysfs_create_dir - create a directory for an object.
- *      @parent:        parent parent object.
 *      @kobj:          object we're creating directory for. 
+ *      @shadow_parent: dentry to create the directory under instead of
+ *                      kobj->parent's dentry; may be NULL.
 */
-int sysfs_create_dir(struct kobject * kobj)
+int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent)
 {
        struct dentry * dentry = NULL;
        struct dentry * parent;
@@ -177,7 +205,9 @@ int sysfs_create_dir(struct kobject * kobj)
        BUG_ON(!kobj);
-        if (kobj->parent)
+        if (shadow_parent)
+                parent = shadow_parent;
+        else if (kobj->parent)
                parent = kobj->parent->dentry;
        else if (sysfs_mount && sysfs_mount->mnt_sb)
                parent = sysfs_mount->mnt_sb->s_root;
@@ -298,21 +328,12 @@ void sysfs_remove_subdir(struct dentry * d)
 }
-/**
+static void __sysfs_remove_dir(struct dentry *dentry)
- *      sysfs_remove_dir - remove an object's directory.
- *      @kobj:  object. 
- *
- *      The only thing special about this is that we remove any files in 
- *      the directory before we remove the directory, and we've inlined
- *      what used to be sysfs_rmdir() below, instead of calling separately.
- */
-void sysfs_remove_dir(struct kobject * kobj)
 {
-        struct dentry * dentry = dget(kobj->dentry);
        struct sysfs_dirent * parent_sd;
        struct sysfs_dirent * sd, * tmp;
+        dget(dentry);
        if (!dentry)
                return;
@@ -333,32 +354,60 @@ void sysfs_remove_dir(struct kobject * kobj)
         * Drop reference from dget() on entrance.
         */
        dput(dentry);
+}
+/**
+ *      sysfs_remove_dir - remove an object's directory.
+ *      @kobj:  object.
+ *
+ *      The only thing special about this is that we remove any files in
+ *      the directory before we remove the directory.  The removal itself
+ *      is delegated to __sysfs_remove_dir(), which is handed @kobj's
+ *      dentry; once it returns, kobj->dentry is reset to NULL.
+ */
+void sysfs_remove_dir(struct kobject * kobj)
+{
+        __sysfs_remove_dir(kobj->dentry);
         kobj->dentry = NULL;
 }
+/**
+ *      sysfs_rename_dir - rename an object's directory.
+ *      @kobj:          object whose directory is being renamed.
+ *      @new_parent:    dentry of the directory the new name is looked up in.
+ *      @new_name:      new name for the directory.
+ *
+ *      Runs with sysfs_rename_sem held for writing and with
+ *      @new_parent's inode i_mutex held; every exit after those are taken
+ *      must go through the common unlock path at the end of the function.
+ *
+ *      Returns 0 on success, -EFAULT if @new_parent is NULL, -EINVAL if
+ *      @kobj's directory does not live under @new_parent's inode, if the
+ *      looked-up dentry does not, or if @new_name resolves to @kobj's own
+ *      dentry, or the error from kobject_set_name().
+ */
-int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
+int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
+                     const char *new_name)
 {
        int error = 0;
-        struct dentry * new_dentry, * parent;
+        struct dentry * new_dentry;
-        if (!strcmp(kobject_name(kobj), new_name))
-                return -EINVAL;
-        if (!kobj->parent)
+        if (!new_parent)
-                return -EINVAL;
+                return -EFAULT;
        down_write(&sysfs_rename_sem);
-        parent = kobj->parent->dentry;
+        mutex_lock(&new_parent->d_inode->i_mutex);
-        mutex_lock(&parent->d_inode->i_mutex);
-        new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+        new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name));
        if (!IS_ERR(new_dentry)) {
-                if (!new_dentry->d_inode) {
+                /* By allowing two different directories with the
+                 * same d_parent we allow this routine to move
+                 * between different shadows of the same directory
+                 *
+                 * Every rejection below only records the error: a bare
+                 * return here would leave sysfs_rename_sem, the parent's
+                 * i_mutex and the new_dentry reference held.
+                 */
+                if (kobj->dentry->d_parent->d_inode != new_parent->d_inode)
+                        error = -EINVAL;
+                else if (new_dentry->d_parent->d_inode != new_parent->d_inode)
+                        error = -EINVAL;
+                else if (new_dentry == kobj->dentry)
+                        error = -EINVAL;
+                else if (!new_dentry->d_inode) {
                        error = kobject_set_name(kobj, "%s", new_name);
                        if (!error) {
+                                struct sysfs_dirent *sd, *parent_sd;
                                d_add(new_dentry, NULL);
                                d_move(kobj->dentry, new_dentry);
+                                /* re-home the dirent under the new parent */
+                                sd = kobj->dentry->d_fsdata;
+                                parent_sd = new_parent->d_fsdata;
+                                list_del_init(&sd->s_sibling);
+                                list_add(&sd->s_sibling, &parent_sd->s_children);
                        }
                        else
                                d_drop(new_dentry);
@@ -366,7 +415,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
                        error = -EEXIST;
                dput(new_dentry);
        }
-        mutex_unlock(&parent->d_inode->i_mutex);
+        mutex_unlock(&new_parent->d_inode->i_mutex);
        up_write(&sysfs_rename_sem);
        return error;
@@ -378,12 +427,10 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent)
        struct sysfs_dirent *new_parent_sd, *sd;
        int error;
-        if (!new_parent)
-                return -EINVAL;
        old_parent_dentry = kobj->parent ?
                kobj->parent->dentry : sysfs_mount->mnt_sb->s_root;
-        new_parent_dentry = new_parent->dentry;
+        new_parent_dentry = new_parent ?
+                new_parent->dentry : sysfs_mount->mnt_sb->s_root;
 again:
        mutex_lock(&old_parent_dentry->d_inode->i_mutex);
@@ -547,6 +594,95 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
        return offset;
 }
+/**
+ *      sysfs_make_shadowed_dir - Setup so a directory can be shadowed
+ *      @kobj:  object we're creating shadow of.
+ *      @follow_link:   method stored in ->follow_link of the directory
+ *                      inode's private copy of its inode operations.
+ *
+ *      Gives @kobj's directory inode a kmalloc'ed copy of
+ *      sysfs_dir_inode_operations whose ->follow_link is @follow_link.
+ *
+ *      Returns 0 on success, -EINVAL if the inode is not currently using
+ *      sysfs_dir_inode_operations, or -ENOMEM if the copy cannot be
+ *      allocated.
+ */
+int sysfs_make_shadowed_dir(struct kobject *kobj,
+        void * (*follow_link)(struct dentry *, struct nameidata *))
+{
+        struct inode *inode;
+        struct inode_operations *i_op;
+        inode = kobj->dentry->d_inode;
+        if (inode->i_op != &sysfs_dir_inode_operations)
+                return -EINVAL;
+        i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
+        if (!i_op)
+                return -ENOMEM;
+        memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op));
+        i_op->follow_link = follow_link;
+        /* Locking of inode->i_op?
+         * Since setting i_op is a single word write and they
+         * are atomic we should be ok here.
+         */
+        inode->i_op = i_op;
+        return 0;
+}
+/**
+ *      sysfs_create_shadow_dir - create a shadow directory for an object.
+ *      @kobj:  object we're creating directory for.
+ *
+ *      sysfs_make_shadowed_dir must already have been called on this
+ *      directory.
+ *
+ *      The shadow is a new dentry with the same name and parent as
+ *      @kobj's directory, instantiated on that directory's own inode.
+ *      Its sysfs_dirent comes from __sysfs_make_dirent(), so it is not
+ *      added to any parent's s_children list here.
+ *
+ *      Returns the shadow dentry, ERR_PTR(-EINVAL) if the directory's
+ *      inode is not a shadowed inode, or ERR_PTR(-ENOMEM) if the dentry
+ *      or its sysfs_dirent cannot be allocated.
+ */
+struct dentry *sysfs_create_shadow_dir(struct kobject *kobj)
+{
+        struct sysfs_dirent *sd;
+        struct dentry *parent, *dir, *shadow;
+        struct inode *inode;
+        dir = kobj->dentry;
+        inode = dir->d_inode;
+        parent = dir->d_parent;
+        shadow = ERR_PTR(-EINVAL);      /* result if the check below fails */
+        if (!sysfs_is_shadowed_inode(inode))
+                goto out;
+        shadow = d_alloc(parent, &dir->d_name);
+        if (!shadow)
+                goto nomem;
+        sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR);
+        if (!sd)
+                goto nomem;
+        d_instantiate(shadow, igrab(inode));
+        inc_nlink(inode);
+        inc_nlink(parent->d_inode);
+        shadow->d_op = &sysfs_dentry_ops;
+        dget(shadow);           /* Extra count - pin the dentry in core */
+out:
+        return shadow;
+nomem:
+        dput(shadow);   /* reached with shadow NULL or freshly allocated */
+        shadow = ERR_PTR(-ENOMEM);
+        goto out;
+}
+/**
+ *      sysfs_remove_shadow_dir - remove a shadow directory.
+ *      @shadow: dentry of shadow directory
+ *
+ *      Thin wrapper that hands @shadow to __sysfs_remove_dir(), the same
+ *      helper sysfs_remove_dir() uses for an object's own directory.
+ */
+void sysfs_remove_shadow_dir(struct dentry *shadow)
+{
+        __sysfs_remove_dir(shadow);
+}
 const struct file_operations sysfs_dir_operations = {
        .open           = sysfs_dir_open,
        .release        = sysfs_dir_close,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9cfe53e1e00d..c0e117649a4d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -7,6 +7,7 @@
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
+#include <linux/list.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
@@ -50,17 +51,29 @@ static struct sysfs_ops subsys_sysfs_ops = {
        .store  = subsys_attr_store,
 };
+/**
+ *      add_to_collection - add buffer to a collection
+ *      @buffer:        buffer to be added
+ *      @node:          inode of set to add to
+ *
+ *      The collection is taken from @node->i_private and must already
+ *      exist; the list insertion is done under @node->i_mutex.
+ */
-struct sysfs_buffer {
+static inline void
-        size_t                  count;
+add_to_collection(struct sysfs_buffer *buffer, struct inode *node)
-        loff_t                  pos;
+{
-        char                    * page;
+        struct sysfs_buffer_collection *set = node->i_private;
-        struct sysfs_ops        * ops;
-        struct semaphore        sem;
-        int                     needs_read_fill;
-        int                     event;
-};
+        mutex_lock(&node->i_mutex);
+        list_add(&buffer->associates, &set->associates);
+        mutex_unlock(&node->i_mutex);
+}
+/*
+ * Counterpart of add_to_collection(): unlinks @buffer from the list it is
+ * on while holding @node->i_mutex.
+ */
+static inline void
+remove_from_collection(struct sysfs_buffer *buffer, struct inode *node)
+{
+        struct mutex *lock = &node->i_mutex;
+        mutex_lock(lock);
+        list_del(&buffer->associates);
+        mutex_unlock(lock);
+}
 /**
 *      fill_read_buffer - allocate and fill buffer from object.
@@ -70,7 +83,8 @@ struct sysfs_buffer {
 *      Allocate @buffer->page, if it hasn't been already, then call the
 *      kobject's show() method to fill the buffer with this attribute's 
 *      data. 
- *      This is called only once, on the file's first read. 
+ *      This is called only once, on the file's first read unless an error
+ *      is returned.
 */
 static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
 {
@@ -88,12 +102,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
        buffer->event = atomic_read(&sd->s_event);
        count = ops->show(kobj,attr,buffer->page);
-        buffer->needs_read_fill = 0;
        BUG_ON(count > (ssize_t)PAGE_SIZE);
-        if (count >= 0)
+        if (count >= 0) {
+                buffer->needs_read_fill = 0;
                buffer->count = count;
-        else
+        } else {
                ret = count;
+        }
        return ret;
 }
@@ -153,6 +168,10 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        ssize_t retval = 0;
        down(&buffer->sem);
+        if (buffer->orphaned) {
+                retval = -ENODEV;
+                goto out;
+        }
        if (buffer->needs_read_fill) {
                if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
                        goto out;
@@ -165,7 +184,6 @@ out:
        return retval;
 }
 /**
 *      fill_write_buffer - copy buffer from userspace.
 *      @buffer:        data buffer for file.
@@ -243,19 +261,25 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
        ssize_t len;
        down(&buffer->sem);
+        if (buffer->orphaned) {
+                len = -ENODEV;
+                goto out;
+        }
        len = fill_write_buffer(buffer, buf, count);
        if (len > 0)
                len = flush_write_buffer(file->f_path.dentry, buffer, len);
        if (len > 0)
                *ppos += len;
+out:
        up(&buffer->sem);
        return len;
 }
-static int check_perm(struct inode * inode, struct file * file)
+static int sysfs_open_file(struct inode *inode, struct file *file)
 {
        struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent);
        struct attribute * attr = to_attr(file->f_path.dentry);
+        struct sysfs_buffer_collection *set;
        struct sysfs_buffer * buffer;
        struct sysfs_ops * ops = NULL;
        int error = 0;
@@ -285,6 +309,18 @@ static int check_perm(struct inode * inode, struct file * file)
        if (!ops)
                goto Eaccess;
+        /* make sure we have a collection to add our buffers to */
+        mutex_lock(&inode->i_mutex);
+        set = inode->i_private;
+        if (!set) {
+                set = kmalloc(sizeof(*set), GFP_KERNEL);
+                if (!set) {
+                        /* do not leave with i_mutex held: the Done path
+                         * does not unlock it */
+                        mutex_unlock(&inode->i_mutex);
+                        error = -ENOMEM;
+                        goto Done;
+                }
+                /* initialise the list before publishing the collection */
+                INIT_LIST_HEAD(&set->associates);
+                inode->i_private = set;
+        }
+        mutex_unlock(&inode->i_mutex);
        /* File needs write support.
         * The inode's perms must say it's ok, 
         * and we must have a store method.
@@ -310,9 +346,11 @@ static int check_perm(struct inode * inode, struct file * file)
         */
        buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
        if (buffer) {
+                INIT_LIST_HEAD(&buffer->associates);
                init_MUTEX(&buffer->sem);
                buffer->needs_read_fill = 1;
                buffer->ops = ops;
+                add_to_collection(buffer, inode);
                file->private_data = buffer;
        } else
                error = -ENOMEM;
@@ -325,16 +363,11 @@ static int check_perm(struct inode * inode, struct file * file)
        error = -EACCES;
        module_put(attr->owner);
 Done:
-        if (error && kobj)
+        if (error)
                kobject_put(kobj);
        return error;
 }
-static int sysfs_open_file(struct inode * inode, struct file * filp)
-{
-        return check_perm(inode,filp);
-}
 static int sysfs_release(struct inode * inode, struct file * filp)
 {
        struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent);
@@ -342,8 +375,9 @@ static int sysfs_release(struct inode * inode, struct file * filp)
        struct module * owner = attr->owner;
        struct sysfs_buffer * buffer = filp->private_data;
-        if (kobj) 
+        if (buffer)
-                kobject_put(kobj);
+                remove_from_collection(buffer, inode);
+        kobject_put(kobj);
        /* After this point, attr should not be accessed. */
        module_put(owner);
@@ -548,7 +582,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-        sysfs_hash_and_remove(kobj->dentry,attr->name);
+        sysfs_hash_and_remove(kobj->dentry, attr->name);
 }
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 122145b0895c..b20951c93761 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,6 +13,8 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/err.h>
+#include <linux/fs.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e79e38d52c00..542d2bcc73df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -13,6 +13,7 @@
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
 extern struct super_block * sysfs_sb;
@@ -32,6 +33,16 @@ static struct inode_operations sysfs_inode_operations ={
        .setattr        = sysfs_setattr,
 };
+/*
+ * ->drop_inode method for sysfs.  A shadowed directory inode owns a
+ * kmalloc'ed private copy of its inode operations (installed by
+ * sysfs_make_shadowed_dir()); release that copy, then let
+ * generic_delete_inode() dispose of the inode.
+ */
+void sysfs_delete_inode(struct inode *inode)
+{
+        /* Free the shadowed directory inode operations */
+        if (sysfs_is_shadowed_inode(inode)) {
+                kfree(inode->i_op);
+                inode->i_op = NULL;
+        }
+        /* plain call: a void function must not return an expression */
+        generic_delete_inode(inode);
+}
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
        struct inode * inode = dentry->d_inode;
@@ -209,6 +220,22 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
        return NULL;
 }
+/*
+ * Marks every sysfs_buffer attached to @node as orphaned, so that later
+ * reads and writes through those buffers fail with -ENODEV.
+ *
+ * The collection pointer is sampled only after i_mutex is taken:
+ * sysfs_open_file() installs node->i_private under that same mutex, so a
+ * value read before locking could be stale (NULL) by the time the list is
+ * walked.
+ */
+static inline void orphan_all_buffers(struct inode *node)
+{
+        struct sysfs_buffer_collection *set;
+        struct sysfs_buffer *buf;
+        mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD);
+        set = node->i_private;
+        if (set) {
+                list_for_each_entry(buf, &set->associates, associates) {
+                        /* buf->sem serialises against in-flight read/write */
+                        down(&buf->sem);
+                        buf->orphaned = 1;
+                        up(&buf->sem);
+                }
+        }
+        mutex_unlock(&node->i_mutex);
+}
 /*
 * Unhashes the dentry corresponding to given sysfs_dirent
@@ -217,16 +244,23 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
 void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent)
 {
        struct dentry * dentry = sd->s_dentry;
+        struct inode *inode;
        if (dentry) {
                spin_lock(&dcache_lock);
                spin_lock(&dentry->d_lock);
                if (!(d_unhashed(dentry) && dentry->d_inode)) {
+                        inode = dentry->d_inode;
+                        spin_lock(&inode->i_lock);
+                        __iget(inode);
+                        spin_unlock(&inode->i_lock);
                        dget_locked(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
                        spin_unlock(&dcache_lock);
                        simple_unlink(parent->d_inode, dentry);
+                        orphan_all_buffers(inode);
+                        iput(inode);
                } else {
                        spin_unlock(&dentry->d_lock);
                        spin_unlock(&dcache_lock);
@@ -248,7 +282,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name)
                return -ENOENT;
        parent_sd = dir->d_fsdata;
-        mutex_lock(&dir->d_inode->i_mutex);
+        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
                if (!sd->s_element)
                        continue;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index e503f858fba8..f6a87a824883 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -8,6 +8,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
@@ -18,9 +19,12 @@ struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
 struct kmem_cache *sysfs_dir_cachep;
+static void sysfs_clear_inode(struct inode *inode);
 static struct super_operations sysfs_ops = {
        .statfs         = simple_statfs,
-        .drop_inode     = generic_delete_inode,
+        .drop_inode     = sysfs_delete_inode,
+        .clear_inode    = sysfs_clear_inode,
 };
 static struct sysfs_dirent sysfs_root = {
@@ -31,6 +35,11 @@ static struct sysfs_dirent sysfs_root = {
        .s_iattr        = NULL,
 };
+/*
+ * ->clear_inode method: frees whatever hangs off inode->i_private
+ * (sysfs_open_file() stores a sysfs_buffer_collection there).
+ */
+static void sysfs_clear_inode(struct inode *inode)
+{
+        void *collection = inode->i_private;
+        kfree(collection);
+}
 static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct inode *inode;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f50e3cc2ded8..4869f611192f 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
+#include <asm/semaphore.h>
 #include "sysfs.h"
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index bd7cec295dab..fe1cbfd208ed 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -2,6 +2,7 @@
 extern struct vfsmount * sysfs_mount;
 extern struct kmem_cache *sysfs_dir_cachep;
+extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
@@ -33,6 +34,22 @@ struct sysfs_symlink {
        struct kobject * target_kobj;
 };
+struct sysfs_buffer {
+        struct list_head                associates;     /* link in the inode's collection */
+        size_t                          count;          /* length from last good show() */
+        loff_t                          pos;
+        char                            * page;         /* data page passed to show() */
+        struct sysfs_ops                * ops;          /* ops selected at open time */
+        struct semaphore                sem;            /* held across read and write */
+        int                             orphaned;       /* set on removal: I/O gets -ENODEV */
+        int                             needs_read_fill; /* 1 until show() has succeeded */
+        int                             event;          /* sd->s_event sampled at fill */
+};
+struct sysfs_buffer_collection {
+        struct list_head        associates;     /* head of the sysfs_buffer.associates list */
+};
 static inline struct kobject * to_kobj(struct dentry * dentry)
 {
        struct sysfs_dirent * sd = dentry->d_fsdata;
@@ -96,3 +113,7 @@ static inline void sysfs_put(struct sysfs_dirent * sd)
                release_sysfs_dirent(sd);
 }
+/*
+ * Nonzero when @inode is a directory whose inode operations provide a
+ * ->follow_link method, which is what sysfs_make_shadowed_dir() installs.
+ */
+static inline int sysfs_is_shadowed_inode(struct inode *inode)
+{
+        if (!S_ISDIR(inode->i_mode))
+                return 0;
+        return inode->i_op->follow_link != NULL;
+}