aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dlm
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
committerJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
commit8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
treea8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /fs/dlm
parent406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'fs/dlm')
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/ast.c7
-rw-r--r--fs/dlm/config.c206
-rw-r--r--fs/dlm/config.h20
-rw-r--r--fs/dlm/debug_fs.c126
-rw-r--r--fs/dlm/dir.c295
-rw-r--r--fs/dlm/dir.h7
-rw-r--r--fs/dlm/dlm_internal.h183
-rw-r--r--fs/dlm/lock.c1874
-rw-r--r--fs/dlm/lock.h13
-rw-r--r--fs/dlm/lockspace.c151
-rw-r--r--fs/dlm/lowcomms.c274
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c503
-rw-r--r--fs/dlm/member.h10
-rw-r--r--fs/dlm/memory.c8
-rw-r--r--fs/dlm/netlink.c8
-rw-r--r--fs/dlm/rcom.c267
-rw-r--r--fs/dlm/rcom.h3
-rw-r--r--fs/dlm/recover.c452
-rw-r--r--fs/dlm/recover.h2
-rw-r--r--fs/dlm/recoverd.c103
-rw-r--r--fs/dlm/recoverd.h1
-rw-r--r--fs/dlm/requestqueue.c43
-rw-r--r--fs/dlm/user.c12
26 files changed, 1287 insertions, 3287 deletions
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index e4242c3f848..1897eb1b4b6 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on INET 3 depends on EXPERIMENTAL && INET
4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select IP_SCTP 5 select IP_SCTP
6 help 6 help
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 27a6ba9aaee..90e5997262e 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -15,8 +15,8 @@
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17 17
18static uint64_t dlm_cb_seq; 18static uint64_t dlm_cb_seq;
19static DEFINE_SPINLOCK(dlm_cb_seq_spin); 19static spinlock_t dlm_cb_seq_spin;
20 20
21static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) 21static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
22{ 22{
@@ -310,7 +310,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
310 } 310 }
311 mutex_unlock(&ls->ls_cb_mutex); 311 mutex_unlock(&ls->ls_cb_mutex);
312 312
313 if (count) 313 log_debug(ls, "dlm_callback_resume %d", count);
314 log_debug(ls, "dlm_callback_resume %d", count);
315} 314}
316 315
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index a0387dd8b1f..6cf72fcc0d0 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <linux/in6.h> 19#include <linux/in6.h>
20#include <linux/dlmconstants.h>
21#include <net/ipv6.h> 20#include <net/ipv6.h>
22#include <net/sock.h> 21#include <net/sock.h>
23 22
@@ -37,7 +36,6 @@
37static struct config_group *space_list; 36static struct config_group *space_list;
38static struct config_group *comm_list; 37static struct config_group *comm_list;
39static struct dlm_comm *local_comm; 38static struct dlm_comm *local_comm;
40static uint32_t dlm_comm_count;
41 39
42struct dlm_clusters; 40struct dlm_clusters;
43struct dlm_cluster; 41struct dlm_cluster;
@@ -96,6 +94,7 @@ struct dlm_cluster {
96 unsigned int cl_tcp_port; 94 unsigned int cl_tcp_port;
97 unsigned int cl_buffer_size; 95 unsigned int cl_buffer_size;
98 unsigned int cl_rsbtbl_size; 96 unsigned int cl_rsbtbl_size;
97 unsigned int cl_dirtbl_size;
99 unsigned int cl_recover_timer; 98 unsigned int cl_recover_timer;
100 unsigned int cl_toss_secs; 99 unsigned int cl_toss_secs;
101 unsigned int cl_scan_secs; 100 unsigned int cl_scan_secs;
@@ -104,14 +103,13 @@ struct dlm_cluster {
104 unsigned int cl_timewarn_cs; 103 unsigned int cl_timewarn_cs;
105 unsigned int cl_waitwarn_us; 104 unsigned int cl_waitwarn_us;
106 unsigned int cl_new_rsb_count; 105 unsigned int cl_new_rsb_count;
107 unsigned int cl_recover_callbacks;
108 char cl_cluster_name[DLM_LOCKSPACE_LEN];
109}; 106};
110 107
111enum { 108enum {
112 CLUSTER_ATTR_TCP_PORT = 0, 109 CLUSTER_ATTR_TCP_PORT = 0,
113 CLUSTER_ATTR_BUFFER_SIZE, 110 CLUSTER_ATTR_BUFFER_SIZE,
114 CLUSTER_ATTR_RSBTBL_SIZE, 111 CLUSTER_ATTR_RSBTBL_SIZE,
112 CLUSTER_ATTR_DIRTBL_SIZE,
115 CLUSTER_ATTR_RECOVER_TIMER, 113 CLUSTER_ATTR_RECOVER_TIMER,
116 CLUSTER_ATTR_TOSS_SECS, 114 CLUSTER_ATTR_TOSS_SECS,
117 CLUSTER_ATTR_SCAN_SECS, 115 CLUSTER_ATTR_SCAN_SECS,
@@ -120,8 +118,6 @@ enum {
120 CLUSTER_ATTR_TIMEWARN_CS, 118 CLUSTER_ATTR_TIMEWARN_CS,
121 CLUSTER_ATTR_WAITWARN_US, 119 CLUSTER_ATTR_WAITWARN_US,
122 CLUSTER_ATTR_NEW_RSB_COUNT, 120 CLUSTER_ATTR_NEW_RSB_COUNT,
123 CLUSTER_ATTR_RECOVER_CALLBACKS,
124 CLUSTER_ATTR_CLUSTER_NAME,
125}; 121};
126 122
127struct cluster_attribute { 123struct cluster_attribute {
@@ -130,27 +126,6 @@ struct cluster_attribute {
130 ssize_t (*store)(struct dlm_cluster *, const char *, size_t); 126 ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
131}; 127};
132 128
133static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
134{
135 return sprintf(buf, "%s\n", cl->cl_cluster_name);
136}
137
138static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
139 const char *buf, size_t len)
140{
141 strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
142 strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
143 return len;
144}
145
146static struct cluster_attribute cluster_attr_cluster_name = {
147 .attr = { .ca_owner = THIS_MODULE,
148 .ca_name = "cluster_name",
149 .ca_mode = S_IRUGO | S_IWUSR },
150 .show = cluster_cluster_name_read,
151 .store = cluster_cluster_name_write,
152};
153
154static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, 129static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
155 int *info_field, int check_zero, 130 int *info_field, int check_zero,
156 const char *buf, size_t len) 131 const char *buf, size_t len)
@@ -187,6 +162,7 @@ __CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
187CLUSTER_ATTR(tcp_port, 1); 162CLUSTER_ATTR(tcp_port, 1);
188CLUSTER_ATTR(buffer_size, 1); 163CLUSTER_ATTR(buffer_size, 1);
189CLUSTER_ATTR(rsbtbl_size, 1); 164CLUSTER_ATTR(rsbtbl_size, 1);
165CLUSTER_ATTR(dirtbl_size, 1);
190CLUSTER_ATTR(recover_timer, 1); 166CLUSTER_ATTR(recover_timer, 1);
191CLUSTER_ATTR(toss_secs, 1); 167CLUSTER_ATTR(toss_secs, 1);
192CLUSTER_ATTR(scan_secs, 1); 168CLUSTER_ATTR(scan_secs, 1);
@@ -195,12 +171,12 @@ CLUSTER_ATTR(protocol, 0);
195CLUSTER_ATTR(timewarn_cs, 1); 171CLUSTER_ATTR(timewarn_cs, 1);
196CLUSTER_ATTR(waitwarn_us, 0); 172CLUSTER_ATTR(waitwarn_us, 0);
197CLUSTER_ATTR(new_rsb_count, 0); 173CLUSTER_ATTR(new_rsb_count, 0);
198CLUSTER_ATTR(recover_callbacks, 0);
199 174
200static struct configfs_attribute *cluster_attrs[] = { 175static struct configfs_attribute *cluster_attrs[] = {
201 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 176 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
202 [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, 177 [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
203 [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, 178 [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
179 [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr,
204 [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, 180 [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
205 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, 181 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
206 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, 182 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
@@ -209,8 +185,6 @@ static struct configfs_attribute *cluster_attrs[] = {
209 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, 185 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
210 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, 186 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
211 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, 187 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
212 [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
213 [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
214 NULL, 188 NULL,
215}; 189};
216 190
@@ -319,7 +293,6 @@ struct dlm_comms {
319 293
320struct dlm_comm { 294struct dlm_comm {
321 struct config_item item; 295 struct config_item item;
322 int seq;
323 int nodeid; 296 int nodeid;
324 int local; 297 int local;
325 int addr_count; 298 int addr_count;
@@ -336,7 +309,6 @@ struct dlm_node {
336 int nodeid; 309 int nodeid;
337 int weight; 310 int weight;
338 int new; 311 int new;
339 int comm_seq; /* copy of cm->seq when nd->nodeid is set */
340}; 312};
341 313
342static struct configfs_group_operations clusters_ops = { 314static struct configfs_group_operations clusters_ops = {
@@ -474,6 +446,7 @@ static struct config_group *make_cluster(struct config_group *g,
474 cl->cl_tcp_port = dlm_config.ci_tcp_port; 446 cl->cl_tcp_port = dlm_config.ci_tcp_port;
475 cl->cl_buffer_size = dlm_config.ci_buffer_size; 447 cl->cl_buffer_size = dlm_config.ci_buffer_size;
476 cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; 448 cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
449 cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size;
477 cl->cl_recover_timer = dlm_config.ci_recover_timer; 450 cl->cl_recover_timer = dlm_config.ci_recover_timer;
478 cl->cl_toss_secs = dlm_config.ci_toss_secs; 451 cl->cl_toss_secs = dlm_config.ci_toss_secs;
479 cl->cl_scan_secs = dlm_config.ci_scan_secs; 452 cl->cl_scan_secs = dlm_config.ci_scan_secs;
@@ -482,9 +455,6 @@ static struct config_group *make_cluster(struct config_group *g,
482 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; 455 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
483 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; 456 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
484 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; 457 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
485 cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
486 memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
487 DLM_LOCKSPACE_LEN);
488 458
489 space_list = &sps->ss_group; 459 space_list = &sps->ss_group;
490 comm_list = &cms->cs_group; 460 comm_list = &cms->cs_group;
@@ -588,11 +558,6 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
588 return ERR_PTR(-ENOMEM); 558 return ERR_PTR(-ENOMEM);
589 559
590 config_item_init_type_name(&cm->item, name, &comm_type); 560 config_item_init_type_name(&cm->item, name, &comm_type);
591
592 cm->seq = dlm_comm_count++;
593 if (!cm->seq)
594 cm->seq = dlm_comm_count++;
595
596 cm->nodeid = -1; 561 cm->nodeid = -1;
597 cm->local = 0; 562 cm->local = 0;
598 cm->addr_count = 0; 563 cm->addr_count = 0;
@@ -750,7 +715,6 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
750static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) 715static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
751{ 716{
752 struct sockaddr_storage *addr; 717 struct sockaddr_storage *addr;
753 int rv;
754 718
755 if (len != sizeof(struct sockaddr_storage)) 719 if (len != sizeof(struct sockaddr_storage))
756 return -EINVAL; 720 return -EINVAL;
@@ -763,13 +727,6 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
763 return -ENOMEM; 727 return -ENOMEM;
764 728
765 memcpy(addr, buf, len); 729 memcpy(addr, buf, len);
766
767 rv = dlm_lowcomms_addr(cm->nodeid, addr, len);
768 if (rv) {
769 kfree(addr);
770 return rv;
771 }
772
773 cm->addr[cm->addr_count++] = addr; 730 cm->addr[cm->addr_count++] = addr;
774 return len; 731 return len;
775} 732}
@@ -844,10 +801,7 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
844static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, 801static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
845 size_t len) 802 size_t len)
846{ 803{
847 uint32_t seq = 0;
848 nd->nodeid = simple_strtol(buf, NULL, 0); 804 nd->nodeid = simple_strtol(buf, NULL, 0);
849 dlm_comm_seq(nd->nodeid, &seq);
850 nd->comm_seq = seq;
851 return len; 805 return len;
852} 806}
853 807
@@ -886,7 +840,34 @@ static void put_space(struct dlm_space *sp)
886 config_item_put(&sp->group.cg_item); 840 config_item_put(&sp->group.cg_item);
887} 841}
888 842
889static struct dlm_comm *get_comm(int nodeid) 843static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
844{
845 switch (x->ss_family) {
846 case AF_INET: {
847 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
848 struct sockaddr_in *siny = (struct sockaddr_in *)y;
849 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
850 return 0;
851 if (sinx->sin_port != siny->sin_port)
852 return 0;
853 break;
854 }
855 case AF_INET6: {
856 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
857 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
858 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
859 return 0;
860 if (sinx->sin6_port != siny->sin6_port)
861 return 0;
862 break;
863 }
864 default:
865 return 0;
866 }
867 return 1;
868}
869
870static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
890{ 871{
891 struct config_item *i; 872 struct config_item *i;
892 struct dlm_comm *cm = NULL; 873 struct dlm_comm *cm = NULL;
@@ -900,11 +881,19 @@ static struct dlm_comm *get_comm(int nodeid)
900 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 881 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
901 cm = config_item_to_comm(i); 882 cm = config_item_to_comm(i);
902 883
903 if (cm->nodeid != nodeid) 884 if (nodeid) {
904 continue; 885 if (cm->nodeid != nodeid)
905 found = 1; 886 continue;
906 config_item_get(i); 887 found = 1;
907 break; 888 config_item_get(i);
889 break;
890 } else {
891 if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
892 continue;
893 found = 1;
894 config_item_get(i);
895 break;
896 }
908 } 897 }
909 mutex_unlock(&clusters_root.subsys.su_mutex); 898 mutex_unlock(&clusters_root.subsys.su_mutex);
910 899
@@ -919,13 +908,13 @@ static void put_comm(struct dlm_comm *cm)
919} 908}
920 909
921/* caller must free mem */ 910/* caller must free mem */
922int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, 911int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
923 int *count_out) 912 int **new_out, int *new_count_out)
924{ 913{
925 struct dlm_space *sp; 914 struct dlm_space *sp;
926 struct dlm_node *nd; 915 struct dlm_node *nd;
927 struct dlm_config_node *nodes, *node; 916 int i = 0, rv = 0, ids_count = 0, new_count = 0;
928 int rv, count; 917 int *ids, *new;
929 918
930 sp = get_space(lsname); 919 sp = get_space(lsname);
931 if (!sp) 920 if (!sp)
@@ -938,40 +927,93 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
938 goto out; 927 goto out;
939 } 928 }
940 929
941 count = sp->members_count; 930 ids_count = sp->members_count;
942 931
943 nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); 932 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
944 if (!nodes) { 933 if (!ids) {
945 rv = -ENOMEM; 934 rv = -ENOMEM;
946 goto out; 935 goto out;
947 } 936 }
948 937
949 node = nodes;
950 list_for_each_entry(nd, &sp->members, list) { 938 list_for_each_entry(nd, &sp->members, list) {
951 node->nodeid = nd->nodeid; 939 ids[i++] = nd->nodeid;
952 node->weight = nd->weight; 940 if (nd->new)
953 node->new = nd->new; 941 new_count++;
954 node->comm_seq = nd->comm_seq; 942 }
955 node++; 943
944 if (ids_count != i)
945 printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
946
947 if (!new_count)
948 goto out_ids;
956 949
957 nd->new = 0; 950 new = kcalloc(new_count, sizeof(int), GFP_NOFS);
951 if (!new) {
952 kfree(ids);
953 rv = -ENOMEM;
954 goto out;
955 }
956
957 i = 0;
958 list_for_each_entry(nd, &sp->members, list) {
959 if (nd->new) {
960 new[i++] = nd->nodeid;
961 nd->new = 0;
962 }
958 } 963 }
964 *new_count_out = new_count;
965 *new_out = new;
959 966
960 *count_out = count; 967 out_ids:
961 *nodes_out = nodes; 968 *ids_count_out = ids_count;
962 rv = 0; 969 *ids_out = ids;
963 out: 970 out:
964 mutex_unlock(&sp->members_lock); 971 mutex_unlock(&sp->members_lock);
965 put_space(sp); 972 put_space(sp);
966 return rv; 973 return rv;
967} 974}
968 975
969int dlm_comm_seq(int nodeid, uint32_t *seq) 976int dlm_node_weight(char *lsname, int nodeid)
977{
978 struct dlm_space *sp;
979 struct dlm_node *nd;
980 int w = -EEXIST;
981
982 sp = get_space(lsname);
983 if (!sp)
984 goto out;
985
986 mutex_lock(&sp->members_lock);
987 list_for_each_entry(nd, &sp->members, list) {
988 if (nd->nodeid != nodeid)
989 continue;
990 w = nd->weight;
991 break;
992 }
993 mutex_unlock(&sp->members_lock);
994 put_space(sp);
995 out:
996 return w;
997}
998
999int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
1000{
1001 struct dlm_comm *cm = get_comm(nodeid, NULL);
1002 if (!cm)
1003 return -EEXIST;
1004 if (!cm->addr_count)
1005 return -ENOENT;
1006 memcpy(addr, cm->addr[0], sizeof(*addr));
1007 put_comm(cm);
1008 return 0;
1009}
1010
1011int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
970{ 1012{
971 struct dlm_comm *cm = get_comm(nodeid); 1013 struct dlm_comm *cm = get_comm(0, addr);
972 if (!cm) 1014 if (!cm)
973 return -EEXIST; 1015 return -EEXIST;
974 *seq = cm->seq; 1016 *nodeid = cm->nodeid;
975 put_comm(cm); 1017 put_comm(cm);
976 return 0; 1018 return 0;
977} 1019}
@@ -996,6 +1038,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
996#define DEFAULT_TCP_PORT 21064 1038#define DEFAULT_TCP_PORT 21064
997#define DEFAULT_BUFFER_SIZE 4096 1039#define DEFAULT_BUFFER_SIZE 4096
998#define DEFAULT_RSBTBL_SIZE 1024 1040#define DEFAULT_RSBTBL_SIZE 1024
1041#define DEFAULT_DIRTBL_SIZE 1024
999#define DEFAULT_RECOVER_TIMER 5 1042#define DEFAULT_RECOVER_TIMER 5
1000#define DEFAULT_TOSS_SECS 10 1043#define DEFAULT_TOSS_SECS 10
1001#define DEFAULT_SCAN_SECS 5 1044#define DEFAULT_SCAN_SECS 5
@@ -1004,13 +1047,12 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
1004#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 1047#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
1005#define DEFAULT_WAITWARN_US 0 1048#define DEFAULT_WAITWARN_US 0
1006#define DEFAULT_NEW_RSB_COUNT 128 1049#define DEFAULT_NEW_RSB_COUNT 128
1007#define DEFAULT_RECOVER_CALLBACKS 0
1008#define DEFAULT_CLUSTER_NAME ""
1009 1050
1010struct dlm_config_info dlm_config = { 1051struct dlm_config_info dlm_config = {
1011 .ci_tcp_port = DEFAULT_TCP_PORT, 1052 .ci_tcp_port = DEFAULT_TCP_PORT,
1012 .ci_buffer_size = DEFAULT_BUFFER_SIZE, 1053 .ci_buffer_size = DEFAULT_BUFFER_SIZE,
1013 .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, 1054 .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
1055 .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE,
1014 .ci_recover_timer = DEFAULT_RECOVER_TIMER, 1056 .ci_recover_timer = DEFAULT_RECOVER_TIMER,
1015 .ci_toss_secs = DEFAULT_TOSS_SECS, 1057 .ci_toss_secs = DEFAULT_TOSS_SECS,
1016 .ci_scan_secs = DEFAULT_SCAN_SECS, 1058 .ci_scan_secs = DEFAULT_SCAN_SECS,
@@ -1018,8 +1060,6 @@ struct dlm_config_info dlm_config = {
1018 .ci_protocol = DEFAULT_PROTOCOL, 1060 .ci_protocol = DEFAULT_PROTOCOL,
1019 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, 1061 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
1020 .ci_waitwarn_us = DEFAULT_WAITWARN_US, 1062 .ci_waitwarn_us = DEFAULT_WAITWARN_US,
1021 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, 1063 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT
1022 .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
1023 .ci_cluster_name = DEFAULT_CLUSTER_NAME
1024}; 1064};
1025 1065
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f30697bc278..3099d0dd26c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,19 +14,13 @@
14#ifndef __CONFIG_DOT_H__ 14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__ 15#define __CONFIG_DOT_H__
16 16
17struct dlm_config_node {
18 int nodeid;
19 int weight;
20 int new;
21 uint32_t comm_seq;
22};
23
24#define DLM_MAX_ADDR_COUNT 3 17#define DLM_MAX_ADDR_COUNT 3
25 18
26struct dlm_config_info { 19struct dlm_config_info {
27 int ci_tcp_port; 20 int ci_tcp_port;
28 int ci_buffer_size; 21 int ci_buffer_size;
29 int ci_rsbtbl_size; 22 int ci_rsbtbl_size;
23 int ci_dirtbl_size;
30 int ci_recover_timer; 24 int ci_recover_timer;
31 int ci_toss_secs; 25 int ci_toss_secs;
32 int ci_scan_secs; 26 int ci_scan_secs;
@@ -35,17 +29,17 @@ struct dlm_config_info {
35 int ci_timewarn_cs; 29 int ci_timewarn_cs;
36 int ci_waitwarn_us; 30 int ci_waitwarn_us;
37 int ci_new_rsb_count; 31 int ci_new_rsb_count;
38 int ci_recover_callbacks;
39 char ci_cluster_name[DLM_LOCKSPACE_LEN];
40}; 32};
41 33
42extern struct dlm_config_info dlm_config; 34extern struct dlm_config_info dlm_config;
43 35
44int dlm_config_init(void); 36int dlm_config_init(void);
45void dlm_config_exit(void); 37void dlm_config_exit(void);
46int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, 38int dlm_node_weight(char *lsname, int nodeid);
47 int *count_out); 39int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
48int dlm_comm_seq(int nodeid, uint32_t *seq); 40 int **new_out, int *new_count_out);
41int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
42int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
49int dlm_our_nodeid(void); 43int dlm_our_nodeid(void);
50int dlm_our_addr(struct sockaddr_storage *addr, int num); 44int dlm_our_addr(struct sockaddr_storage *addr, int num);
51 45
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index b969deef9eb..59779237e2b 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -344,45 +344,6 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
344 return rv; 344 return rv;
345} 345}
346 346
347static int print_format4(struct dlm_rsb *r, struct seq_file *s)
348{
349 int our_nodeid = dlm_our_nodeid();
350 int print_name = 1;
351 int i, rv;
352
353 lock_rsb(r);
354
355 rv = seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ",
356 r,
357 r->res_nodeid,
358 r->res_master_nodeid,
359 r->res_dir_nodeid,
360 our_nodeid,
361 r->res_toss_time,
362 r->res_flags,
363 r->res_length);
364 if (rv)
365 goto out;
366
367 for (i = 0; i < r->res_length; i++) {
368 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
369 print_name = 0;
370 }
371
372 seq_printf(s, "%s", print_name ? "str " : "hex");
373
374 for (i = 0; i < r->res_length; i++) {
375 if (print_name)
376 seq_printf(s, "%c", r->res_name[i]);
377 else
378 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
379 }
380 rv = seq_printf(s, "\n");
381 out:
382 unlock_rsb(r);
383 return rv;
384}
385
386struct rsbtbl_iter { 347struct rsbtbl_iter {
387 struct dlm_rsb *rsb; 348 struct dlm_rsb *rsb;
388 unsigned bucket; 349 unsigned bucket;
@@ -421,13 +382,6 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
421 } 382 }
422 rv = print_format3(ri->rsb, seq); 383 rv = print_format3(ri->rsb, seq);
423 break; 384 break;
424 case 4:
425 if (ri->header) {
426 seq_printf(seq, "version 4 rsb 2\n");
427 ri->header = 0;
428 }
429 rv = print_format4(ri->rsb, seq);
430 break;
431 } 385 }
432 386
433 return rv; 387 return rv;
@@ -436,18 +390,14 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
436static const struct seq_operations format1_seq_ops; 390static const struct seq_operations format1_seq_ops;
437static const struct seq_operations format2_seq_ops; 391static const struct seq_operations format2_seq_ops;
438static const struct seq_operations format3_seq_ops; 392static const struct seq_operations format3_seq_ops;
439static const struct seq_operations format4_seq_ops;
440 393
441static void *table_seq_start(struct seq_file *seq, loff_t *pos) 394static void *table_seq_start(struct seq_file *seq, loff_t *pos)
442{ 395{
443 struct rb_root *tree;
444 struct rb_node *node;
445 struct dlm_ls *ls = seq->private; 396 struct dlm_ls *ls = seq->private;
446 struct rsbtbl_iter *ri; 397 struct rsbtbl_iter *ri;
447 struct dlm_rsb *r; 398 struct dlm_rsb *r;
448 loff_t n = *pos; 399 loff_t n = *pos;
449 unsigned bucket, entry; 400 unsigned bucket, entry;
450 int toss = (seq->op == &format4_seq_ops);
451 401
452 bucket = n >> 32; 402 bucket = n >> 32;
453 entry = n & ((1LL << 32) - 1); 403 entry = n & ((1LL << 32) - 1);
@@ -466,15 +416,11 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
466 ri->format = 2; 416 ri->format = 2;
467 if (seq->op == &format3_seq_ops) 417 if (seq->op == &format3_seq_ops)
468 ri->format = 3; 418 ri->format = 3;
469 if (seq->op == &format4_seq_ops)
470 ri->format = 4;
471
472 tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
473 419
474 spin_lock(&ls->ls_rsbtbl[bucket].lock); 420 spin_lock(&ls->ls_rsbtbl[bucket].lock);
475 if (!RB_EMPTY_ROOT(tree)) { 421 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
476 for (node = rb_first(tree); node; node = rb_next(node)) { 422 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
477 r = rb_entry(node, struct dlm_rsb, res_hashnode); 423 res_hashchain) {
478 if (!entry--) { 424 if (!entry--) {
479 dlm_hold_rsb(r); 425 dlm_hold_rsb(r);
480 ri->rsb = r; 426 ri->rsb = r;
@@ -501,12 +447,11 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
501 kfree(ri); 447 kfree(ri);
502 return NULL; 448 return NULL;
503 } 449 }
504 tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
505 450
506 spin_lock(&ls->ls_rsbtbl[bucket].lock); 451 spin_lock(&ls->ls_rsbtbl[bucket].lock);
507 if (!RB_EMPTY_ROOT(tree)) { 452 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
508 node = rb_first(tree); 453 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
509 r = rb_entry(node, struct dlm_rsb, res_hashnode); 454 struct dlm_rsb, res_hashchain);
510 dlm_hold_rsb(r); 455 dlm_hold_rsb(r);
511 ri->rsb = r; 456 ri->rsb = r;
512 ri->bucket = bucket; 457 ri->bucket = bucket;
@@ -522,12 +467,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
522{ 467{
523 struct dlm_ls *ls = seq->private; 468 struct dlm_ls *ls = seq->private;
524 struct rsbtbl_iter *ri = iter_ptr; 469 struct rsbtbl_iter *ri = iter_ptr;
525 struct rb_root *tree; 470 struct list_head *next;
526 struct rb_node *next;
527 struct dlm_rsb *r, *rp; 471 struct dlm_rsb *r, *rp;
528 loff_t n = *pos; 472 loff_t n = *pos;
529 unsigned bucket; 473 unsigned bucket;
530 int toss = (seq->op == &format4_seq_ops);
531 474
532 bucket = n >> 32; 475 bucket = n >> 32;
533 476
@@ -537,10 +480,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
537 480
538 spin_lock(&ls->ls_rsbtbl[bucket].lock); 481 spin_lock(&ls->ls_rsbtbl[bucket].lock);
539 rp = ri->rsb; 482 rp = ri->rsb;
540 next = rb_next(&rp->res_hashnode); 483 next = rp->res_hashchain.next;
541 484
542 if (next) { 485 if (next != &ls->ls_rsbtbl[bucket].list) {
543 r = rb_entry(next, struct dlm_rsb, res_hashnode); 486 r = list_entry(next, struct dlm_rsb, res_hashchain);
544 dlm_hold_rsb(r); 487 dlm_hold_rsb(r);
545 ri->rsb = r; 488 ri->rsb = r;
546 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 489 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
@@ -566,12 +509,11 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
566 kfree(ri); 509 kfree(ri);
567 return NULL; 510 return NULL;
568 } 511 }
569 tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
570 512
571 spin_lock(&ls->ls_rsbtbl[bucket].lock); 513 spin_lock(&ls->ls_rsbtbl[bucket].lock);
572 if (!RB_EMPTY_ROOT(tree)) { 514 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
573 next = rb_first(tree); 515 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
574 r = rb_entry(next, struct dlm_rsb, res_hashnode); 516 struct dlm_rsb, res_hashchain);
575 dlm_hold_rsb(r); 517 dlm_hold_rsb(r);
576 ri->rsb = r; 518 ri->rsb = r;
577 ri->bucket = bucket; 519 ri->bucket = bucket;
@@ -614,17 +556,9 @@ static const struct seq_operations format3_seq_ops = {
614 .show = table_seq_show, 556 .show = table_seq_show,
615}; 557};
616 558
617static const struct seq_operations format4_seq_ops = {
618 .start = table_seq_start,
619 .next = table_seq_next,
620 .stop = table_seq_stop,
621 .show = table_seq_show,
622};
623
624static const struct file_operations format1_fops; 559static const struct file_operations format1_fops;
625static const struct file_operations format2_fops; 560static const struct file_operations format2_fops;
626static const struct file_operations format3_fops; 561static const struct file_operations format3_fops;
627static const struct file_operations format4_fops;
628 562
629static int table_open(struct inode *inode, struct file *file) 563static int table_open(struct inode *inode, struct file *file)
630{ 564{
@@ -637,8 +571,6 @@ static int table_open(struct inode *inode, struct file *file)
637 ret = seq_open(file, &format2_seq_ops); 571 ret = seq_open(file, &format2_seq_ops);
638 else if (file->f_op == &format3_fops) 572 else if (file->f_op == &format3_fops)
639 ret = seq_open(file, &format3_seq_ops); 573 ret = seq_open(file, &format3_seq_ops);
640 else if (file->f_op == &format4_fops)
641 ret = seq_open(file, &format4_seq_ops);
642 574
643 if (ret) 575 if (ret)
644 return ret; 576 return ret;
@@ -672,17 +604,16 @@ static const struct file_operations format3_fops = {
672 .release = seq_release 604 .release = seq_release
673}; 605};
674 606
675static const struct file_operations format4_fops = {
676 .owner = THIS_MODULE,
677 .open = table_open,
678 .read = seq_read,
679 .llseek = seq_lseek,
680 .release = seq_release
681};
682
683/* 607/*
684 * dump lkb's on the ls_waiters list 608 * dump lkb's on the ls_waiters list
685 */ 609 */
610
611static int waiters_open(struct inode *inode, struct file *file)
612{
613 file->private_data = inode->i_private;
614 return 0;
615}
616
686static ssize_t waiters_read(struct file *file, char __user *userbuf, 617static ssize_t waiters_read(struct file *file, char __user *userbuf,
687 size_t count, loff_t *ppos) 618 size_t count, loff_t *ppos)
688{ 619{
@@ -711,7 +642,7 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
711 642
712static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
713 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
714 .open = simple_open, 645 .open = waiters_open,
715 .read = waiters_read, 646 .read = waiters_read,
716 .llseek = default_llseek, 647 .llseek = default_llseek,
717}; 648};
@@ -726,8 +657,6 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
726 debugfs_remove(ls->ls_debug_locks_dentry); 657 debugfs_remove(ls->ls_debug_locks_dentry);
727 if (ls->ls_debug_all_dentry) 658 if (ls->ls_debug_all_dentry)
728 debugfs_remove(ls->ls_debug_all_dentry); 659 debugfs_remove(ls->ls_debug_all_dentry);
729 if (ls->ls_debug_toss_dentry)
730 debugfs_remove(ls->ls_debug_toss_dentry);
731} 660}
732 661
733int dlm_create_debug_file(struct dlm_ls *ls) 662int dlm_create_debug_file(struct dlm_ls *ls)
@@ -770,19 +699,6 @@ int dlm_create_debug_file(struct dlm_ls *ls)
770 if (!ls->ls_debug_all_dentry) 699 if (!ls->ls_debug_all_dentry)
771 goto fail; 700 goto fail;
772 701
773 /* format 4 */
774
775 memset(name, 0, sizeof(name));
776 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name);
777
778 ls->ls_debug_toss_dentry = debugfs_create_file(name,
779 S_IFREG | S_IRUGO,
780 dlm_root,
781 ls,
782 &format4_fops);
783 if (!ls->ls_debug_toss_dentry)
784 goto fail;
785
786 memset(name, 0, sizeof(name)); 702 memset(name, 0, sizeof(name));
787 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); 703 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
788 704
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda44..7b84c1dbc82 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -23,6 +23,50 @@
23#include "lock.h" 23#include "lock.h"
24#include "dir.h" 24#include "dir.h"
25 25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 kfree(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
26/* 70/*
27 * We use the upper 16 bits of the hash value to select the directory node. 71 * We use the upper 16 bits of the hash value to select the directory node.
28 * Low bits are used for distribution of rsb's among hash buckets on each node. 72 * Low bits are used for distribution of rsb's among hash buckets on each node.
@@ -34,53 +78,144 @@
34 78
35int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) 79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
36{ 80{
37 uint32_t node; 81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
38 90
39 if (ls->ls_num_nodes == 1) 91 if (ls->ls_node_array) {
40 return dlm_our_nodeid();
41 else {
42 node = (hash >> 16) % ls->ls_total_weight; 92 node = (hash >> 16) % ls->ls_total_weight;
43 return ls->ls_node_array[node]; 93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
44 } 106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
45} 113}
46 114
47int dlm_dir_nodeid(struct dlm_rsb *r) 115int dlm_dir_nodeid(struct dlm_rsb *r)
48{ 116{
49 return r->res_dir_nodeid; 117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
50} 118}
51 119
52void dlm_recover_dir_nodeid(struct dlm_ls *ls) 120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
53{ 121{
54 struct dlm_rsb *r; 122 uint32_t val;
55 123
56 down_read(&ls->ls_root_sem); 124 val = jhash(name, len, 0);
57 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 125 val &= (ls->ls_dirtbl_size - 1);
58 r->res_dir_nodeid = dlm_hash2nodeid(ls, r->res_hash); 126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 spin_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 kfree(de);
175 out:
176 spin_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 spin_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 spin_unlock(&ls->ls_dirtbl[i].lock);
59 } 196 }
60 up_read(&ls->ls_root_sem);
61} 197}
62 198
63int dlm_recover_directory(struct dlm_ls *ls) 199int dlm_recover_directory(struct dlm_ls *ls)
64{ 200{
65 struct dlm_member *memb; 201 struct dlm_member *memb;
202 struct dlm_direntry *de;
66 char *b, *last_name = NULL; 203 char *b, *last_name = NULL;
67 int error = -ENOMEM, last_len, nodeid, result; 204 int error = -ENOMEM, last_len, count = 0;
68 uint16_t namelen; 205 uint16_t namelen;
69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
70 206
71 log_debug(ls, "dlm_recover_directory"); 207 log_debug(ls, "dlm_recover_directory");
72 208
73 if (dlm_no_directory(ls)) 209 if (dlm_no_directory(ls))
74 goto out_status; 210 goto out_status;
75 211
212 dlm_dir_clear(ls);
213
76 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); 214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
77 if (!last_name) 215 if (!last_name)
78 goto out; 216 goto out;
79 217
80 list_for_each_entry(memb, &ls->ls_nodes, list) { 218 list_for_each_entry(memb, &ls->ls_nodes, list) {
81 if (memb->nodeid == dlm_our_nodeid())
82 continue;
83
84 memset(last_name, 0, DLM_RESNAME_MAXLEN); 219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
85 last_len = 0; 220 last_len = 0;
86 221
@@ -95,7 +230,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
95 if (error) 230 if (error)
96 goto out_free; 231 goto out_free;
97 232
98 cond_resched(); 233 schedule();
99 234
100 /* 235 /*
101 * pick namelen/name pairs out of received buffer 236 * pick namelen/name pairs out of received buffer
@@ -132,96 +267,96 @@ int dlm_recover_directory(struct dlm_ls *ls)
132 if (namelen > DLM_RESNAME_MAXLEN) 267 if (namelen > DLM_RESNAME_MAXLEN)
133 goto out_free; 268 goto out_free;
134 269
135 error = dlm_master_lookup(ls, memb->nodeid, 270 error = -ENOMEM;
136 b, namelen, 271 de = get_free_de(ls, namelen);
137 DLM_LU_RECOVER_DIR, 272 if (!de)
138 &nodeid, &result);
139 if (error) {
140 log_error(ls, "recover_dir lookup %d",
141 error);
142 goto out_free; 273 goto out_free;
143 }
144
145 /* The name was found in rsbtbl, but the
146 * master nodeid is different from
147 * memb->nodeid which says it is the master.
148 * This should not happen. */
149
150 if (result == DLM_LU_MATCH &&
151 nodeid != memb->nodeid) {
152 count_bad++;
153 log_error(ls, "recover_dir lookup %d "
154 "nodeid %d memb %d bad %u",
155 result, nodeid, memb->nodeid,
156 count_bad);
157 print_hex_dump_bytes("dlm_recover_dir ",
158 DUMP_PREFIX_NONE,
159 b, namelen);
160 }
161
162 /* The name was found in rsbtbl, and the
163 * master nodeid matches memb->nodeid. */
164
165 if (result == DLM_LU_MATCH &&
166 nodeid == memb->nodeid) {
167 count_match++;
168 }
169
170 /* The name was not found in rsbtbl and was
171 * added with memb->nodeid as the master. */
172
173 if (result == DLM_LU_ADD) {
174 count_add++;
175 }
176 274
275 de->master_nodeid = memb->nodeid;
276 de->length = namelen;
177 last_len = namelen; 277 last_len = namelen;
278 memcpy(de->name, b, namelen);
178 memcpy(last_name, b, namelen); 279 memcpy(last_name, b, namelen);
179 b += namelen; 280 b += namelen;
180 left -= namelen; 281 left -= namelen;
282
283 add_entry_to_hash(ls, de);
181 count++; 284 count++;
182 } 285 }
183 } 286 }
184 done: 287 done:
185 ; 288 ;
186 } 289 }
187 290
188 out_status: 291 out_status:
189 error = 0; 292 error = 0;
190 dlm_set_recover_status(ls, DLM_RS_DIR); 293 dlm_set_recover_status(ls, DLM_RS_DIR);
191 294 log_debug(ls, "dlm_recover_directory %d entries", count);
192 log_debug(ls, "dlm_recover_directory %u in %u new",
193 count, count_add);
194 out_free: 295 out_free:
195 kfree(last_name); 296 kfree(last_name);
196 out: 297 out:
298 dlm_clear_free_entries(ls);
197 return error; 299 return error;
198} 300}
199 301
200static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) 302static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
303 int namelen, int *r_nodeid)
201{ 304{
202 struct dlm_rsb *r; 305 struct dlm_direntry *de, *tmp;
203 uint32_t hash, bucket; 306 uint32_t bucket;
204 int rv; 307
308 bucket = dir_hash(ls, name, namelen);
309
310 spin_lock(&ls->ls_dirtbl[bucket].lock);
311 de = search_bucket(ls, name, namelen, bucket);
312 if (de) {
313 *r_nodeid = de->master_nodeid;
314 spin_unlock(&ls->ls_dirtbl[bucket].lock);
315 if (*r_nodeid == nodeid)
316 return -EEXIST;
317 return 0;
318 }
319
320 spin_unlock(&ls->ls_dirtbl[bucket].lock);
321
322 if (namelen > DLM_RESNAME_MAXLEN)
323 return -EINVAL;
324
325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
326 if (!de)
327 return -ENOMEM;
205 328
206 hash = jhash(name, len, 0); 329 de->master_nodeid = nodeid;
207 bucket = hash & (ls->ls_rsbtbl_size - 1); 330 de->length = namelen;
331 memcpy(de->name, name, namelen);
208 332
209 spin_lock(&ls->ls_rsbtbl[bucket].lock); 333 spin_lock(&ls->ls_dirtbl[bucket].lock);
210 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r); 334 tmp = search_bucket(ls, name, namelen, bucket);
211 if (rv) 335 if (tmp) {
212 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss, 336 kfree(de);
213 name, len, &r); 337 de = tmp;
214 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 338 } else {
339 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
340 }
341 *r_nodeid = de->master_nodeid;
342 spin_unlock(&ls->ls_dirtbl[bucket].lock);
343 return 0;
344}
345
346int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
347 int *r_nodeid)
348{
349 return get_entry(ls, nodeid, name, namelen, r_nodeid);
350}
215 351
216 if (!rv) 352static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
217 return r; 353{
354 struct dlm_rsb *r;
218 355
219 down_read(&ls->ls_root_sem); 356 down_read(&ls->ls_root_sem);
220 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 357 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
221 if (len == r->res_length && !memcmp(name, r->res_name, len)) { 358 if (len == r->res_length && !memcmp(name, r->res_name, len)) {
222 up_read(&ls->ls_root_sem); 359 up_read(&ls->ls_root_sem);
223 log_debug(ls, "find_rsb_root revert to root_list %s",
224 r->res_name);
225 return r; 360 return r;
226 } 361 }
227 } 362 }
@@ -278,7 +413,6 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
278 be_namelen = cpu_to_be16(0); 413 be_namelen = cpu_to_be16(0);
279 memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); 414 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
280 offset += sizeof(__be16); 415 offset += sizeof(__be16);
281 ls->ls_recover_dir_sent_msg++;
282 goto out; 416 goto out;
283 } 417 }
284 418
@@ -287,7 +421,6 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
287 offset += sizeof(__be16); 421 offset += sizeof(__be16);
288 memcpy(outbuf + offset, r->res_name, r->res_length); 422 memcpy(outbuf + offset, r->res_name, r->res_length);
289 offset += r->res_length; 423 offset += r->res_length;
290 ls->ls_recover_dir_sent_res++;
291 } 424 }
292 425
293 /* 426 /*
@@ -300,8 +433,8 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
300 be_namelen = cpu_to_be16(0xFFFF); 433 be_namelen = cpu_to_be16(0xFFFF);
301 memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); 434 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
302 offset += sizeof(__be16); 435 offset += sizeof(__be16);
303 ls->ls_recover_dir_sent_msg++;
304 } 436 }
437
305 out: 438 out:
306 up_read(&ls->ls_root_sem); 439 up_read(&ls->ls_root_sem);
307} 440}
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
index 41750634445..0b0eb1267b6 100644
--- a/fs/dlm/dir.h
+++ b/fs/dlm/dir.h
@@ -14,10 +14,15 @@
14#ifndef __DIR_DOT_H__ 14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__ 15#define __DIR_DOT_H__
16 16
17
17int dlm_dir_nodeid(struct dlm_rsb *rsb); 18int dlm_dir_nodeid(struct dlm_rsb *rsb);
18int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); 19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
19void dlm_recover_dir_nodeid(struct dlm_ls *ls); 20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
20int dlm_recover_directory(struct dlm_ls *ls); 23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
21void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
22 char *outbuf, int outlen, int nodeid); 27 char *outbuf, int outlen, int nodeid);
23 28
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 77c0f70f8fe..fe2860c0244 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -38,7 +38,6 @@
38#include <linux/miscdevice.h> 38#include <linux/miscdevice.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/idr.h> 40#include <linux/idr.h>
41#include <linux/ratelimit.h>
42#include <asm/uaccess.h> 41#include <asm/uaccess.h>
43 42
44#include <linux/dlm.h> 43#include <linux/dlm.h>
@@ -55,6 +54,8 @@ struct dlm_lkb;
55struct dlm_rsb; 54struct dlm_rsb;
56struct dlm_member; 55struct dlm_member;
57struct dlm_rsbtable; 56struct dlm_rsbtable;
57struct dlm_dirtable;
58struct dlm_direntry;
58struct dlm_recover; 59struct dlm_recover;
59struct dlm_header; 60struct dlm_header;
60struct dlm_message; 61struct dlm_message;
@@ -73,13 +74,6 @@ do { \
73 (ls)->ls_name , ##args); \ 74 (ls)->ls_name , ##args); \
74} while (0) 75} while (0)
75 76
76#define log_limit(ls, fmt, args...) \
77do { \
78 if (dlm_config.ci_log_debug) \
79 printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \
80 (ls)->ls_name , ##args); \
81} while (0)
82
83#define DLM_ASSERT(x, do) \ 77#define DLM_ASSERT(x, do) \
84{ \ 78{ \
85 if (!(x)) \ 79 if (!(x)) \
@@ -96,9 +90,21 @@ do { \
96} 90}
97 91
98 92
93struct dlm_direntry {
94 struct list_head list;
95 uint32_t master_nodeid;
96 uint16_t length;
97 char name[1];
98};
99
100struct dlm_dirtable {
101 struct list_head list;
102 spinlock_t lock;
103};
104
99struct dlm_rsbtable { 105struct dlm_rsbtable {
100 struct rb_root keep; 106 struct list_head list;
101 struct rb_root toss; 107 struct list_head toss;
102 spinlock_t lock; 108 spinlock_t lock;
103}; 109};
104 110
@@ -111,10 +117,6 @@ struct dlm_member {
111 struct list_head list; 117 struct list_head list;
112 int nodeid; 118 int nodeid;
113 int weight; 119 int weight;
114 int slot;
115 int slot_prev;
116 int comm_seq;
117 uint32_t generation;
118}; 120};
119 121
120/* 122/*
@@ -123,8 +125,10 @@ struct dlm_member {
123 125
124struct dlm_recover { 126struct dlm_recover {
125 struct list_head list; 127 struct list_head list;
126 struct dlm_config_node *nodes; 128 int *nodeids; /* nodeids of all members */
127 int nodes_count; 129 int node_count;
130 int *new; /* nodeids of new members */
131 int new_count;
128 uint64_t seq; 132 uint64_t seq;
129}; 133};
130 134
@@ -257,8 +261,6 @@ struct dlm_lkb {
257 ktime_t lkb_last_cast_time; /* for debugging */ 261 ktime_t lkb_last_cast_time; /* for debugging */
258 ktime_t lkb_last_bast_time; /* for debugging */ 262 ktime_t lkb_last_bast_time; /* for debugging */
259 263
260 uint64_t lkb_recover_seq; /* from ls_recover_seq */
261
262 char *lkb_lvbptr; 264 char *lkb_lvbptr;
263 struct dlm_lksb *lkb_lksb; /* caller's status block */ 265 struct dlm_lksb *lkb_lksb; /* caller's status block */
264 void (*lkb_astfn) (void *astparam); 266 void (*lkb_astfn) (void *astparam);
@@ -269,15 +271,6 @@ struct dlm_lkb {
269 }; 271 };
270}; 272};
271 273
272/*
273 * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real
274 * nodeid, even when nodeid is our_nodeid.
275 *
276 * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid,
277 * greater than zero when another nodeid.
278 *
279 * (TODO: remove res_nodeid and only use res_master_nodeid)
280 */
281 274
282struct dlm_rsb { 275struct dlm_rsb {
283 struct dlm_ls *res_ls; /* the lockspace */ 276 struct dlm_ls *res_ls; /* the lockspace */
@@ -286,19 +279,13 @@ struct dlm_rsb {
286 unsigned long res_flags; 279 unsigned long res_flags;
287 int res_length; /* length of rsb name */ 280 int res_length; /* length of rsb name */
288 int res_nodeid; 281 int res_nodeid;
289 int res_master_nodeid;
290 int res_dir_nodeid;
291 int res_id; /* for ls_recover_idr */
292 uint32_t res_lvbseq; 282 uint32_t res_lvbseq;
293 uint32_t res_hash; 283 uint32_t res_hash;
294 uint32_t res_bucket; /* rsbtbl */ 284 uint32_t res_bucket; /* rsbtbl */
295 unsigned long res_toss_time; 285 unsigned long res_toss_time;
296 uint32_t res_first_lkid; 286 uint32_t res_first_lkid;
297 struct list_head res_lookup; /* lkbs waiting on first */ 287 struct list_head res_lookup; /* lkbs waiting on first */
298 union { 288 struct list_head res_hashchain; /* rsbtbl */
299 struct list_head res_hashchain;
300 struct rb_node res_hashnode; /* rsbtbl */
301 };
302 struct list_head res_grantqueue; 289 struct list_head res_grantqueue;
303 struct list_head res_convertqueue; 290 struct list_head res_convertqueue;
304 struct list_head res_waitqueue; 291 struct list_head res_waitqueue;
@@ -311,21 +298,10 @@ struct dlm_rsb {
311 char res_name[DLM_RESNAME_MAXLEN+1]; 298 char res_name[DLM_RESNAME_MAXLEN+1];
312}; 299};
313 300
314/* dlm_master_lookup() flags */
315
316#define DLM_LU_RECOVER_DIR 1
317#define DLM_LU_RECOVER_MASTER 2
318
319/* dlm_master_lookup() results */
320
321#define DLM_LU_MATCH 1
322#define DLM_LU_ADD 2
323
324/* find_rsb() flags */ 301/* find_rsb() flags */
325 302
326#define R_REQUEST 0x00000001 303#define R_MASTER 1 /* only return rsb if it's a master */
327#define R_RECEIVE_REQUEST 0x00000002 304#define R_CREATE 2 /* create/add rsb if not found */
328#define R_RECEIVE_RECOVER 0x00000004
329 305
330/* rsb_flags */ 306/* rsb_flags */
331 307
@@ -336,8 +312,7 @@ enum rsb_flags {
336 RSB_NEW_MASTER, 312 RSB_NEW_MASTER,
337 RSB_NEW_MASTER2, 313 RSB_NEW_MASTER2,
338 RSB_RECOVER_CONVERT, 314 RSB_RECOVER_CONVERT,
339 RSB_RECOVER_GRANT, 315 RSB_LOCKS_PURGED,
340 RSB_RECOVER_LVB_INVAL,
341}; 316};
342 317
343static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) 318static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
@@ -359,9 +334,7 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
359/* dlm_header is first element of all structs sent between nodes */ 334/* dlm_header is first element of all structs sent between nodes */
360 335
361#define DLM_HEADER_MAJOR 0x00030000 336#define DLM_HEADER_MAJOR 0x00030000
362#define DLM_HEADER_MINOR 0x00000001 337#define DLM_HEADER_MINOR 0x00000000
363
364#define DLM_HEADER_SLOTS 0x00000001
365 338
366#define DLM_MSG 1 339#define DLM_MSG 1
367#define DLM_RCOM 2 340#define DLM_RCOM 2
@@ -449,34 +422,10 @@ union dlm_packet {
449 struct dlm_rcom rcom; 422 struct dlm_rcom rcom;
450}; 423};
451 424
452#define DLM_RSF_NEED_SLOTS 0x00000001
453
454/* RCOM_STATUS data */
455struct rcom_status {
456 __le32 rs_flags;
457 __le32 rs_unused1;
458 __le64 rs_unused2;
459};
460
461/* RCOM_STATUS_REPLY data */
462struct rcom_config { 425struct rcom_config {
463 __le32 rf_lvblen; 426 __le32 rf_lvblen;
464 __le32 rf_lsflags; 427 __le32 rf_lsflags;
465 428 __le64 rf_unused;
466 /* DLM_HEADER_SLOTS adds: */
467 __le32 rf_flags;
468 __le16 rf_our_slot;
469 __le16 rf_num_slots;
470 __le32 rf_generation;
471 __le32 rf_unused1;
472 __le64 rf_unused2;
473};
474
475struct rcom_slot {
476 __le32 ro_nodeid;
477 __le16 ro_slot;
478 __le16 ro_unused1;
479 __le64 ro_unused2;
480}; 429};
481 430
482struct rcom_lock { 431struct rcom_lock {
@@ -499,18 +448,10 @@ struct rcom_lock {
499 char rl_lvb[0]; 448 char rl_lvb[0];
500}; 449};
501 450
502/*
503 * The max number of resources per rsbtbl bucket that shrink will attempt
504 * to remove in each iteration.
505 */
506
507#define DLM_REMOVE_NAMES_MAX 8
508
509struct dlm_ls { 451struct dlm_ls {
510 struct list_head ls_list; /* list of lockspaces */ 452 struct list_head ls_list; /* list of lockspaces */
511 dlm_lockspace_t *ls_local_handle; 453 dlm_lockspace_t *ls_local_handle;
512 uint32_t ls_global_id; /* global unique lockspace ID */ 454 uint32_t ls_global_id; /* global unique lockspace ID */
513 uint32_t ls_generation;
514 uint32_t ls_exflags; 455 uint32_t ls_exflags;
515 int ls_lvblen; 456 int ls_lvblen;
516 int ls_count; /* refcount of processes in 457 int ls_count; /* refcount of processes in
@@ -526,6 +467,9 @@ struct dlm_ls {
526 struct dlm_rsbtable *ls_rsbtbl; 467 struct dlm_rsbtable *ls_rsbtbl;
527 uint32_t ls_rsbtbl_size; 468 uint32_t ls_rsbtbl_size;
528 469
470 struct dlm_dirtable *ls_dirtbl;
471 uint32_t ls_dirtbl_size;
472
529 struct mutex ls_waiters_mutex; 473 struct mutex ls_waiters_mutex;
530 struct list_head ls_waiters; /* lkbs needing a reply */ 474 struct list_head ls_waiters; /* lkbs needing a reply */
531 475
@@ -539,12 +483,6 @@ struct dlm_ls {
539 int ls_new_rsb_count; 483 int ls_new_rsb_count;
540 struct list_head ls_new_rsb; /* new rsb structs */ 484 struct list_head ls_new_rsb; /* new rsb structs */
541 485
542 spinlock_t ls_remove_spin;
543 char ls_remove_name[DLM_RESNAME_MAXLEN+1];
544 char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
545 int ls_remove_len;
546 int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
547
548 struct list_head ls_nodes; /* current nodes in ls */ 486 struct list_head ls_nodes; /* current nodes in ls */
549 struct list_head ls_nodes_gone; /* dead node list, recovery */ 487 struct list_head ls_nodes_gone; /* dead node list, recovery */
550 int ls_num_nodes; /* number of nodes in ls */ 488 int ls_num_nodes; /* number of nodes in ls */
@@ -552,11 +490,6 @@ struct dlm_ls {
552 int ls_total_weight; 490 int ls_total_weight;
553 int *ls_node_array; 491 int *ls_node_array;
554 492
555 int ls_slot;
556 int ls_num_slots;
557 int ls_slots_size;
558 struct dlm_slot *ls_slots;
559
560 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 493 struct dlm_rsb ls_stub_rsb; /* for returning errors */
561 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 494 struct dlm_lkb ls_stub_lkb; /* for returning errors */
562 struct dlm_message ls_stub_ms; /* for faking a reply */ 495 struct dlm_message ls_stub_ms; /* for faking a reply */
@@ -565,7 +498,6 @@ struct dlm_ls {
565 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 498 struct dentry *ls_debug_waiters_dentry; /* debugfs */
566 struct dentry *ls_debug_locks_dentry; /* debugfs */ 499 struct dentry *ls_debug_locks_dentry; /* debugfs */
567 struct dentry *ls_debug_all_dentry; /* debugfs */ 500 struct dentry *ls_debug_all_dentry; /* debugfs */
568 struct dentry *ls_debug_toss_dentry; /* debugfs */
569 501
570 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 502 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
571 int ls_uevent_result; 503 int ls_uevent_result;
@@ -594,64 +526,29 @@ struct dlm_ls {
594 struct mutex ls_requestqueue_mutex; 526 struct mutex ls_requestqueue_mutex;
595 struct dlm_rcom *ls_recover_buf; 527 struct dlm_rcom *ls_recover_buf;
596 int ls_recover_nodeid; /* for debugging */ 528 int ls_recover_nodeid; /* for debugging */
597 unsigned int ls_recover_dir_sent_res; /* for log info */
598 unsigned int ls_recover_dir_sent_msg; /* for log info */
599 unsigned int ls_recover_locks_in; /* for log info */
600 uint64_t ls_rcom_seq; 529 uint64_t ls_rcom_seq;
601 spinlock_t ls_rcom_spin; 530 spinlock_t ls_rcom_spin;
602 struct list_head ls_recover_list; 531 struct list_head ls_recover_list;
603 spinlock_t ls_recover_list_lock; 532 spinlock_t ls_recover_list_lock;
604 int ls_recover_list_count; 533 int ls_recover_list_count;
605 struct idr ls_recover_idr;
606 spinlock_t ls_recover_idr_lock;
607 wait_queue_head_t ls_wait_general; 534 wait_queue_head_t ls_wait_general;
608 wait_queue_head_t ls_recover_lock_wait;
609 struct mutex ls_clear_proc_locks; 535 struct mutex ls_clear_proc_locks;
610 536
611 struct list_head ls_root_list; /* root resources */ 537 struct list_head ls_root_list; /* root resources */
612 struct rw_semaphore ls_root_sem; /* protect root_list */ 538 struct rw_semaphore ls_root_sem; /* protect root_list */
613 539
614 const struct dlm_lockspace_ops *ls_ops;
615 void *ls_ops_arg;
616
617 int ls_namelen; 540 int ls_namelen;
618 char ls_name[1]; 541 char ls_name[1];
619}; 542};
620 543
621/* 544#define LSFL_WORK 0
622 * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines 545#define LSFL_RUNNING 1
623 * that they should abort what they're doing so new recovery can be started. 546#define LSFL_RECOVERY_STOP 2
624 * 547#define LSFL_RCOM_READY 3
625 * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it 548#define LSFL_RCOM_WAIT 4
626 * should do down_write() on the in_recovery rw_semaphore. (doing down_write 549#define LSFL_UEVENT_WAIT 5
627 * within dlm_ls_stop causes complaints about the lock acquired/released 550#define LSFL_TIMEWARN 6
628 * in different contexts.) 551#define LSFL_CB_DELAY 7
629 *
630 * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore.
631 * It sets this after it is done with down_write() on the in_recovery
632 * rw_semaphore and clears it after it has released the rw_semaphore.
633 *
634 * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it
635 * should begin recovery of the lockspace.
636 *
637 * LSFL_RUNNING - set when normal locking activity is enabled.
638 * dlm_ls_stop() clears this to tell dlm locking routines that they should
639 * quit what they are doing so recovery can run. dlm_recoverd sets
640 * this after recovery is finished.
641 */
642
643#define LSFL_RECOVER_STOP 0
644#define LSFL_RECOVER_DOWN 1
645#define LSFL_RECOVER_LOCK 2
646#define LSFL_RECOVER_WORK 3
647#define LSFL_RUNNING 4
648
649#define LSFL_RCOM_READY 5
650#define LSFL_RCOM_WAIT 6
651#define LSFL_UEVENT_WAIT 7
652#define LSFL_TIMEWARN 8
653#define LSFL_CB_DELAY 9
654#define LSFL_NODIR 10
655 552
656/* much of this is just saving user space pointers associated with the 553/* much of this is just saving user space pointers associated with the
657 lock that we pass back to the user lib with an ast */ 554 lock that we pass back to the user lib with an ast */
@@ -694,12 +591,12 @@ static inline int dlm_locking_stopped(struct dlm_ls *ls)
694 591
695static inline int dlm_recovery_stopped(struct dlm_ls *ls) 592static inline int dlm_recovery_stopped(struct dlm_ls *ls)
696{ 593{
697 return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); 594 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
698} 595}
699 596
700static inline int dlm_no_directory(struct dlm_ls *ls) 597static inline int dlm_no_directory(struct dlm_ls *ls)
701{ 598{
702 return test_bit(LSFL_NODIR, &ls->ls_flags); 599 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
703} 600}
704 601
705int dlm_netlink_init(void); 602int dlm_netlink_init(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index a579f30f237..83b5e32514e 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,7 +56,6 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/rbtree.h>
60#include <linux/slab.h> 59#include <linux/slab.h>
61#include "dlm_internal.h" 60#include "dlm_internal.h"
62#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
@@ -90,7 +89,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
90static int receive_extralen(struct dlm_message *ms); 89static int receive_extralen(struct dlm_message *ms);
91static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 90static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
92static void del_timeout(struct dlm_lkb *lkb); 91static void del_timeout(struct dlm_lkb *lkb);
93static void toss_rsb(struct kref *kref);
94 92
95/* 93/*
96 * Lock compatibilty matrix - thanks Steve 94 * Lock compatibilty matrix - thanks Steve
@@ -161,21 +159,18 @@ static const int __quecvt_compat_matrix[8][8] = {
161 159
162void dlm_print_lkb(struct dlm_lkb *lkb) 160void dlm_print_lkb(struct dlm_lkb *lkb)
163{ 161{
164 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " 162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
165 "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", 163 " status %d rqmode %d grmode %d wait_type %d\n",
166 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
167 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
168 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, 166 lkb->lkb_grmode, lkb->lkb_wait_type);
169 (unsigned long long)lkb->lkb_recover_seq);
170} 167}
171 168
172static void dlm_print_rsb(struct dlm_rsb *r) 169static void dlm_print_rsb(struct dlm_rsb *r)
173{ 170{
174 printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x " 171 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
175 "rlc %d name %s\n", 172 r->res_nodeid, r->res_flags, r->res_first_lkid,
176 r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid, 173 r->res_recover_locks_count, r->res_name);
177 r->res_flags, r->res_first_lkid, r->res_recover_locks_count,
178 r->res_name);
179} 174}
180 175
181void dlm_dump_rsb(struct dlm_rsb *r) 176void dlm_dump_rsb(struct dlm_rsb *r)
@@ -255,6 +250,8 @@ static inline int is_process_copy(struct dlm_lkb *lkb)
255 250
256static inline int is_master_copy(struct dlm_lkb *lkb) 251static inline int is_master_copy(struct dlm_lkb *lkb)
257{ 252{
253 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
254 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
258 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; 255 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
259} 256}
260 257
@@ -330,37 +327,6 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
330 * Basic operations on rsb's and lkb's 327 * Basic operations on rsb's and lkb's
331 */ 328 */
332 329
333/* This is only called to add a reference when the code already holds
334 a valid reference to the rsb, so there's no need for locking. */
335
336static inline void hold_rsb(struct dlm_rsb *r)
337{
338 kref_get(&r->res_ref);
339}
340
341void dlm_hold_rsb(struct dlm_rsb *r)
342{
343 hold_rsb(r);
344}
345
346/* When all references to the rsb are gone it's transferred to
347 the tossed list for later disposal. */
348
349static void put_rsb(struct dlm_rsb *r)
350{
351 struct dlm_ls *ls = r->res_ls;
352 uint32_t bucket = r->res_bucket;
353
354 spin_lock(&ls->ls_rsbtbl[bucket].lock);
355 kref_put(&r->res_ref, toss_rsb);
356 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
357}
358
359void dlm_put_rsb(struct dlm_rsb *r)
360{
361 put_rsb(r);
362}
363
364static int pre_rsb_struct(struct dlm_ls *ls) 330static int pre_rsb_struct(struct dlm_ls *ls)
365{ 331{
366 struct dlm_rsb *r1, *r2; 332 struct dlm_rsb *r1, *r2;
@@ -414,8 +380,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
414 380
415 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); 381 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
416 list_del(&r->res_hashchain); 382 list_del(&r->res_hashchain);
417 /* Convert the empty list_head to a NULL rb_node for tree usage: */
418 memset(&r->res_hashnode, 0, sizeof(struct rb_node));
419 ls->ls_new_rsb_count--; 383 ls->ls_new_rsb_count--;
420 spin_unlock(&ls->ls_new_rsb_spin); 384 spin_unlock(&ls->ls_new_rsb_spin);
421 385
@@ -424,6 +388,7 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
424 memcpy(r->res_name, name, len); 388 memcpy(r->res_name, name, len);
425 mutex_init(&r->res_mutex); 389 mutex_init(&r->res_mutex);
426 390
391 INIT_LIST_HEAD(&r->res_hashchain);
427 INIT_LIST_HEAD(&r->res_lookup); 392 INIT_LIST_HEAD(&r->res_lookup);
428 INIT_LIST_HEAD(&r->res_grantqueue); 393 INIT_LIST_HEAD(&r->res_grantqueue);
429 INIT_LIST_HEAD(&r->res_convertqueue); 394 INIT_LIST_HEAD(&r->res_convertqueue);
@@ -435,67 +400,59 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
435 return 0; 400 return 0;
436} 401}
437 402
438static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) 403static int search_rsb_list(struct list_head *head, char *name, int len,
404 unsigned int flags, struct dlm_rsb **r_ret)
439{ 405{
440 char maxname[DLM_RESNAME_MAXLEN];
441
442 memset(maxname, 0, DLM_RESNAME_MAXLEN);
443 memcpy(maxname, name, nlen);
444 return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
445}
446
447int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
448 struct dlm_rsb **r_ret)
449{
450 struct rb_node *node = tree->rb_node;
451 struct dlm_rsb *r; 406 struct dlm_rsb *r;
452 int rc; 407 int error = 0;
453 408
454 while (node) { 409 list_for_each_entry(r, head, res_hashchain) {
455 r = rb_entry(node, struct dlm_rsb, res_hashnode); 410 if (len == r->res_length && !memcmp(name, r->res_name, len))
456 rc = rsb_cmp(r, name, len);
457 if (rc < 0)
458 node = node->rb_left;
459 else if (rc > 0)
460 node = node->rb_right;
461 else
462 goto found; 411 goto found;
463 } 412 }
464 *r_ret = NULL; 413 *r_ret = NULL;
465 return -EBADR; 414 return -EBADR;
466 415
467 found: 416 found:
417 if (r->res_nodeid && (flags & R_MASTER))
418 error = -ENOTBLK;
468 *r_ret = r; 419 *r_ret = r;
469 return 0; 420 return error;
470} 421}
471 422
472static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) 423static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
424 unsigned int flags, struct dlm_rsb **r_ret)
473{ 425{
474 struct rb_node **newn = &tree->rb_node; 426 struct dlm_rsb *r;
475 struct rb_node *parent = NULL; 427 int error;
476 int rc;
477
478 while (*newn) {
479 struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
480 res_hashnode);
481 428
482 parent = *newn; 429 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
483 rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); 430 if (!error) {
484 if (rc < 0) 431 kref_get(&r->res_ref);
485 newn = &parent->rb_left; 432 goto out;
486 else if (rc > 0)
487 newn = &parent->rb_right;
488 else {
489 log_print("rsb_insert match");
490 dlm_dump_rsb(rsb);
491 dlm_dump_rsb(cur);
492 return -EEXIST;
493 }
494 } 433 }
434 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
435 if (error)
436 goto out;
495 437
496 rb_link_node(&rsb->res_hashnode, parent, newn); 438 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
497 rb_insert_color(&rsb->res_hashnode, tree); 439
498 return 0; 440 if (dlm_no_directory(ls))
441 goto out;
442
443 if (r->res_nodeid == -1) {
444 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
445 r->res_first_lkid = 0;
446 } else if (r->res_nodeid > 0) {
447 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
448 r->res_first_lkid = 0;
449 } else {
450 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
451 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
452 }
453 out:
454 *r_ret = r;
455 return error;
499} 456}
500 457
501/* 458/*
@@ -510,632 +467,119 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
510 * Searching for an rsb means looking through both the normal list and toss 467 * Searching for an rsb means looking through both the normal list and toss
511 * list. When found on the toss list the rsb is moved to the normal list with 468 * list. When found on the toss list the rsb is moved to the normal list with
512 * ref count of 1; when found on normal list the ref count is incremented. 469 * ref count of 1; when found on normal list the ref count is incremented.
513 *
514 * rsb's on the keep list are being used locally and refcounted.
515 * rsb's on the toss list are not being used locally, and are not refcounted.
516 *
517 * The toss list rsb's were either
518 * - previously used locally but not any more (were on keep list, then
519 * moved to toss list when last refcount dropped)
520 * - created and put on toss list as a directory record for a lookup
521 * (we are the dir node for the res, but are not using the res right now,
522 * but some other node is)
523 *
524 * The purpose of find_rsb() is to return a refcounted rsb for local use.
525 * So, if the given rsb is on the toss list, it is moved to the keep list
526 * before being returned.
527 *
528 * toss_rsb() happens when all local usage of the rsb is done, i.e. no
529 * more refcounts exist, so the rsb is moved from the keep list to the
530 * toss list.
531 *
532 * rsb's on both keep and toss lists are used for doing a name to master
533 * lookups. rsb's that are in use locally (and being refcounted) are on
534 * the keep list, rsb's that are not in use locally (not refcounted) and
535 * only exist for name/master lookups are on the toss list.
536 *
537 * rsb's on the toss list who's dir_nodeid is not local can have stale
538 * name/master mappings. So, remote requests on such rsb's can potentially
539 * return with an error, which means the mapping is stale and needs to
540 * be updated with a new lookup. (The idea behind MASTER UNCERTAIN and
541 * first_lkid is to keep only a single outstanding request on an rsb
542 * while that rsb has a potentially stale master.)
543 */ 470 */
544 471
545static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, 472static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
546 uint32_t hash, uint32_t b, 473 unsigned int flags, struct dlm_rsb **r_ret)
547 int dir_nodeid, int from_nodeid,
548 unsigned int flags, struct dlm_rsb **r_ret)
549{ 474{
550 struct dlm_rsb *r = NULL; 475 struct dlm_rsb *r = NULL;
551 int our_nodeid = dlm_our_nodeid(); 476 uint32_t hash, bucket;
552 int from_local = 0;
553 int from_other = 0;
554 int from_dir = 0;
555 int create = 0;
556 int error; 477 int error;
557 478
558 if (flags & R_RECEIVE_REQUEST) { 479 if (namelen > DLM_RESNAME_MAXLEN) {
559 if (from_nodeid == dir_nodeid) 480 error = -EINVAL;
560 from_dir = 1; 481 goto out;
561 else
562 from_other = 1;
563 } else if (flags & R_REQUEST) {
564 from_local = 1;
565 } 482 }
566 483
567 /* 484 if (dlm_no_directory(ls))
568 * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so 485 flags |= R_CREATE;
569 * from_nodeid has sent us a lock in dlm_recover_locks, believing
570 * we're the new master. Our local recovery may not have set
571 * res_master_nodeid to our_nodeid yet, so allow either. Don't
572 * create the rsb; dlm_recover_process_copy() will handle EBADR
573 * by resending.
574 *
575 * If someone sends us a request, we are the dir node, and we do
576 * not find the rsb anywhere, then recreate it. This happens if
577 * someone sends us a request after we have removed/freed an rsb
578 * from our toss list. (They sent a request instead of lookup
579 * because they are using an rsb from their toss list.)
580 */
581 486
582 if (from_local || from_dir || 487 hash = jhash(name, namelen, 0);
583 (from_other && (dir_nodeid == our_nodeid))) { 488 bucket = hash & (ls->ls_rsbtbl_size - 1);
584 create = 1;
585 }
586 489
587 retry: 490 retry:
588 if (create) { 491 if (flags & R_CREATE) {
589 error = pre_rsb_struct(ls); 492 error = pre_rsb_struct(ls);
590 if (error < 0) 493 if (error < 0)
591 goto out; 494 goto out;
592 } 495 }
593 496
594 spin_lock(&ls->ls_rsbtbl[b].lock); 497 spin_lock(&ls->ls_rsbtbl[bucket].lock);
595
596 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
597 if (error)
598 goto do_toss;
599
600 /*
601 * rsb is active, so we can't check master_nodeid without lock_rsb.
602 */
603
604 kref_get(&r->res_ref);
605 error = 0;
606 goto out_unlock;
607
608
609 do_toss:
610 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
611 if (error)
612 goto do_new;
613
614 /*
615 * rsb found inactive (master_nodeid may be out of date unless
616 * we are the dir_nodeid or were the master) No other thread
617 * is using this rsb because it's on the toss list, so we can
618 * look at or update res_master_nodeid without lock_rsb.
619 */
620 498
621 if ((r->res_master_nodeid != our_nodeid) && from_other) { 499 error = _search_rsb(ls, name, namelen, bucket, flags, &r);
622 /* our rsb was not master, and another node (not the dir node) 500 if (!error)
623 has sent us a request */
624 log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
625 from_nodeid, r->res_master_nodeid, dir_nodeid,
626 r->res_name);
627 error = -ENOTBLK;
628 goto out_unlock; 501 goto out_unlock;
629 }
630
631 if ((r->res_master_nodeid != our_nodeid) && from_dir) {
632 /* don't think this should ever happen */
633 log_error(ls, "find_rsb toss from_dir %d master %d",
634 from_nodeid, r->res_master_nodeid);
635 dlm_print_rsb(r);
636 /* fix it and go on */
637 r->res_master_nodeid = our_nodeid;
638 r->res_nodeid = 0;
639 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
640 r->res_first_lkid = 0;
641 }
642
643 if (from_local && (r->res_master_nodeid != our_nodeid)) {
644 /* Because we have held no locks on this rsb,
645 res_master_nodeid could have become stale. */
646 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
647 r->res_first_lkid = 0;
648 }
649
650 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
651 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
652 goto out_unlock;
653 502
503 if (error == -EBADR && !(flags & R_CREATE))
504 goto out_unlock;
654 505
655 do_new: 506 /* the rsb was found but wasn't a master copy */
656 /* 507 if (error == -ENOTBLK)
657 * rsb not found
658 */
659
660 if (error == -EBADR && !create)
661 goto out_unlock; 508 goto out_unlock;
662 509
663 error = get_rsb_struct(ls, name, len, &r); 510 error = get_rsb_struct(ls, name, namelen, &r);
664 if (error == -EAGAIN) { 511 if (error == -EAGAIN) {
665 spin_unlock(&ls->ls_rsbtbl[b].lock); 512 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
666 goto retry; 513 goto retry;
667 } 514 }
668 if (error) 515 if (error)
669 goto out_unlock; 516 goto out_unlock;
670 517
671 r->res_hash = hash; 518 r->res_hash = hash;
672 r->res_bucket = b; 519 r->res_bucket = bucket;
673 r->res_dir_nodeid = dir_nodeid; 520 r->res_nodeid = -1;
674 kref_init(&r->res_ref); 521 kref_init(&r->res_ref);
675 522
676 if (from_dir) { 523 /* With no directory, the master can be set immediately */
677 /* want to see how often this happens */ 524 if (dlm_no_directory(ls)) {
678 log_debug(ls, "find_rsb new from_dir %d recreate %s", 525 int nodeid = dlm_dir_nodeid(r);
679 from_nodeid, r->res_name); 526 if (nodeid == dlm_our_nodeid())
680 r->res_master_nodeid = our_nodeid; 527 nodeid = 0;
681 r->res_nodeid = 0; 528 r->res_nodeid = nodeid;
682 goto out_add;
683 }
684
685 if (from_other && (dir_nodeid != our_nodeid)) {
686 /* should never happen */
687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
688 from_nodeid, dir_nodeid, our_nodeid, r->res_name);
689 dlm_free_rsb(r);
690 error = -ENOTBLK;
691 goto out_unlock;
692 }
693
694 if (from_other) {
695 log_debug(ls, "find_rsb new from_other %d dir %d %s",
696 from_nodeid, dir_nodeid, r->res_name);
697 }
698
699 if (dir_nodeid == our_nodeid) {
700 /* When we are the dir nodeid, we can set the master
701 node immediately */
702 r->res_master_nodeid = our_nodeid;
703 r->res_nodeid = 0;
704 } else {
705 /* set_master will send_lookup to dir_nodeid */
706 r->res_master_nodeid = 0;
707 r->res_nodeid = -1;
708 } 529 }
709 530 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
710 out_add: 531 error = 0;
711 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
712 out_unlock: 532 out_unlock:
713 spin_unlock(&ls->ls_rsbtbl[b].lock); 533 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
714 out: 534 out:
715 *r_ret = r; 535 *r_ret = r;
716 return error; 536 return error;
717} 537}
718 538
719/* During recovery, other nodes can send us new MSTCPY locks (from 539/* This is only called to add a reference when the code already holds
720 dlm_recover_locks) before we've made ourself master (in 540 a valid reference to the rsb, so there's no need for locking. */
721 dlm_recover_masters). */
722 541
723static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, 542static inline void hold_rsb(struct dlm_rsb *r)
724 uint32_t hash, uint32_t b,
725 int dir_nodeid, int from_nodeid,
726 unsigned int flags, struct dlm_rsb **r_ret)
727{ 543{
728 struct dlm_rsb *r = NULL;
729 int our_nodeid = dlm_our_nodeid();
730 int recover = (flags & R_RECEIVE_RECOVER);
731 int error;
732
733 retry:
734 error = pre_rsb_struct(ls);
735 if (error < 0)
736 goto out;
737
738 spin_lock(&ls->ls_rsbtbl[b].lock);
739
740 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
741 if (error)
742 goto do_toss;
743
744 /*
745 * rsb is active, so we can't check master_nodeid without lock_rsb.
746 */
747
748 kref_get(&r->res_ref); 544 kref_get(&r->res_ref);
749 goto out_unlock;
750
751
752 do_toss:
753 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
754 if (error)
755 goto do_new;
756
757 /*
758 * rsb found inactive. No other thread is using this rsb because
759 * it's on the toss list, so we can look at or update
760 * res_master_nodeid without lock_rsb.
761 */
762
763 if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
764 /* our rsb is not master, and another node has sent us a
765 request; this should never happen */
766 log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
767 from_nodeid, r->res_master_nodeid, dir_nodeid);
768 dlm_print_rsb(r);
769 error = -ENOTBLK;
770 goto out_unlock;
771 }
772
773 if (!recover && (r->res_master_nodeid != our_nodeid) &&
774 (dir_nodeid == our_nodeid)) {
775 /* our rsb is not master, and we are dir; may as well fix it;
776 this should never happen */
777 log_error(ls, "find_rsb toss our %d master %d dir %d",
778 our_nodeid, r->res_master_nodeid, dir_nodeid);
779 dlm_print_rsb(r);
780 r->res_master_nodeid = our_nodeid;
781 r->res_nodeid = 0;
782 }
783
784 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
785 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
786 goto out_unlock;
787
788
789 do_new:
790 /*
791 * rsb not found
792 */
793
794 error = get_rsb_struct(ls, name, len, &r);
795 if (error == -EAGAIN) {
796 spin_unlock(&ls->ls_rsbtbl[b].lock);
797 goto retry;
798 }
799 if (error)
800 goto out_unlock;
801
802 r->res_hash = hash;
803 r->res_bucket = b;
804 r->res_dir_nodeid = dir_nodeid;
805 r->res_master_nodeid = dir_nodeid;
806 r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
807 kref_init(&r->res_ref);
808
809 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
810 out_unlock:
811 spin_unlock(&ls->ls_rsbtbl[b].lock);
812 out:
813 *r_ret = r;
814 return error;
815} 545}
816 546
817static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid, 547void dlm_hold_rsb(struct dlm_rsb *r)
818 unsigned int flags, struct dlm_rsb **r_ret)
819{
820 uint32_t hash, b;
821 int dir_nodeid;
822
823 if (len > DLM_RESNAME_MAXLEN)
824 return -EINVAL;
825
826 hash = jhash(name, len, 0);
827 b = hash & (ls->ls_rsbtbl_size - 1);
828
829 dir_nodeid = dlm_hash2nodeid(ls, hash);
830
831 if (dlm_no_directory(ls))
832 return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
833 from_nodeid, flags, r_ret);
834 else
835 return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
836 from_nodeid, flags, r_ret);
837}
838
839/* we have received a request and found that res_master_nodeid != our_nodeid,
840 so we need to return an error or make ourself the master */
841
842static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
843 int from_nodeid)
844{ 548{
845 if (dlm_no_directory(ls)) { 549 hold_rsb(r);
846 log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
847 from_nodeid, r->res_master_nodeid,
848 r->res_dir_nodeid);
849 dlm_print_rsb(r);
850 return -ENOTBLK;
851 }
852
853 if (from_nodeid != r->res_dir_nodeid) {
854 /* our rsb is not master, and another node (not the dir node)
855 has sent us a request. this is much more common when our
856 master_nodeid is zero, so limit debug to non-zero. */
857
858 if (r->res_master_nodeid) {
859 log_debug(ls, "validate master from_other %d master %d "
860 "dir %d first %x %s", from_nodeid,
861 r->res_master_nodeid, r->res_dir_nodeid,
862 r->res_first_lkid, r->res_name);
863 }
864 return -ENOTBLK;
865 } else {
866 /* our rsb is not master, but the dir nodeid has sent us a
867 request; this could happen with master 0 / res_nodeid -1 */
868
869 if (r->res_master_nodeid) {
870 log_error(ls, "validate master from_dir %d master %d "
871 "first %x %s",
872 from_nodeid, r->res_master_nodeid,
873 r->res_first_lkid, r->res_name);
874 }
875
876 r->res_master_nodeid = dlm_our_nodeid();
877 r->res_nodeid = 0;
878 return 0;
879 }
880} 550}
881 551
882/* 552static void toss_rsb(struct kref *kref)
883 * We're the dir node for this res and another node wants to know the
884 * master nodeid. During normal operation (non recovery) this is only
885 * called from receive_lookup(); master lookups when the local node is
886 * the dir node are done by find_rsb().
887 *
888 * normal operation, we are the dir node for a resource
889 * . _request_lock
890 * . set_master
891 * . send_lookup
892 * . receive_lookup
893 * . dlm_master_lookup flags 0
894 *
895 * recover directory, we are rebuilding dir for all resources
896 * . dlm_recover_directory
897 * . dlm_rcom_names
898 * remote node sends back the rsb names it is master of and we are dir of
899 * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
900 * we either create new rsb setting remote node as master, or find existing
901 * rsb and set master to be the remote node.
902 *
903 * recover masters, we are finding the new master for resources
904 * . dlm_recover_masters
905 * . recover_master
906 * . dlm_send_rcom_lookup
907 * . receive_rcom_lookup
908 * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
909 */
910
911int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
912 unsigned int flags, int *r_nodeid, int *result)
913{ 553{
914 struct dlm_rsb *r = NULL; 554 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
915 uint32_t hash, b; 555 struct dlm_ls *ls = r->res_ls;
916 int from_master = (flags & DLM_LU_RECOVER_DIR);
917 int fix_master = (flags & DLM_LU_RECOVER_MASTER);
918 int our_nodeid = dlm_our_nodeid();
919 int dir_nodeid, error, toss_list = 0;
920
921 if (len > DLM_RESNAME_MAXLEN)
922 return -EINVAL;
923
924 if (from_nodeid == our_nodeid) {
925 log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
926 our_nodeid, flags);
927 return -EINVAL;
928 }
929
930 hash = jhash(name, len, 0);
931 b = hash & (ls->ls_rsbtbl_size - 1);
932
933 dir_nodeid = dlm_hash2nodeid(ls, hash);
934 if (dir_nodeid != our_nodeid) {
935 log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
936 from_nodeid, dir_nodeid, our_nodeid, hash,
937 ls->ls_num_nodes);
938 *r_nodeid = -1;
939 return -EINVAL;
940 }
941
942 retry:
943 error = pre_rsb_struct(ls);
944 if (error < 0)
945 return error;
946
947 spin_lock(&ls->ls_rsbtbl[b].lock);
948 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
949 if (!error) {
950 /* because the rsb is active, we need to lock_rsb before
951 checking/changing re_master_nodeid */
952
953 hold_rsb(r);
954 spin_unlock(&ls->ls_rsbtbl[b].lock);
955 lock_rsb(r);
956 goto found;
957 }
958
959 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
960 if (error)
961 goto not_found;
962
963 /* because the rsb is inactive (on toss list), it's not refcounted
964 and lock_rsb is not used, but is protected by the rsbtbl lock */
965
966 toss_list = 1;
967 found:
968 if (r->res_dir_nodeid != our_nodeid) {
969 /* should not happen, but may as well fix it and carry on */
970 log_error(ls, "dlm_master_lookup res_dir %d our %d %s",
971 r->res_dir_nodeid, our_nodeid, r->res_name);
972 r->res_dir_nodeid = our_nodeid;
973 }
974
975 if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
976 /* Recovery uses this function to set a new master when
977 the previous master failed. Setting NEW_MASTER will
978 force dlm_recover_masters to call recover_master on this
979 rsb even though the res_nodeid is no longer removed. */
980
981 r->res_master_nodeid = from_nodeid;
982 r->res_nodeid = from_nodeid;
983 rsb_set_flag(r, RSB_NEW_MASTER);
984
985 if (toss_list) {
986 /* I don't think we should ever find it on toss list. */
987 log_error(ls, "dlm_master_lookup fix_master on toss");
988 dlm_dump_rsb(r);
989 }
990 }
991
992 if (from_master && (r->res_master_nodeid != from_nodeid)) {
993 /* this will happen if from_nodeid became master during
994 a previous recovery cycle, and we aborted the previous
995 cycle before recovering this master value */
996
997 log_limit(ls, "dlm_master_lookup from_master %d "
998 "master_nodeid %d res_nodeid %d first %x %s",
999 from_nodeid, r->res_master_nodeid, r->res_nodeid,
1000 r->res_first_lkid, r->res_name);
1001
1002 if (r->res_master_nodeid == our_nodeid) {
1003 log_error(ls, "from_master %d our_master", from_nodeid);
1004 dlm_dump_rsb(r);
1005 dlm_send_rcom_lookup_dump(r, from_nodeid);
1006 goto out_found;
1007 }
1008
1009 r->res_master_nodeid = from_nodeid;
1010 r->res_nodeid = from_nodeid;
1011 rsb_set_flag(r, RSB_NEW_MASTER);
1012 }
1013
1014 if (!r->res_master_nodeid) {
1015 /* this will happen if recovery happens while we're looking
1016 up the master for this rsb */
1017
1018 log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s",
1019 from_nodeid, r->res_first_lkid, r->res_name);
1020 r->res_master_nodeid = from_nodeid;
1021 r->res_nodeid = from_nodeid;
1022 }
1023
1024 if (!from_master && !fix_master &&
1025 (r->res_master_nodeid == from_nodeid)) {
1026 /* this can happen when the master sends remove, the dir node
1027 finds the rsb on the keep list and ignores the remove,
1028 and the former master sends a lookup */
1029
1030 log_limit(ls, "dlm_master_lookup from master %d flags %x "
1031 "first %x %s", from_nodeid, flags,
1032 r->res_first_lkid, r->res_name);
1033 }
1034
1035 out_found:
1036 *r_nodeid = r->res_master_nodeid;
1037 if (result)
1038 *result = DLM_LU_MATCH;
1039
1040 if (toss_list) {
1041 r->res_toss_time = jiffies;
1042 /* the rsb was inactive (on toss list) */
1043 spin_unlock(&ls->ls_rsbtbl[b].lock);
1044 } else {
1045 /* the rsb was active */
1046 unlock_rsb(r);
1047 put_rsb(r);
1048 }
1049 return 0;
1050
1051 not_found:
1052 error = get_rsb_struct(ls, name, len, &r);
1053 if (error == -EAGAIN) {
1054 spin_unlock(&ls->ls_rsbtbl[b].lock);
1055 goto retry;
1056 }
1057 if (error)
1058 goto out_unlock;
1059 556
1060 r->res_hash = hash; 557 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
1061 r->res_bucket = b;
1062 r->res_dir_nodeid = our_nodeid;
1063 r->res_master_nodeid = from_nodeid;
1064 r->res_nodeid = from_nodeid;
1065 kref_init(&r->res_ref); 558 kref_init(&r->res_ref);
559 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
1066 r->res_toss_time = jiffies; 560 r->res_toss_time = jiffies;
1067 561 if (r->res_lvbptr) {
1068 error = rsb_insert(r, &ls->ls_rsbtbl[b].toss); 562 dlm_free_lvb(r->res_lvbptr);
1069 if (error) { 563 r->res_lvbptr = NULL;
1070 /* should never happen */
1071 dlm_free_rsb(r);
1072 spin_unlock(&ls->ls_rsbtbl[b].lock);
1073 goto retry;
1074 } 564 }
1075
1076 if (result)
1077 *result = DLM_LU_ADD;
1078 *r_nodeid = from_nodeid;
1079 error = 0;
1080 out_unlock:
1081 spin_unlock(&ls->ls_rsbtbl[b].lock);
1082 return error;
1083} 565}
1084 566
1085static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) 567/* When all references to the rsb are gone it's transferred to
1086{ 568 the tossed list for later disposal. */
1087 struct rb_node *n;
1088 struct dlm_rsb *r;
1089 int i;
1090
1091 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
1092 spin_lock(&ls->ls_rsbtbl[i].lock);
1093 for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
1094 r = rb_entry(n, struct dlm_rsb, res_hashnode);
1095 if (r->res_hash == hash)
1096 dlm_dump_rsb(r);
1097 }
1098 spin_unlock(&ls->ls_rsbtbl[i].lock);
1099 }
1100}
1101 569
1102void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) 570static void put_rsb(struct dlm_rsb *r)
1103{ 571{
1104 struct dlm_rsb *r = NULL; 572 struct dlm_ls *ls = r->res_ls;
1105 uint32_t hash, b; 573 uint32_t bucket = r->res_bucket;
1106 int error;
1107
1108 hash = jhash(name, len, 0);
1109 b = hash & (ls->ls_rsbtbl_size - 1);
1110
1111 spin_lock(&ls->ls_rsbtbl[b].lock);
1112 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
1113 if (!error)
1114 goto out_dump;
1115 574
1116 error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); 575 spin_lock(&ls->ls_rsbtbl[bucket].lock);
1117 if (error) 576 kref_put(&r->res_ref, toss_rsb);
1118 goto out; 577 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
1119 out_dump:
1120 dlm_dump_rsb(r);
1121 out:
1122 spin_unlock(&ls->ls_rsbtbl[b].lock);
1123} 578}
1124 579
1125static void toss_rsb(struct kref *kref) 580void dlm_put_rsb(struct dlm_rsb *r)
1126{ 581{
1127 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); 582 put_rsb(r);
1128 struct dlm_ls *ls = r->res_ls;
1129
1130 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
1131 kref_init(&r->res_ref);
1132 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
1133 rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
1134 r->res_toss_time = jiffies;
1135 if (r->res_lvbptr) {
1136 dlm_free_lvb(r->res_lvbptr);
1137 r->res_lvbptr = NULL;
1138 }
1139} 583}
1140 584
1141/* See comment for unhold_lkb */ 585/* See comment for unhold_lkb */
@@ -1569,9 +1013,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
1569 goto out_del; 1013 goto out_del;
1570 } 1014 }
1571 1015
1572 log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", 1016 log_error(ls, "remwait error %x reply %d flags %x no wait_type",
1573 lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid, 1017 lkb->lkb_id, mstype, lkb->lkb_flags);
1574 mstype, lkb->lkb_flags);
1575 return -1; 1018 return -1;
1576 1019
1577 out_del: 1020 out_del:
@@ -1624,170 +1067,61 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
1624 return error; 1067 return error;
1625} 1068}
1626 1069
1627/* If there's an rsb for the same resource being removed, ensure 1070static void dir_remove(struct dlm_rsb *r)
1628 that the remove message is sent before the new lookup message.
1629 It should be rare to need a delay here, but if not, then it may
1630 be worthwhile to add a proper wait mechanism rather than a delay. */
1631
1632static void wait_pending_remove(struct dlm_rsb *r)
1633{ 1071{
1634 struct dlm_ls *ls = r->res_ls; 1072 int to_nodeid;
1635 restart: 1073
1636 spin_lock(&ls->ls_remove_spin); 1074 if (dlm_no_directory(r->res_ls))
1637 if (ls->ls_remove_len && 1075 return;
1638 !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { 1076
1639 log_debug(ls, "delay lookup for remove dir %d %s", 1077 to_nodeid = dlm_dir_nodeid(r);
1640 r->res_dir_nodeid, r->res_name); 1078 if (to_nodeid != dlm_our_nodeid())
1641 spin_unlock(&ls->ls_remove_spin); 1079 send_remove(r);
1642 msleep(1); 1080 else
1643 goto restart; 1081 dlm_dir_remove_entry(r->res_ls, to_nodeid,
1644 } 1082 r->res_name, r->res_length);
1645 spin_unlock(&ls->ls_remove_spin);
1646} 1083}
1647 1084
1648/* 1085/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
1649 * ls_remove_spin protects ls_remove_name and ls_remove_len which are 1086 found since they are in order of newest to oldest? */
1650 * read by other threads in wait_pending_remove. ls_remove_names
1651 * and ls_remove_lens are only used by the scan thread, so they do
1652 * not need protection.
1653 */
1654 1087
1655static void shrink_bucket(struct dlm_ls *ls, int b) 1088static int shrink_bucket(struct dlm_ls *ls, int b)
1656{ 1089{
1657 struct rb_node *n, *next;
1658 struct dlm_rsb *r; 1090 struct dlm_rsb *r;
1659 char *name; 1091 int count = 0, found;
1660 int our_nodeid = dlm_our_nodeid();
1661 int remote_count = 0;
1662 int i, len, rv;
1663
1664 memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
1665
1666 spin_lock(&ls->ls_rsbtbl[b].lock);
1667 for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
1668 next = rb_next(n);
1669 r = rb_entry(n, struct dlm_rsb, res_hashnode);
1670
1671 /* If we're the directory record for this rsb, and
1672 we're not the master of it, then we need to wait
1673 for the master node to send us a dir remove for
1674 before removing the dir record. */
1675
1676 if (!dlm_no_directory(ls) &&
1677 (r->res_master_nodeid != our_nodeid) &&
1678 (dlm_dir_nodeid(r) == our_nodeid)) {
1679 continue;
1680 }
1681
1682 if (!time_after_eq(jiffies, r->res_toss_time +
1683 dlm_config.ci_toss_secs * HZ)) {
1684 continue;
1685 }
1686
1687 if (!dlm_no_directory(ls) &&
1688 (r->res_master_nodeid == our_nodeid) &&
1689 (dlm_dir_nodeid(r) != our_nodeid)) {
1690
1691 /* We're the master of this rsb but we're not
1692 the directory record, so we need to tell the
1693 dir node to remove the dir record. */
1694
1695 ls->ls_remove_lens[remote_count] = r->res_length;
1696 memcpy(ls->ls_remove_names[remote_count], r->res_name,
1697 DLM_RESNAME_MAXLEN);
1698 remote_count++;
1699
1700 if (remote_count >= DLM_REMOVE_NAMES_MAX)
1701 break;
1702 continue;
1703 }
1704
1705 if (!kref_put(&r->res_ref, kill_rsb)) {
1706 log_error(ls, "tossed rsb in use %s", r->res_name);
1707 continue;
1708 }
1709
1710 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
1711 dlm_free_rsb(r);
1712 }
1713 spin_unlock(&ls->ls_rsbtbl[b].lock);
1714
1715 /*
1716 * While searching for rsb's to free, we found some that require
1717 * remote removal. We leave them in place and find them again here
1718 * so there is a very small gap between removing them from the toss
1719 * list and sending the removal. Keeping this gap small is
1720 * important to keep us (the master node) from being out of sync
1721 * with the remote dir node for very long.
1722 *
1723 * From the time the rsb is removed from toss until just after
1724 * send_remove, the rsb name is saved in ls_remove_name. A new
1725 * lookup checks this to ensure that a new lookup message for the
1726 * same resource name is not sent just before the remove message.
1727 */
1728
1729 for (i = 0; i < remote_count; i++) {
1730 name = ls->ls_remove_names[i];
1731 len = ls->ls_remove_lens[i];
1732 1092
1093 for (;;) {
1094 found = 0;
1733 spin_lock(&ls->ls_rsbtbl[b].lock); 1095 spin_lock(&ls->ls_rsbtbl[b].lock);
1734 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); 1096 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
1735 if (rv) { 1097 res_hashchain) {
1736 spin_unlock(&ls->ls_rsbtbl[b].lock); 1098 if (!time_after_eq(jiffies, r->res_toss_time +
1737 log_debug(ls, "remove_name not toss %s", name); 1099 dlm_config.ci_toss_secs * HZ))
1738 continue; 1100 continue;
1739 } 1101 found = 1;
1740 1102 break;
1741 if (r->res_master_nodeid != our_nodeid) {
1742 spin_unlock(&ls->ls_rsbtbl[b].lock);
1743 log_debug(ls, "remove_name master %d dir %d our %d %s",
1744 r->res_master_nodeid, r->res_dir_nodeid,
1745 our_nodeid, name);
1746 continue;
1747 } 1103 }
1748 1104
1749 if (r->res_dir_nodeid == our_nodeid) { 1105 if (!found) {
1750 /* should never happen */
1751 spin_unlock(&ls->ls_rsbtbl[b].lock); 1106 spin_unlock(&ls->ls_rsbtbl[b].lock);
1752 log_error(ls, "remove_name dir %d master %d our %d %s", 1107 break;
1753 r->res_dir_nodeid, r->res_master_nodeid,
1754 our_nodeid, name);
1755 continue;
1756 } 1108 }
1757 1109
1758 if (!time_after_eq(jiffies, r->res_toss_time + 1110 if (kref_put(&r->res_ref, kill_rsb)) {
1759 dlm_config.ci_toss_secs * HZ)) { 1111 list_del(&r->res_hashchain);
1760 spin_unlock(&ls->ls_rsbtbl[b].lock); 1112 spin_unlock(&ls->ls_rsbtbl[b].lock);
1761 log_debug(ls, "remove_name toss_time %lu now %lu %s",
1762 r->res_toss_time, jiffies, name);
1763 continue;
1764 }
1765 1113
1766 if (!kref_put(&r->res_ref, kill_rsb)) { 1114 if (is_master(r))
1115 dir_remove(r);
1116 dlm_free_rsb(r);
1117 count++;
1118 } else {
1767 spin_unlock(&ls->ls_rsbtbl[b].lock); 1119 spin_unlock(&ls->ls_rsbtbl[b].lock);
1768 log_error(ls, "remove_name in use %s", name); 1120 log_error(ls, "tossed rsb in use %s", r->res_name);
1769 continue;
1770 } 1121 }
1771
1772 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
1773
1774 /* block lookup of same name until we've sent remove */
1775 spin_lock(&ls->ls_remove_spin);
1776 ls->ls_remove_len = len;
1777 memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
1778 spin_unlock(&ls->ls_remove_spin);
1779 spin_unlock(&ls->ls_rsbtbl[b].lock);
1780
1781 send_remove(r);
1782
1783 /* allow lookup of name again */
1784 spin_lock(&ls->ls_remove_spin);
1785 ls->ls_remove_len = 0;
1786 memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
1787 spin_unlock(&ls->ls_remove_spin);
1788
1789 dlm_free_rsb(r);
1790 } 1122 }
1123
1124 return count;
1791} 1125}
1792 1126
1793void dlm_scan_rsbs(struct dlm_ls *ls) 1127void dlm_scan_rsbs(struct dlm_ls *ls)
@@ -2113,13 +1447,13 @@ static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2113 } 1447 }
2114 1448
2115 lkb->lkb_rqmode = DLM_LOCK_IV; 1449 lkb->lkb_rqmode = DLM_LOCK_IV;
2116 lkb->lkb_highbast = 0;
2117} 1450}
2118 1451
2119static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1452static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2120{ 1453{
2121 set_lvb_lock(r, lkb); 1454 set_lvb_lock(r, lkb);
2122 _grant_lock(r, lkb); 1455 _grant_lock(r, lkb);
1456 lkb->lkb_highbast = 0;
2123} 1457}
2124 1458
2125static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, 1459static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
@@ -2279,14 +1613,10 @@ static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
2279 * immediate request, it is 0 if called later, after the lock has been 1613 * immediate request, it is 0 if called later, after the lock has been
2280 * queued. 1614 * queued.
2281 * 1615 *
2282 * recover is 1 if dlm_recover_grant() is trying to grant conversions
2283 * after recovery.
2284 *
2285 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis 1616 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
2286 */ 1617 */
2287 1618
2288static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, 1619static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
2289 int recover)
2290{ 1620{
2291 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); 1621 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
2292 1622
@@ -2318,7 +1648,7 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2318 */ 1648 */
2319 1649
2320 if (queue_conflict(&r->res_grantqueue, lkb)) 1650 if (queue_conflict(&r->res_grantqueue, lkb))
2321 return 0; 1651 goto out;
2322 1652
2323 /* 1653 /*
2324 * 6-3: By default, a conversion request is immediately granted if the 1654 * 6-3: By default, a conversion request is immediately granted if the
@@ -2327,24 +1657,7 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2327 */ 1657 */
2328 1658
2329 if (queue_conflict(&r->res_convertqueue, lkb)) 1659 if (queue_conflict(&r->res_convertqueue, lkb))
2330 return 0; 1660 goto out;
2331
2332 /*
2333 * The RECOVER_GRANT flag means dlm_recover_grant() is granting
2334 * locks for a recovered rsb, on which lkb's have been rebuilt.
2335 * The lkb's may have been rebuilt on the queues in a different
2336 * order than they were in on the previous master. So, granting
2337 * queued conversions in order after recovery doesn't make sense
2338 * since the order hasn't been preserved anyway. The new order
2339 * could also have created a new "in place" conversion deadlock.
2340 * (e.g. old, failed master held granted EX, with PR->EX, NL->EX.
2341 * After recovery, there would be no granted locks, and possibly
2342 * NL->EX, PR->EX, an in-place conversion deadlock.) So, after
2343 * recovery, grant conversions without considering order.
2344 */
2345
2346 if (conv && recover)
2347 return 1;
2348 1661
2349 /* 1662 /*
2350 * 6-5: But the default algorithm for deciding whether to grant or 1663 * 6-5: But the default algorithm for deciding whether to grant or
@@ -2373,18 +1686,6 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2373 return 1; 1686 return 1;
2374 1687
2375 /* 1688 /*
2376 * Even if the convert is compat with all granted locks,
2377 * QUECVT forces it behind other locks on the convert queue.
2378 */
2379
2380 if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
2381 if (list_empty(&r->res_convertqueue))
2382 return 1;
2383 else
2384 return 0;
2385 }
2386
2387 /*
2388 * The NOORDER flag is set to avoid the standard vms rules on grant 1689 * The NOORDER flag is set to avoid the standard vms rules on grant
2389 * order. 1690 * order.
2390 */ 1691 */
@@ -2427,12 +1728,12 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2427 if (!now && !conv && list_empty(&r->res_convertqueue) && 1728 if (!now && !conv && list_empty(&r->res_convertqueue) &&
2428 first_in_list(lkb, &r->res_waitqueue)) 1729 first_in_list(lkb, &r->res_waitqueue))
2429 return 1; 1730 return 1;
2430 1731 out:
2431 return 0; 1732 return 0;
2432} 1733}
2433 1734
2434static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, 1735static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2435 int recover, int *err) 1736 int *err)
2436{ 1737{
2437 int rv; 1738 int rv;
2438 int8_t alt = 0, rqmode = lkb->lkb_rqmode; 1739 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
@@ -2441,7 +1742,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2441 if (err) 1742 if (err)
2442 *err = 0; 1743 *err = 0;
2443 1744
2444 rv = _can_be_granted(r, lkb, now, recover); 1745 rv = _can_be_granted(r, lkb, now);
2445 if (rv) 1746 if (rv)
2446 goto out; 1747 goto out;
2447 1748
@@ -2482,7 +1783,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2482 1783
2483 if (alt) { 1784 if (alt) {
2484 lkb->lkb_rqmode = alt; 1785 lkb->lkb_rqmode = alt;
2485 rv = _can_be_granted(r, lkb, now, 0); 1786 rv = _can_be_granted(r, lkb, now);
2486 if (rv) 1787 if (rv)
2487 lkb->lkb_sbflags |= DLM_SBF_ALTMODE; 1788 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
2488 else 1789 else
@@ -2502,11 +1803,9 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2502/* Returns the highest requested mode of all blocked conversions; sets 1803/* Returns the highest requested mode of all blocked conversions; sets
2503 cw if there's a blocked conversion to DLM_LOCK_CW. */ 1804 cw if there's a blocked conversion to DLM_LOCK_CW. */
2504 1805
2505static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, 1806static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
2506 unsigned int *count)
2507{ 1807{
2508 struct dlm_lkb *lkb, *s; 1808 struct dlm_lkb *lkb, *s;
2509 int recover = rsb_flag(r, RSB_RECOVER_GRANT);
2510 int hi, demoted, quit, grant_restart, demote_restart; 1809 int hi, demoted, quit, grant_restart, demote_restart;
2511 int deadlk; 1810 int deadlk;
2512 1811
@@ -2520,11 +1819,9 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
2520 demoted = is_demoted(lkb); 1819 demoted = is_demoted(lkb);
2521 deadlk = 0; 1820 deadlk = 0;
2522 1821
2523 if (can_be_granted(r, lkb, 0, recover, &deadlk)) { 1822 if (can_be_granted(r, lkb, 0, &deadlk)) {
2524 grant_lock_pending(r, lkb); 1823 grant_lock_pending(r, lkb);
2525 grant_restart = 1; 1824 grant_restart = 1;
2526 if (count)
2527 (*count)++;
2528 continue; 1825 continue;
2529 } 1826 }
2530 1827
@@ -2558,17 +1855,14 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
2558 return max_t(int, high, hi); 1855 return max_t(int, high, hi);
2559} 1856}
2560 1857
2561static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, 1858static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw)
2562 unsigned int *count)
2563{ 1859{
2564 struct dlm_lkb *lkb, *s; 1860 struct dlm_lkb *lkb, *s;
2565 1861
2566 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { 1862 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
2567 if (can_be_granted(r, lkb, 0, 0, NULL)) { 1863 if (can_be_granted(r, lkb, 0, NULL))
2568 grant_lock_pending(r, lkb); 1864 grant_lock_pending(r, lkb);
2569 if (count) 1865 else {
2570 (*count)++;
2571 } else {
2572 high = max_t(int, lkb->lkb_rqmode, high); 1866 high = max_t(int, lkb->lkb_rqmode, high);
2573 if (lkb->lkb_rqmode == DLM_LOCK_CW) 1867 if (lkb->lkb_rqmode == DLM_LOCK_CW)
2574 *cw = 1; 1868 *cw = 1;
@@ -2597,20 +1891,16 @@ static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
2597 return 0; 1891 return 0;
2598} 1892}
2599 1893
2600static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) 1894static void grant_pending_locks(struct dlm_rsb *r)
2601{ 1895{
2602 struct dlm_lkb *lkb, *s; 1896 struct dlm_lkb *lkb, *s;
2603 int high = DLM_LOCK_IV; 1897 int high = DLM_LOCK_IV;
2604 int cw = 0; 1898 int cw = 0;
2605 1899
2606 if (!is_master(r)) { 1900 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
2607 log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
2608 dlm_dump_rsb(r);
2609 return;
2610 }
2611 1901
2612 high = grant_pending_convert(r, high, &cw, count); 1902 high = grant_pending_convert(r, high, &cw);
2613 high = grant_pending_wait(r, high, &cw, count); 1903 high = grant_pending_wait(r, high, &cw);
2614 1904
2615 if (high == DLM_LOCK_IV) 1905 if (high == DLM_LOCK_IV)
2616 return; 1906 return;
@@ -2695,7 +1985,8 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
2695 1985
2696static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 1986static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
2697{ 1987{
2698 int our_nodeid = dlm_our_nodeid(); 1988 struct dlm_ls *ls = r->res_ls;
1989 int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
2699 1990
2700 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 1991 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
2701 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 1992 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -2709,37 +2000,53 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
2709 return 1; 2000 return 1;
2710 } 2001 }
2711 2002
2712 if (r->res_master_nodeid == our_nodeid) { 2003 if (r->res_nodeid == 0) {
2713 lkb->lkb_nodeid = 0; 2004 lkb->lkb_nodeid = 0;
2714 return 0; 2005 return 0;
2715 } 2006 }
2716 2007
2717 if (r->res_master_nodeid) { 2008 if (r->res_nodeid > 0) {
2718 lkb->lkb_nodeid = r->res_master_nodeid; 2009 lkb->lkb_nodeid = r->res_nodeid;
2719 return 0; 2010 return 0;
2720 } 2011 }
2721 2012
2722 if (dlm_dir_nodeid(r) == our_nodeid) { 2013 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
2723 /* This is a somewhat unusual case; find_rsb will usually 2014
2724 have set res_master_nodeid when dir nodeid is local, but 2015 dir_nodeid = dlm_dir_nodeid(r);
2725 there are cases where we become the dir node after we've 2016
2726 past find_rsb and go through _request_lock again. 2017 if (dir_nodeid != our_nodeid) {
2727 confirm_master() or process_lookup_list() needs to be 2018 r->res_first_lkid = lkb->lkb_id;
2728 called after this. */ 2019 send_lookup(r, lkb);
2729 log_debug(r->res_ls, "set_master %x self master %d dir %d %s", 2020 return 1;
2730 lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid,
2731 r->res_name);
2732 r->res_master_nodeid = our_nodeid;
2733 r->res_nodeid = 0;
2734 lkb->lkb_nodeid = 0;
2735 return 0;
2736 } 2021 }
2737 2022
2738 wait_pending_remove(r); 2023 for (i = 0; i < 2; i++) {
2024 /* It's possible for dlm_scand to remove an old rsb for
2025 this same resource from the toss list, us to create
2026 a new one, look up the master locally, and find it
2027 already exists just before dlm_scand does the
2028 dir_remove() on the previous rsb. */
2739 2029
2740 r->res_first_lkid = lkb->lkb_id; 2030 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
2741 send_lookup(r, lkb); 2031 r->res_length, &ret_nodeid);
2742 return 1; 2032 if (!error)
2033 break;
2034 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
2035 schedule();
2036 }
2037 if (error && error != -EEXIST)
2038 return error;
2039
2040 if (ret_nodeid == our_nodeid) {
2041 r->res_first_lkid = 0;
2042 r->res_nodeid = 0;
2043 lkb->lkb_nodeid = 0;
2044 } else {
2045 r->res_first_lkid = lkb->lkb_id;
2046 r->res_nodeid = ret_nodeid;
2047 lkb->lkb_nodeid = ret_nodeid;
2048 }
2049 return 0;
2743} 2050}
2744 2051
2745static void process_lookup_list(struct dlm_rsb *r) 2052static void process_lookup_list(struct dlm_rsb *r)
@@ -3064,7 +2371,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
3064{ 2371{
3065 int error = 0; 2372 int error = 0;
3066 2373
3067 if (can_be_granted(r, lkb, 1, 0, NULL)) { 2374 if (can_be_granted(r, lkb, 1, NULL)) {
3068 grant_lock(r, lkb); 2375 grant_lock(r, lkb);
3069 queue_cast(r, lkb, 0); 2376 queue_cast(r, lkb, 0);
3070 goto out; 2377 goto out;
@@ -3104,7 +2411,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3104 2411
3105 /* changing an existing lock may allow others to be granted */ 2412 /* changing an existing lock may allow others to be granted */
3106 2413
3107 if (can_be_granted(r, lkb, 1, 0, &deadlk)) { 2414 if (can_be_granted(r, lkb, 1, &deadlk)) {
3108 grant_lock(r, lkb); 2415 grant_lock(r, lkb);
3109 queue_cast(r, lkb, 0); 2416 queue_cast(r, lkb, 0);
3110 goto out; 2417 goto out;
@@ -3129,8 +2436,8 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3129 before we try again to grant this one. */ 2436 before we try again to grant this one. */
3130 2437
3131 if (is_demoted(lkb)) { 2438 if (is_demoted(lkb)) {
3132 grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); 2439 grant_pending_convert(r, DLM_LOCK_IV, NULL);
3133 if (_can_be_granted(r, lkb, 1, 0)) { 2440 if (_can_be_granted(r, lkb, 1)) {
3134 grant_lock(r, lkb); 2441 grant_lock(r, lkb);
3135 queue_cast(r, lkb, 0); 2442 queue_cast(r, lkb, 0);
3136 goto out; 2443 goto out;
@@ -3157,7 +2464,7 @@ static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3157{ 2464{
3158 switch (error) { 2465 switch (error) {
3159 case 0: 2466 case 0:
3160 grant_pending_locks(r, NULL); 2467 grant_pending_locks(r);
3161 /* grant_pending_locks also sends basts */ 2468 /* grant_pending_locks also sends basts */
3162 break; 2469 break;
3163 case -EAGAIN: 2470 case -EAGAIN:
@@ -3180,11 +2487,11 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3180static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, 2487static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3181 int error) 2488 int error)
3182{ 2489{
3183 grant_pending_locks(r, NULL); 2490 grant_pending_locks(r);
3184} 2491}
3185 2492
3186/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 2493/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
3187 2494
3188static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2495static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
3189{ 2496{
3190 int error; 2497 int error;
@@ -3201,7 +2508,7 @@ static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3201 int error) 2508 int error)
3202{ 2509{
3203 if (error) 2510 if (error)
3204 grant_pending_locks(r, NULL); 2511 grant_pending_locks(r);
3205} 2512}
3206 2513
3207/* 2514/*
@@ -3308,11 +2615,11 @@ static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
3308 2615
3309 error = validate_lock_args(ls, lkb, args); 2616 error = validate_lock_args(ls, lkb, args);
3310 if (error) 2617 if (error)
3311 return error; 2618 goto out;
3312 2619
3313 error = find_rsb(ls, name, len, 0, R_REQUEST, &r); 2620 error = find_rsb(ls, name, len, R_CREATE, &r);
3314 if (error) 2621 if (error)
3315 return error; 2622 goto out;
3316 2623
3317 lock_rsb(r); 2624 lock_rsb(r);
3318 2625
@@ -3323,6 +2630,8 @@ static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
3323 2630
3324 unlock_rsb(r); 2631 unlock_rsb(r);
3325 put_rsb(r); 2632 put_rsb(r);
2633
2634 out:
3326 return error; 2635 return error;
3327} 2636}
3328 2637
@@ -4000,72 +3309,11 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
4000 return error; 3309 return error;
4001} 3310}
4002 3311
4003static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) 3312static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4004{
4005 char name[DLM_RESNAME_MAXLEN + 1];
4006 struct dlm_message *ms;
4007 struct dlm_mhandle *mh;
4008 struct dlm_rsb *r;
4009 uint32_t hash, b;
4010 int rv, dir_nodeid;
4011
4012 memset(name, 0, sizeof(name));
4013 memcpy(name, ms_name, len);
4014
4015 hash = jhash(name, len, 0);
4016 b = hash & (ls->ls_rsbtbl_size - 1);
4017
4018 dir_nodeid = dlm_hash2nodeid(ls, hash);
4019
4020 log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
4021
4022 spin_lock(&ls->ls_rsbtbl[b].lock);
4023 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4024 if (!rv) {
4025 spin_unlock(&ls->ls_rsbtbl[b].lock);
4026 log_error(ls, "repeat_remove on keep %s", name);
4027 return;
4028 }
4029
4030 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4031 if (!rv) {
4032 spin_unlock(&ls->ls_rsbtbl[b].lock);
4033 log_error(ls, "repeat_remove on toss %s", name);
4034 return;
4035 }
4036
4037 /* use ls->remove_name2 to avoid conflict with shrink? */
4038
4039 spin_lock(&ls->ls_remove_spin);
4040 ls->ls_remove_len = len;
4041 memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
4042 spin_unlock(&ls->ls_remove_spin);
4043 spin_unlock(&ls->ls_rsbtbl[b].lock);
4044
4045 rv = _create_message(ls, sizeof(struct dlm_message) + len,
4046 dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
4047 if (rv)
4048 return;
4049
4050 memcpy(ms->m_extra, name, len);
4051 ms->m_hash = hash;
4052
4053 send_message(mh, ms);
4054
4055 spin_lock(&ls->ls_remove_spin);
4056 ls->ls_remove_len = 0;
4057 memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
4058 spin_unlock(&ls->ls_remove_spin);
4059}
4060
4061static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4062{ 3313{
4063 struct dlm_lkb *lkb; 3314 struct dlm_lkb *lkb;
4064 struct dlm_rsb *r; 3315 struct dlm_rsb *r;
4065 int from_nodeid; 3316 int error, namelen;
4066 int error, namelen = 0;
4067
4068 from_nodeid = ms->m_header.h_nodeid;
4069 3317
4070 error = create_lkb(ls, &lkb); 3318 error = create_lkb(ls, &lkb);
4071 if (error) 3319 if (error)
@@ -4079,16 +3327,9 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4079 goto fail; 3327 goto fail;
4080 } 3328 }
4081 3329
4082 /* The dir node is the authority on whether we are the master
4083 for this rsb or not, so if the master sends us a request, we should
4084 recreate the rsb if we've destroyed it. This race happens when we
4085 send a remove message to the dir node at the same time that the dir
4086 node sends us a request for the rsb. */
4087
4088 namelen = receive_extralen(ms); 3330 namelen = receive_extralen(ms);
4089 3331
4090 error = find_rsb(ls, ms->m_extra, namelen, from_nodeid, 3332 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
4091 R_RECEIVE_REQUEST, &r);
4092 if (error) { 3333 if (error) {
4093 __put_lkb(ls, lkb); 3334 __put_lkb(ls, lkb);
4094 goto fail; 3335 goto fail;
@@ -4096,16 +3337,6 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4096 3337
4097 lock_rsb(r); 3338 lock_rsb(r);
4098 3339
4099 if (r->res_master_nodeid != dlm_our_nodeid()) {
4100 error = validate_master_nodeid(ls, r, from_nodeid);
4101 if (error) {
4102 unlock_rsb(r);
4103 put_rsb(r);
4104 __put_lkb(ls, lkb);
4105 goto fail;
4106 }
4107 }
4108
4109 attach_lkb(r, lkb); 3340 attach_lkb(r, lkb);
4110 error = do_request(r, lkb); 3341 error = do_request(r, lkb);
4111 send_request_reply(r, lkb, error); 3342 send_request_reply(r, lkb, error);
@@ -4118,40 +3349,14 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4118 error = 0; 3349 error = 0;
4119 if (error) 3350 if (error)
4120 dlm_put_lkb(lkb); 3351 dlm_put_lkb(lkb);
4121 return 0; 3352 return;
4122 3353
4123 fail: 3354 fail:
4124 /* TODO: instead of returning ENOTBLK, add the lkb to res_lookup
4125 and do this receive_request again from process_lookup_list once
4126 we get the lookup reply. This would avoid a many repeated
4127 ENOTBLK request failures when the lookup reply designating us
4128 as master is delayed. */
4129
4130 /* We could repeatedly return -EBADR here if our send_remove() is
4131 delayed in being sent/arriving/being processed on the dir node.
4132 Another node would repeatedly lookup up the master, and the dir
4133 node would continue returning our nodeid until our send_remove
4134 took effect.
4135
4136 We send another remove message in case our previous send_remove
4137 was lost/ignored/missed somehow. */
4138
4139 if (error != -ENOTBLK) {
4140 log_limit(ls, "receive_request %x from %d %d",
4141 ms->m_lkid, from_nodeid, error);
4142 }
4143
4144 if (namelen && error == -EBADR) {
4145 send_repeat_remove(ls, ms->m_extra, namelen);
4146 msleep(1000);
4147 }
4148
4149 setup_stub_lkb(ls, ms); 3355 setup_stub_lkb(ls, ms);
4150 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); 3356 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4151 return error;
4152} 3357}
4153 3358
4154static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) 3359static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
4155{ 3360{
4156 struct dlm_lkb *lkb; 3361 struct dlm_lkb *lkb;
4157 struct dlm_rsb *r; 3362 struct dlm_rsb *r;
@@ -4161,15 +3366,6 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
4161 if (error) 3366 if (error)
4162 goto fail; 3367 goto fail;
4163 3368
4164 if (lkb->lkb_remid != ms->m_lkid) {
4165 log_error(ls, "receive_convert %x remid %x recover_seq %llu "
4166 "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
4167 (unsigned long long)lkb->lkb_recover_seq,
4168 ms->m_header.h_nodeid, ms->m_lkid);
4169 error = -ENOENT;
4170 goto fail;
4171 }
4172
4173 r = lkb->lkb_resource; 3369 r = lkb->lkb_resource;
4174 3370
4175 hold_rsb(r); 3371 hold_rsb(r);
@@ -4197,15 +3393,14 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
4197 unlock_rsb(r); 3393 unlock_rsb(r);
4198 put_rsb(r); 3394 put_rsb(r);
4199 dlm_put_lkb(lkb); 3395 dlm_put_lkb(lkb);
4200 return 0; 3396 return;
4201 3397
4202 fail: 3398 fail:
4203 setup_stub_lkb(ls, ms); 3399 setup_stub_lkb(ls, ms);
4204 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); 3400 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4205 return error;
4206} 3401}
4207 3402
4208static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) 3403static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
4209{ 3404{
4210 struct dlm_lkb *lkb; 3405 struct dlm_lkb *lkb;
4211 struct dlm_rsb *r; 3406 struct dlm_rsb *r;
@@ -4215,14 +3410,6 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
4215 if (error) 3410 if (error)
4216 goto fail; 3411 goto fail;
4217 3412
4218 if (lkb->lkb_remid != ms->m_lkid) {
4219 log_error(ls, "receive_unlock %x remid %x remote %d %x",
4220 lkb->lkb_id, lkb->lkb_remid,
4221 ms->m_header.h_nodeid, ms->m_lkid);
4222 error = -ENOENT;
4223 goto fail;
4224 }
4225
4226 r = lkb->lkb_resource; 3413 r = lkb->lkb_resource;
4227 3414
4228 hold_rsb(r); 3415 hold_rsb(r);
@@ -4247,15 +3434,14 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
4247 unlock_rsb(r); 3434 unlock_rsb(r);
4248 put_rsb(r); 3435 put_rsb(r);
4249 dlm_put_lkb(lkb); 3436 dlm_put_lkb(lkb);
4250 return 0; 3437 return;
4251 3438
4252 fail: 3439 fail:
4253 setup_stub_lkb(ls, ms); 3440 setup_stub_lkb(ls, ms);
4254 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); 3441 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4255 return error;
4256} 3442}
4257 3443
4258static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) 3444static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
4259{ 3445{
4260 struct dlm_lkb *lkb; 3446 struct dlm_lkb *lkb;
4261 struct dlm_rsb *r; 3447 struct dlm_rsb *r;
@@ -4283,23 +3469,25 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
4283 unlock_rsb(r); 3469 unlock_rsb(r);
4284 put_rsb(r); 3470 put_rsb(r);
4285 dlm_put_lkb(lkb); 3471 dlm_put_lkb(lkb);
4286 return 0; 3472 return;
4287 3473
4288 fail: 3474 fail:
4289 setup_stub_lkb(ls, ms); 3475 setup_stub_lkb(ls, ms);
4290 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); 3476 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4291 return error;
4292} 3477}
4293 3478
4294static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) 3479static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
4295{ 3480{
4296 struct dlm_lkb *lkb; 3481 struct dlm_lkb *lkb;
4297 struct dlm_rsb *r; 3482 struct dlm_rsb *r;
4298 int error; 3483 int error;
4299 3484
4300 error = find_lkb(ls, ms->m_remid, &lkb); 3485 error = find_lkb(ls, ms->m_remid, &lkb);
4301 if (error) 3486 if (error) {
4302 return error; 3487 log_debug(ls, "receive_grant from %d no lkb %x",
3488 ms->m_header.h_nodeid, ms->m_remid);
3489 return;
3490 }
4303 3491
4304 r = lkb->lkb_resource; 3492 r = lkb->lkb_resource;
4305 3493
@@ -4319,18 +3507,20 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
4319 unlock_rsb(r); 3507 unlock_rsb(r);
4320 put_rsb(r); 3508 put_rsb(r);
4321 dlm_put_lkb(lkb); 3509 dlm_put_lkb(lkb);
4322 return 0;
4323} 3510}
4324 3511
4325static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) 3512static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
4326{ 3513{
4327 struct dlm_lkb *lkb; 3514 struct dlm_lkb *lkb;
4328 struct dlm_rsb *r; 3515 struct dlm_rsb *r;
4329 int error; 3516 int error;
4330 3517
4331 error = find_lkb(ls, ms->m_remid, &lkb); 3518 error = find_lkb(ls, ms->m_remid, &lkb);
4332 if (error) 3519 if (error) {
4333 return error; 3520 log_debug(ls, "receive_bast from %d no lkb %x",
3521 ms->m_header.h_nodeid, ms->m_remid);
3522 return;
3523 }
4334 3524
4335 r = lkb->lkb_resource; 3525 r = lkb->lkb_resource;
4336 3526
@@ -4342,120 +3532,57 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
4342 goto out; 3532 goto out;
4343 3533
4344 queue_bast(r, lkb, ms->m_bastmode); 3534 queue_bast(r, lkb, ms->m_bastmode);
4345 lkb->lkb_highbast = ms->m_bastmode;
4346 out: 3535 out:
4347 unlock_rsb(r); 3536 unlock_rsb(r);
4348 put_rsb(r); 3537 put_rsb(r);
4349 dlm_put_lkb(lkb); 3538 dlm_put_lkb(lkb);
4350 return 0;
4351} 3539}
4352 3540
4353static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) 3541static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
4354{ 3542{
4355 int len, error, ret_nodeid, from_nodeid, our_nodeid; 3543 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
4356 3544
4357 from_nodeid = ms->m_header.h_nodeid; 3545 from_nodeid = ms->m_header.h_nodeid;
4358 our_nodeid = dlm_our_nodeid(); 3546 our_nodeid = dlm_our_nodeid();
4359 3547
4360 len = receive_extralen(ms); 3548 len = receive_extralen(ms);
4361 3549
4362 error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0, 3550 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
4363 &ret_nodeid, NULL); 3551 if (dir_nodeid != our_nodeid) {
3552 log_error(ls, "lookup dir_nodeid %d from %d",
3553 dir_nodeid, from_nodeid);
3554 error = -EINVAL;
3555 ret_nodeid = -1;
3556 goto out;
3557 }
3558
3559 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
4364 3560
4365 /* Optimization: we're master so treat lookup as a request */ 3561 /* Optimization: we're master so treat lookup as a request */
4366 if (!error && ret_nodeid == our_nodeid) { 3562 if (!error && ret_nodeid == our_nodeid) {
4367 receive_request(ls, ms); 3563 receive_request(ls, ms);
4368 return; 3564 return;
4369 } 3565 }
3566 out:
4370 send_lookup_reply(ls, ms, ret_nodeid, error); 3567 send_lookup_reply(ls, ms, ret_nodeid, error);
4371} 3568}
4372 3569
4373static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) 3570static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
4374{ 3571{
4375 char name[DLM_RESNAME_MAXLEN+1]; 3572 int len, dir_nodeid, from_nodeid;
4376 struct dlm_rsb *r;
4377 uint32_t hash, b;
4378 int rv, len, dir_nodeid, from_nodeid;
4379 3573
4380 from_nodeid = ms->m_header.h_nodeid; 3574 from_nodeid = ms->m_header.h_nodeid;
4381 3575
4382 len = receive_extralen(ms); 3576 len = receive_extralen(ms);
4383 3577
4384 if (len > DLM_RESNAME_MAXLEN) {
4385 log_error(ls, "receive_remove from %d bad len %d",
4386 from_nodeid, len);
4387 return;
4388 }
4389
4390 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); 3578 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
4391 if (dir_nodeid != dlm_our_nodeid()) { 3579 if (dir_nodeid != dlm_our_nodeid()) {
4392 log_error(ls, "receive_remove from %d bad nodeid %d", 3580 log_error(ls, "remove dir entry dir_nodeid %d from %d",
4393 from_nodeid, dir_nodeid); 3581 dir_nodeid, from_nodeid);
4394 return; 3582 return;
4395 } 3583 }
4396 3584
4397 /* Look for name on rsbtbl.toss, if it's there, kill it. 3585 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
4398 If it's on rsbtbl.keep, it's being used, and we should ignore this
4399 message. This is an expected race between the dir node sending a
4400 request to the master node at the same time as the master node sends
4401 a remove to the dir node. The resolution to that race is for the
4402 dir node to ignore the remove message, and the master node to
4403 recreate the master rsb when it gets a request from the dir node for
4404 an rsb it doesn't have. */
4405
4406 memset(name, 0, sizeof(name));
4407 memcpy(name, ms->m_extra, len);
4408
4409 hash = jhash(name, len, 0);
4410 b = hash & (ls->ls_rsbtbl_size - 1);
4411
4412 spin_lock(&ls->ls_rsbtbl[b].lock);
4413
4414 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4415 if (rv) {
4416 /* verify the rsb is on keep list per comment above */
4417 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4418 if (rv) {
4419 /* should not happen */
4420 log_error(ls, "receive_remove from %d not found %s",
4421 from_nodeid, name);
4422 spin_unlock(&ls->ls_rsbtbl[b].lock);
4423 return;
4424 }
4425 if (r->res_master_nodeid != from_nodeid) {
4426 /* should not happen */
4427 log_error(ls, "receive_remove keep from %d master %d",
4428 from_nodeid, r->res_master_nodeid);
4429 dlm_print_rsb(r);
4430 spin_unlock(&ls->ls_rsbtbl[b].lock);
4431 return;
4432 }
4433
4434 log_debug(ls, "receive_remove from %d master %d first %x %s",
4435 from_nodeid, r->res_master_nodeid, r->res_first_lkid,
4436 name);
4437 spin_unlock(&ls->ls_rsbtbl[b].lock);
4438 return;
4439 }
4440
4441 if (r->res_master_nodeid != from_nodeid) {
4442 log_error(ls, "receive_remove toss from %d master %d",
4443 from_nodeid, r->res_master_nodeid);
4444 dlm_print_rsb(r);
4445 spin_unlock(&ls->ls_rsbtbl[b].lock);
4446 return;
4447 }
4448
4449 if (kref_put(&r->res_ref, kill_rsb)) {
4450 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
4451 spin_unlock(&ls->ls_rsbtbl[b].lock);
4452 dlm_free_rsb(r);
4453 } else {
4454 log_error(ls, "receive_remove from %d rsb ref error",
4455 from_nodeid);
4456 dlm_print_rsb(r);
4457 spin_unlock(&ls->ls_rsbtbl[b].lock);
4458 }
4459} 3586}
4460 3587
4461static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) 3588static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4463,16 +3590,18 @@ static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
4463 do_purge(ls, ms->m_nodeid, ms->m_pid); 3590 do_purge(ls, ms->m_nodeid, ms->m_pid);
4464} 3591}
4465 3592
4466static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) 3593static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4467{ 3594{
4468 struct dlm_lkb *lkb; 3595 struct dlm_lkb *lkb;
4469 struct dlm_rsb *r; 3596 struct dlm_rsb *r;
4470 int error, mstype, result; 3597 int error, mstype, result;
4471 int from_nodeid = ms->m_header.h_nodeid;
4472 3598
4473 error = find_lkb(ls, ms->m_remid, &lkb); 3599 error = find_lkb(ls, ms->m_remid, &lkb);
4474 if (error) 3600 if (error) {
4475 return error; 3601 log_debug(ls, "receive_request_reply from %d no lkb %x",
3602 ms->m_header.h_nodeid, ms->m_remid);
3603 return;
3604 }
4476 3605
4477 r = lkb->lkb_resource; 3606 r = lkb->lkb_resource;
4478 hold_rsb(r); 3607 hold_rsb(r);
@@ -4484,19 +3613,14 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4484 3613
4485 mstype = lkb->lkb_wait_type; 3614 mstype = lkb->lkb_wait_type;
4486 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 3615 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
4487 if (error) { 3616 if (error)
4488 log_error(ls, "receive_request_reply %x remote %d %x result %d",
4489 lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result);
4490 dlm_dump_rsb(r);
4491 goto out; 3617 goto out;
4492 }
4493 3618
4494 /* Optimization: the dir node was also the master, so it took our 3619 /* Optimization: the dir node was also the master, so it took our
4495 lookup as a request and sent request reply instead of lookup reply */ 3620 lookup as a request and sent request reply instead of lookup reply */
4496 if (mstype == DLM_MSG_LOOKUP) { 3621 if (mstype == DLM_MSG_LOOKUP) {
4497 r->res_master_nodeid = from_nodeid; 3622 r->res_nodeid = ms->m_header.h_nodeid;
4498 r->res_nodeid = from_nodeid; 3623 lkb->lkb_nodeid = r->res_nodeid;
4499 lkb->lkb_nodeid = from_nodeid;
4500 } 3624 }
4501 3625
4502 /* this is the value returned from do_request() on the master */ 3626 /* this is the value returned from do_request() on the master */
@@ -4530,30 +3654,18 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4530 case -EBADR: 3654 case -EBADR:
4531 case -ENOTBLK: 3655 case -ENOTBLK:
4532 /* find_rsb failed to find rsb or rsb wasn't master */ 3656 /* find_rsb failed to find rsb or rsb wasn't master */
4533 log_limit(ls, "receive_request_reply %x from %d %d " 3657 log_debug(ls, "receive_request_reply %x %x master diff %d %d",
4534 "master %d dir %d first %x %s", lkb->lkb_id, 3658 lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
4535 from_nodeid, result, r->res_master_nodeid, 3659 r->res_nodeid = -1;
4536 r->res_dir_nodeid, r->res_first_lkid, r->res_name); 3660 lkb->lkb_nodeid = -1;
4537
4538 if (r->res_dir_nodeid != dlm_our_nodeid() &&
4539 r->res_master_nodeid != dlm_our_nodeid()) {
4540 /* cause _request_lock->set_master->send_lookup */
4541 r->res_master_nodeid = 0;
4542 r->res_nodeid = -1;
4543 lkb->lkb_nodeid = -1;
4544 }
4545 3661
4546 if (is_overlap(lkb)) { 3662 if (is_overlap(lkb)) {
4547 /* we'll ignore error in cancel/unlock reply */ 3663 /* we'll ignore error in cancel/unlock reply */
4548 queue_cast_overlap(r, lkb); 3664 queue_cast_overlap(r, lkb);
4549 confirm_master(r, result); 3665 confirm_master(r, result);
4550 unhold_lkb(lkb); /* undoes create_lkb() */ 3666 unhold_lkb(lkb); /* undoes create_lkb() */
4551 } else { 3667 } else
4552 _request_lock(r, lkb); 3668 _request_lock(r, lkb);
4553
4554 if (r->res_master_nodeid == dlm_our_nodeid())
4555 confirm_master(r, 0);
4556 }
4557 break; 3669 break;
4558 3670
4559 default: 3671 default:
@@ -4580,7 +3692,6 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4580 unlock_rsb(r); 3692 unlock_rsb(r);
4581 put_rsb(r); 3693 put_rsb(r);
4582 dlm_put_lkb(lkb); 3694 dlm_put_lkb(lkb);
4583 return 0;
4584} 3695}
4585 3696
4586static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 3697static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
@@ -4619,11 +3730,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
4619 break; 3730 break;
4620 3731
4621 default: 3732 default:
4622 log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", 3733 log_error(r->res_ls, "receive_convert_reply %x error %d",
4623 lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, 3734 lkb->lkb_id, ms->m_result);
4624 ms->m_result);
4625 dlm_print_rsb(r);
4626 dlm_print_lkb(lkb);
4627 } 3735 }
4628} 3736}
4629 3737
@@ -4650,18 +3758,20 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4650 put_rsb(r); 3758 put_rsb(r);
4651} 3759}
4652 3760
4653static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) 3761static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
4654{ 3762{
4655 struct dlm_lkb *lkb; 3763 struct dlm_lkb *lkb;
4656 int error; 3764 int error;
4657 3765
4658 error = find_lkb(ls, ms->m_remid, &lkb); 3766 error = find_lkb(ls, ms->m_remid, &lkb);
4659 if (error) 3767 if (error) {
4660 return error; 3768 log_debug(ls, "receive_convert_reply from %d no lkb %x",
3769 ms->m_header.h_nodeid, ms->m_remid);
3770 return;
3771 }
4661 3772
4662 _receive_convert_reply(lkb, ms); 3773 _receive_convert_reply(lkb, ms);
4663 dlm_put_lkb(lkb); 3774 dlm_put_lkb(lkb);
4664 return 0;
4665} 3775}
4666 3776
4667static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3777static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
@@ -4700,18 +3810,20 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4700 put_rsb(r); 3810 put_rsb(r);
4701} 3811}
4702 3812
4703static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) 3813static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
4704{ 3814{
4705 struct dlm_lkb *lkb; 3815 struct dlm_lkb *lkb;
4706 int error; 3816 int error;
4707 3817
4708 error = find_lkb(ls, ms->m_remid, &lkb); 3818 error = find_lkb(ls, ms->m_remid, &lkb);
4709 if (error) 3819 if (error) {
4710 return error; 3820 log_debug(ls, "receive_unlock_reply from %d no lkb %x",
3821 ms->m_header.h_nodeid, ms->m_remid);
3822 return;
3823 }
4711 3824
4712 _receive_unlock_reply(lkb, ms); 3825 _receive_unlock_reply(lkb, ms);
4713 dlm_put_lkb(lkb); 3826 dlm_put_lkb(lkb);
4714 return 0;
4715} 3827}
4716 3828
4717static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3829static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
@@ -4750,18 +3862,20 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4750 put_rsb(r); 3862 put_rsb(r);
4751} 3863}
4752 3864
4753static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) 3865static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
4754{ 3866{
4755 struct dlm_lkb *lkb; 3867 struct dlm_lkb *lkb;
4756 int error; 3868 int error;
4757 3869
4758 error = find_lkb(ls, ms->m_remid, &lkb); 3870 error = find_lkb(ls, ms->m_remid, &lkb);
4759 if (error) 3871 if (error) {
4760 return error; 3872 log_debug(ls, "receive_cancel_reply from %d no lkb %x",
3873 ms->m_header.h_nodeid, ms->m_remid);
3874 return;
3875 }
4761 3876
4762 _receive_cancel_reply(lkb, ms); 3877 _receive_cancel_reply(lkb, ms);
4763 dlm_put_lkb(lkb); 3878 dlm_put_lkb(lkb);
4764 return 0;
4765} 3879}
4766 3880
4767static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) 3881static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4769,15 +3883,14 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4769 struct dlm_lkb *lkb; 3883 struct dlm_lkb *lkb;
4770 struct dlm_rsb *r; 3884 struct dlm_rsb *r;
4771 int error, ret_nodeid; 3885 int error, ret_nodeid;
4772 int do_lookup_list = 0;
4773 3886
4774 error = find_lkb(ls, ms->m_lkid, &lkb); 3887 error = find_lkb(ls, ms->m_lkid, &lkb);
4775 if (error) { 3888 if (error) {
4776 log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid); 3889 log_error(ls, "receive_lookup_reply no lkb");
4777 return; 3890 return;
4778 } 3891 }
4779 3892
4780 /* ms->m_result is the value returned by dlm_master_lookup on dir node 3893 /* ms->m_result is the value returned by dlm_dir_lookup on dir node
4781 FIXME: will a non-zero error ever be returned? */ 3894 FIXME: will a non-zero error ever be returned? */
4782 3895
4783 r = lkb->lkb_resource; 3896 r = lkb->lkb_resource;
@@ -4789,37 +3902,12 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4789 goto out; 3902 goto out;
4790 3903
4791 ret_nodeid = ms->m_nodeid; 3904 ret_nodeid = ms->m_nodeid;
4792
4793 /* We sometimes receive a request from the dir node for this
4794 rsb before we've received the dir node's loookup_reply for it.
4795 The request from the dir node implies we're the master, so we set
4796 ourself as master in receive_request_reply, and verify here that
4797 we are indeed the master. */
4798
4799 if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) {
4800 /* This should never happen */
4801 log_error(ls, "receive_lookup_reply %x from %d ret %d "
4802 "master %d dir %d our %d first %x %s",
4803 lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid,
4804 r->res_master_nodeid, r->res_dir_nodeid,
4805 dlm_our_nodeid(), r->res_first_lkid, r->res_name);
4806 }
4807
4808 if (ret_nodeid == dlm_our_nodeid()) { 3905 if (ret_nodeid == dlm_our_nodeid()) {
4809 r->res_master_nodeid = ret_nodeid;
4810 r->res_nodeid = 0; 3906 r->res_nodeid = 0;
4811 do_lookup_list = 1; 3907 ret_nodeid = 0;
4812 r->res_first_lkid = 0; 3908 r->res_first_lkid = 0;
4813 } else if (ret_nodeid == -1) {
4814 /* the remote node doesn't believe it's the dir node */
4815 log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
4816 lkb->lkb_id, ms->m_header.h_nodeid);
4817 r->res_master_nodeid = 0;
4818 r->res_nodeid = -1;
4819 lkb->lkb_nodeid = -1;
4820 } else { 3909 } else {
4821 /* set_master() will set lkb_nodeid from r */ 3910 /* set_master() will copy res_nodeid to lkb_nodeid */
4822 r->res_master_nodeid = ret_nodeid;
4823 r->res_nodeid = ret_nodeid; 3911 r->res_nodeid = ret_nodeid;
4824 } 3912 }
4825 3913
@@ -4834,7 +3922,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4834 _request_lock(r, lkb); 3922 _request_lock(r, lkb);
4835 3923
4836 out_list: 3924 out_list:
4837 if (do_lookup_list) 3925 if (!ret_nodeid)
4838 process_lookup_list(r); 3926 process_lookup_list(r);
4839 out: 3927 out:
4840 unlock_rsb(r); 3928 unlock_rsb(r);
@@ -4842,13 +3930,10 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4842 dlm_put_lkb(lkb); 3930 dlm_put_lkb(lkb);
4843} 3931}
4844 3932
4845static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, 3933static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
4846 uint32_t saved_seq)
4847{ 3934{
4848 int error = 0, noent = 0;
4849
4850 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { 3935 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
4851 log_limit(ls, "receive %d from non-member %d %x %x %d", 3936 log_debug(ls, "ignore non-member message %d from %d %x %x %d",
4852 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, 3937 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
4853 ms->m_remid, ms->m_result); 3938 ms->m_remid, ms->m_result);
4854 return; 3939 return;
@@ -4859,50 +3944,47 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
4859 /* messages sent to a master node */ 3944 /* messages sent to a master node */
4860 3945
4861 case DLM_MSG_REQUEST: 3946 case DLM_MSG_REQUEST:
4862 error = receive_request(ls, ms); 3947 receive_request(ls, ms);
4863 break; 3948 break;
4864 3949
4865 case DLM_MSG_CONVERT: 3950 case DLM_MSG_CONVERT:
4866 error = receive_convert(ls, ms); 3951 receive_convert(ls, ms);
4867 break; 3952 break;
4868 3953
4869 case DLM_MSG_UNLOCK: 3954 case DLM_MSG_UNLOCK:
4870 error = receive_unlock(ls, ms); 3955 receive_unlock(ls, ms);
4871 break; 3956 break;
4872 3957
4873 case DLM_MSG_CANCEL: 3958 case DLM_MSG_CANCEL:
4874 noent = 1; 3959 receive_cancel(ls, ms);
4875 error = receive_cancel(ls, ms);
4876 break; 3960 break;
4877 3961
4878 /* messages sent from a master node (replies to above) */ 3962 /* messages sent from a master node (replies to above) */
4879 3963
4880 case DLM_MSG_REQUEST_REPLY: 3964 case DLM_MSG_REQUEST_REPLY:
4881 error = receive_request_reply(ls, ms); 3965 receive_request_reply(ls, ms);
4882 break; 3966 break;
4883 3967
4884 case DLM_MSG_CONVERT_REPLY: 3968 case DLM_MSG_CONVERT_REPLY:
4885 error = receive_convert_reply(ls, ms); 3969 receive_convert_reply(ls, ms);
4886 break; 3970 break;
4887 3971
4888 case DLM_MSG_UNLOCK_REPLY: 3972 case DLM_MSG_UNLOCK_REPLY:
4889 error = receive_unlock_reply(ls, ms); 3973 receive_unlock_reply(ls, ms);
4890 break; 3974 break;
4891 3975
4892 case DLM_MSG_CANCEL_REPLY: 3976 case DLM_MSG_CANCEL_REPLY:
4893 error = receive_cancel_reply(ls, ms); 3977 receive_cancel_reply(ls, ms);
4894 break; 3978 break;
4895 3979
4896 /* messages sent from a master node (only two types of async msg) */ 3980 /* messages sent from a master node (only two types of async msg) */
4897 3981
4898 case DLM_MSG_GRANT: 3982 case DLM_MSG_GRANT:
4899 noent = 1; 3983 receive_grant(ls, ms);
4900 error = receive_grant(ls, ms);
4901 break; 3984 break;
4902 3985
4903 case DLM_MSG_BAST: 3986 case DLM_MSG_BAST:
4904 noent = 1; 3987 receive_bast(ls, ms);
4905 error = receive_bast(ls, ms);
4906 break; 3988 break;
4907 3989
4908 /* messages sent to a dir node */ 3990 /* messages sent to a dir node */
@@ -4930,37 +4012,6 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
4930 default: 4012 default:
4931 log_error(ls, "unknown message type %d", ms->m_type); 4013 log_error(ls, "unknown message type %d", ms->m_type);
4932 } 4014 }
4933
4934 /*
4935 * When checking for ENOENT, we're checking the result of
4936 * find_lkb(m_remid):
4937 *
4938 * The lock id referenced in the message wasn't found. This may
4939 * happen in normal usage for the async messages and cancel, so
4940 * only use log_debug for them.
4941 *
4942 * Some errors are expected and normal.
4943 */
4944
4945 if (error == -ENOENT && noent) {
4946 log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
4947 ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
4948 ms->m_lkid, saved_seq);
4949 } else if (error == -ENOENT) {
4950 log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
4951 ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
4952 ms->m_lkid, saved_seq);
4953
4954 if (ms->m_type == DLM_MSG_CONVERT)
4955 dlm_dump_rsb_hash(ls, ms->m_hash);
4956 }
4957
4958 if (error == -EINVAL) {
4959 log_error(ls, "receive %d inval from %d lkid %x remid %x "
4960 "saved_seq %u",
4961 ms->m_type, ms->m_header.h_nodeid,
4962 ms->m_lkid, ms->m_remid, saved_seq);
4963 }
4964} 4015}
4965 4016
4966/* If the lockspace is in recovery mode (locking stopped), then normal 4017/* If the lockspace is in recovery mode (locking stopped), then normal
@@ -4975,29 +4026,19 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
4975 int nodeid) 4026 int nodeid)
4976{ 4027{
4977 if (dlm_locking_stopped(ls)) { 4028 if (dlm_locking_stopped(ls)) {
4978 /* If we were a member of this lockspace, left, and rejoined,
4979 other nodes may still be sending us messages from the
4980 lockspace generation before we left. */
4981 if (!ls->ls_generation) {
4982 log_limit(ls, "receive %d from %d ignore old gen",
4983 ms->m_type, nodeid);
4984 return;
4985 }
4986
4987 dlm_add_requestqueue(ls, nodeid, ms); 4029 dlm_add_requestqueue(ls, nodeid, ms);
4988 } else { 4030 } else {
4989 dlm_wait_requestqueue(ls); 4031 dlm_wait_requestqueue(ls);
4990 _receive_message(ls, ms, 0); 4032 _receive_message(ls, ms);
4991 } 4033 }
4992} 4034}
4993 4035
4994/* This is called by dlm_recoverd to process messages that were saved on 4036/* This is called by dlm_recoverd to process messages that were saved on
4995 the requestqueue. */ 4037 the requestqueue. */
4996 4038
4997void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, 4039void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
4998 uint32_t saved_seq)
4999{ 4040{
5000 _receive_message(ls, ms, saved_seq); 4041 _receive_message(ls, ms);
5001} 4042}
5002 4043
5003/* This is called by the midcomms layer when something is received for 4044/* This is called by the midcomms layer when something is received for
@@ -5033,11 +4074,9 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
5033 4074
5034 ls = dlm_find_lockspace_global(hd->h_lockspace); 4075 ls = dlm_find_lockspace_global(hd->h_lockspace);
5035 if (!ls) { 4076 if (!ls) {
5036 if (dlm_config.ci_log_debug) { 4077 if (dlm_config.ci_log_debug)
5037 printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " 4078 log_print("invalid lockspace %x from %d cmd %d type %d",
5038 "%u from %d cmd %d type %d\n", 4079 hd->h_lockspace, nodeid, hd->h_cmd, type);
5039 hd->h_lockspace, nodeid, hd->h_cmd, type);
5040 }
5041 4080
5042 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) 4081 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
5043 dlm_send_ls_not_ready(nodeid, &p->rcom); 4082 dlm_send_ls_not_ready(nodeid, &p->rcom);
@@ -5085,13 +4124,15 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
5085/* A waiting lkb needs recovery if the master node has failed, or 4124/* A waiting lkb needs recovery if the master node has failed, or
5086 the master node is changing (only when no directory is used) */ 4125 the master node is changing (only when no directory is used) */
5087 4126
5088static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, 4127static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
5089 int dir_nodeid)
5090{ 4128{
5091 if (dlm_no_directory(ls)) 4129 if (dlm_is_removed(ls, lkb->lkb_nodeid))
5092 return 1; 4130 return 1;
5093 4131
5094 if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) 4132 if (!dlm_no_directory(ls))
4133 return 0;
4134
4135 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
5095 return 1; 4136 return 1;
5096 4137
5097 return 0; 4138 return 0;
@@ -5108,7 +4149,6 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
5108 struct dlm_lkb *lkb, *safe; 4149 struct dlm_lkb *lkb, *safe;
5109 struct dlm_message *ms_stub; 4150 struct dlm_message *ms_stub;
5110 int wait_type, stub_unlock_result, stub_cancel_result; 4151 int wait_type, stub_unlock_result, stub_cancel_result;
5111 int dir_nodeid;
5112 4152
5113 ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); 4153 ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
5114 if (!ms_stub) { 4154 if (!ms_stub) {
@@ -5120,21 +4160,13 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
5120 4160
5121 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { 4161 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
5122 4162
5123 dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
5124
5125 /* exclude debug messages about unlocks because there can be so 4163 /* exclude debug messages about unlocks because there can be so
5126 many and they aren't very interesting */ 4164 many and they aren't very interesting */
5127 4165
5128 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { 4166 if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
5129 log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " 4167 log_debug(ls, "recover_waiter %x nodeid %d "
5130 "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", 4168 "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
5131 lkb->lkb_id, 4169 lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
5132 lkb->lkb_remid,
5133 lkb->lkb_wait_type,
5134 lkb->lkb_resource->res_nodeid,
5135 lkb->lkb_nodeid,
5136 lkb->lkb_wait_nodeid,
5137 dir_nodeid);
5138 } 4170 }
5139 4171
5140 /* all outstanding lookups, regardless of destination will be 4172 /* all outstanding lookups, regardless of destination will be
@@ -5145,7 +4177,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
5145 continue; 4177 continue;
5146 } 4178 }
5147 4179
5148 if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) 4180 if (!waiter_needs_recovery(ls, lkb))
5149 continue; 4181 continue;
5150 4182
5151 wait_type = lkb->lkb_wait_type; 4183 wait_type = lkb->lkb_wait_type;
@@ -5278,11 +4310,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
5278 ou = is_overlap_unlock(lkb); 4310 ou = is_overlap_unlock(lkb);
5279 err = 0; 4311 err = 0;
5280 4312
5281 log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " 4313 log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
5282 "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " 4314 lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
5283 "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
5284 r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
5285 dlm_dir_nodeid(r), oc, ou);
5286 4315
5287 /* At this point we assume that we won't get a reply to any 4316 /* At this point we assume that we won't get a reply to any
5288 previous op or overlap op on this lock. First, do a big 4317 previous op or overlap op on this lock. First, do a big
@@ -5334,12 +4363,9 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
5334 } 4363 }
5335 } 4364 }
5336 4365
5337 if (err) { 4366 if (err)
5338 log_error(ls, "waiter %x msg %d r_nodeid %d " 4367 log_error(ls, "recover_waiters_post %x %d %x %d %d",
5339 "dir_nodeid %d overlap %d %d", 4368 lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
5340 lkb->lkb_id, mstype, r->res_nodeid,
5341 dlm_dir_nodeid(r), oc, ou);
5342 }
5343 unlock_rsb(r); 4369 unlock_rsb(r);
5344 put_rsb(r); 4370 put_rsb(r);
5345 dlm_put_lkb(lkb); 4371 dlm_put_lkb(lkb);
@@ -5348,187 +4374,110 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
5348 return error; 4374 return error;
5349} 4375}
5350 4376
5351static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, 4377static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
5352 struct list_head *list) 4378 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
5353{ 4379{
4380 struct dlm_ls *ls = r->res_ls;
5354 struct dlm_lkb *lkb, *safe; 4381 struct dlm_lkb *lkb, *safe;
5355 4382
5356 list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { 4383 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
5357 if (!is_master_copy(lkb)) 4384 if (test(ls, lkb)) {
5358 continue; 4385 rsb_set_flag(r, RSB_LOCKS_PURGED);
5359 4386 del_lkb(r, lkb);
5360 /* don't purge lkbs we've added in recover_master_copy for 4387 /* this put should free the lkb */
5361 the current recovery seq */ 4388 if (!dlm_put_lkb(lkb))
5362 4389 log_error(ls, "purged lkb not released");
5363 if (lkb->lkb_recover_seq == ls->ls_recover_seq) 4390 }
5364 continue;
5365
5366 del_lkb(r, lkb);
5367
5368 /* this put should free the lkb */
5369 if (!dlm_put_lkb(lkb))
5370 log_error(ls, "purged mstcpy lkb not released");
5371 } 4391 }
5372} 4392}
5373 4393
5374void dlm_purge_mstcpy_locks(struct dlm_rsb *r) 4394static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
5375{ 4395{
5376 struct dlm_ls *ls = r->res_ls; 4396 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
5377
5378 purge_mstcpy_list(ls, r, &r->res_grantqueue);
5379 purge_mstcpy_list(ls, r, &r->res_convertqueue);
5380 purge_mstcpy_list(ls, r, &r->res_waitqueue);
5381} 4397}
5382 4398
5383static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, 4399static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
5384 struct list_head *list,
5385 int nodeid_gone, unsigned int *count)
5386{ 4400{
5387 struct dlm_lkb *lkb, *safe; 4401 return is_master_copy(lkb);
5388 4402}
5389 list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
5390 if (!is_master_copy(lkb))
5391 continue;
5392
5393 if ((lkb->lkb_nodeid == nodeid_gone) ||
5394 dlm_is_removed(ls, lkb->lkb_nodeid)) {
5395
5396 /* tell recover_lvb to invalidate the lvb
5397 because a node holding EX/PW failed */
5398 if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5399 (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5400 rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5401 }
5402
5403 del_lkb(r, lkb);
5404
5405 /* this put should free the lkb */
5406 if (!dlm_put_lkb(lkb))
5407 log_error(ls, "purged dead lkb not released");
5408 4403
5409 rsb_set_flag(r, RSB_RECOVER_GRANT); 4404static void purge_dead_locks(struct dlm_rsb *r)
4405{
4406 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
4407 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
4408 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
4409}
5410 4410
5411 (*count)++; 4411void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
5412 } 4412{
5413 } 4413 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
4414 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
4415 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
5414} 4416}
5415 4417
5416/* Get rid of locks held by nodes that are gone. */ 4418/* Get rid of locks held by nodes that are gone. */
5417 4419
5418void dlm_recover_purge(struct dlm_ls *ls) 4420int dlm_purge_locks(struct dlm_ls *ls)
5419{ 4421{
5420 struct dlm_rsb *r; 4422 struct dlm_rsb *r;
5421 struct dlm_member *memb;
5422 int nodes_count = 0;
5423 int nodeid_gone = 0;
5424 unsigned int lkb_count = 0;
5425
5426 /* cache one removed nodeid to optimize the common
5427 case of a single node removed */
5428 4423
5429 list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 4424 log_debug(ls, "dlm_purge_locks");
5430 nodes_count++;
5431 nodeid_gone = memb->nodeid;
5432 }
5433
5434 if (!nodes_count)
5435 return;
5436 4425
5437 down_write(&ls->ls_root_sem); 4426 down_write(&ls->ls_root_sem);
5438 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 4427 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
5439 hold_rsb(r); 4428 hold_rsb(r);
5440 lock_rsb(r); 4429 lock_rsb(r);
5441 if (is_master(r)) { 4430 if (is_master(r))
5442 purge_dead_list(ls, r, &r->res_grantqueue, 4431 purge_dead_locks(r);
5443 nodeid_gone, &lkb_count);
5444 purge_dead_list(ls, r, &r->res_convertqueue,
5445 nodeid_gone, &lkb_count);
5446 purge_dead_list(ls, r, &r->res_waitqueue,
5447 nodeid_gone, &lkb_count);
5448 }
5449 unlock_rsb(r); 4432 unlock_rsb(r);
5450 unhold_rsb(r); 4433 unhold_rsb(r);
5451 cond_resched(); 4434
4435 schedule();
5452 } 4436 }
5453 up_write(&ls->ls_root_sem); 4437 up_write(&ls->ls_root_sem);
5454 4438
5455 if (lkb_count) 4439 return 0;
5456 log_debug(ls, "dlm_recover_purge %u locks for %u nodes",
5457 lkb_count, nodes_count);
5458} 4440}
5459 4441
5460static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) 4442static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
5461{ 4443{
5462 struct rb_node *n; 4444 struct dlm_rsb *r, *r_ret = NULL;
5463 struct dlm_rsb *r;
5464 4445
5465 spin_lock(&ls->ls_rsbtbl[bucket].lock); 4446 spin_lock(&ls->ls_rsbtbl[bucket].lock);
5466 for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { 4447 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
5467 r = rb_entry(n, struct dlm_rsb, res_hashnode); 4448 if (!rsb_flag(r, RSB_LOCKS_PURGED))
5468
5469 if (!rsb_flag(r, RSB_RECOVER_GRANT))
5470 continue; 4449 continue;
5471 if (!is_master(r)) {
5472 rsb_clear_flag(r, RSB_RECOVER_GRANT);
5473 continue;
5474 }
5475 hold_rsb(r); 4450 hold_rsb(r);
5476 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 4451 rsb_clear_flag(r, RSB_LOCKS_PURGED);
5477 return r; 4452 r_ret = r;
4453 break;
5478 } 4454 }
5479 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 4455 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
5480 return NULL; 4456 return r_ret;
5481} 4457}
5482 4458
5483/* 4459void dlm_grant_after_purge(struct dlm_ls *ls)
5484 * Attempt to grant locks on resources that we are the master of.
5485 * Locks may have become grantable during recovery because locks
5486 * from departed nodes have been purged (or not rebuilt), allowing
5487 * previously blocked locks to now be granted. The subset of rsb's
5488 * we are interested in are those with lkb's on either the convert or
5489 * waiting queues.
5490 *
5491 * Simplest would be to go through each master rsb and check for non-empty
5492 * convert or waiting queues, and attempt to grant on those rsbs.
5493 * Checking the queues requires lock_rsb, though, for which we'd need
5494 * to release the rsbtbl lock. This would make iterating through all
5495 * rsb's very inefficient. So, we rely on earlier recovery routines
5496 * to set RECOVER_GRANT on any rsb's that we should attempt to grant
5497 * locks for.
5498 */
5499
5500void dlm_recover_grant(struct dlm_ls *ls)
5501{ 4460{
5502 struct dlm_rsb *r; 4461 struct dlm_rsb *r;
5503 int bucket = 0; 4462 int bucket = 0;
5504 unsigned int count = 0;
5505 unsigned int rsb_count = 0;
5506 unsigned int lkb_count = 0;
5507 4463
5508 while (1) { 4464 while (1) {
5509 r = find_grant_rsb(ls, bucket); 4465 r = find_purged_rsb(ls, bucket);
5510 if (!r) { 4466 if (!r) {
5511 if (bucket == ls->ls_rsbtbl_size - 1) 4467 if (bucket == ls->ls_rsbtbl_size - 1)
5512 break; 4468 break;
5513 bucket++; 4469 bucket++;
5514 continue; 4470 continue;
5515 } 4471 }
5516 rsb_count++;
5517 count = 0;
5518 lock_rsb(r); 4472 lock_rsb(r);
5519 /* the RECOVER_GRANT flag is checked in the grant path */ 4473 if (is_master(r)) {
5520 grant_pending_locks(r, &count); 4474 grant_pending_locks(r);
5521 rsb_clear_flag(r, RSB_RECOVER_GRANT); 4475 confirm_master(r, 0);
5522 lkb_count += count; 4476 }
5523 confirm_master(r, 0);
5524 unlock_rsb(r); 4477 unlock_rsb(r);
5525 put_rsb(r); 4478 put_rsb(r);
5526 cond_resched(); 4479 schedule();
5527 } 4480 }
5528
5529 if (lkb_count)
5530 log_debug(ls, "dlm_recover_grant %u locks on %u resources",
5531 lkb_count, rsb_count);
5532} 4481}
5533 4482
5534static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, 4483static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
@@ -5617,8 +4566,6 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5617 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; 4566 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5618 struct dlm_rsb *r; 4567 struct dlm_rsb *r;
5619 struct dlm_lkb *lkb; 4568 struct dlm_lkb *lkb;
5620 uint32_t remid = 0;
5621 int from_nodeid = rc->rc_header.h_nodeid;
5622 int error; 4569 int error;
5623 4570
5624 if (rl->rl_parent_lkid) { 4571 if (rl->rl_parent_lkid) {
@@ -5626,31 +4573,14 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5626 goto out; 4573 goto out;
5627 } 4574 }
5628 4575
5629 remid = le32_to_cpu(rl->rl_lkid);
5630
5631 /* In general we expect the rsb returned to be R_MASTER, but we don't
5632 have to require it. Recovery of masters on one node can overlap
5633 recovery of locks on another node, so one node can send us MSTCPY
5634 locks before we've made ourselves master of this rsb. We can still
5635 add new MSTCPY locks that we receive here without any harm; when
5636 we make ourselves master, dlm_recover_masters() won't touch the
5637 MSTCPY locks we've received early. */
5638
5639 error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 4576 error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
5640 from_nodeid, R_RECEIVE_RECOVER, &r); 4577 R_MASTER, &r);
5641 if (error) 4578 if (error)
5642 goto out; 4579 goto out;
5643 4580
5644 lock_rsb(r); 4581 lock_rsb(r);
5645 4582
5646 if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { 4583 lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid));
5647 log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
5648 from_nodeid, remid);
5649 error = -EBADR;
5650 goto out_unlock;
5651 }
5652
5653 lkb = search_remid(r, from_nodeid, remid);
5654 if (lkb) { 4584 if (lkb) {
5655 error = -EEXIST; 4585 error = -EEXIST;
5656 goto out_remid; 4586 goto out_remid;
@@ -5669,25 +4599,19 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5669 attach_lkb(r, lkb); 4599 attach_lkb(r, lkb);
5670 add_lkb(r, lkb, rl->rl_status); 4600 add_lkb(r, lkb, rl->rl_status);
5671 error = 0; 4601 error = 0;
5672 ls->ls_recover_locks_in++;
5673
5674 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
5675 rsb_set_flag(r, RSB_RECOVER_GRANT);
5676 4602
5677 out_remid: 4603 out_remid:
5678 /* this is the new value returned to the lock holder for 4604 /* this is the new value returned to the lock holder for
5679 saving in its process-copy lkb */ 4605 saving in its process-copy lkb */
5680 rl->rl_remid = cpu_to_le32(lkb->lkb_id); 4606 rl->rl_remid = cpu_to_le32(lkb->lkb_id);
5681 4607
5682 lkb->lkb_recover_seq = ls->ls_recover_seq;
5683
5684 out_unlock: 4608 out_unlock:
5685 unlock_rsb(r); 4609 unlock_rsb(r);
5686 put_rsb(r); 4610 put_rsb(r);
5687 out: 4611 out:
5688 if (error && error != -EEXIST) 4612 if (error)
5689 log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", 4613 log_debug(ls, "recover_master_copy %d %x", error,
5690 from_nodeid, remid, error); 4614 le32_to_cpu(rl->rl_lkid));
5691 rl->rl_result = cpu_to_le32(error); 4615 rl->rl_result = cpu_to_le32(error);
5692 return error; 4616 return error;
5693} 4617}
@@ -5698,52 +4622,41 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5698 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; 4622 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5699 struct dlm_rsb *r; 4623 struct dlm_rsb *r;
5700 struct dlm_lkb *lkb; 4624 struct dlm_lkb *lkb;
5701 uint32_t lkid, remid; 4625 int error;
5702 int error, result;
5703
5704 lkid = le32_to_cpu(rl->rl_lkid);
5705 remid = le32_to_cpu(rl->rl_remid);
5706 result = le32_to_cpu(rl->rl_result);
5707 4626
5708 error = find_lkb(ls, lkid, &lkb); 4627 error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb);
5709 if (error) { 4628 if (error) {
5710 log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", 4629 log_error(ls, "recover_process_copy no lkid %x",
5711 lkid, rc->rc_header.h_nodeid, remid, result); 4630 le32_to_cpu(rl->rl_lkid));
5712 return error; 4631 return error;
5713 } 4632 }
5714 4633
4634 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
4635
4636 error = le32_to_cpu(rl->rl_result);
4637
5715 r = lkb->lkb_resource; 4638 r = lkb->lkb_resource;
5716 hold_rsb(r); 4639 hold_rsb(r);
5717 lock_rsb(r); 4640 lock_rsb(r);
5718 4641
5719 if (!is_process_copy(lkb)) { 4642 switch (error) {
5720 log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
5721 lkid, rc->rc_header.h_nodeid, remid, result);
5722 dlm_dump_rsb(r);
5723 unlock_rsb(r);
5724 put_rsb(r);
5725 dlm_put_lkb(lkb);
5726 return -EINVAL;
5727 }
5728
5729 switch (result) {
5730 case -EBADR: 4643 case -EBADR:
5731 /* There's a chance the new master received our lock before 4644 /* There's a chance the new master received our lock before
5732 dlm_recover_master_reply(), this wouldn't happen if we did 4645 dlm_recover_master_reply(), this wouldn't happen if we did
5733 a barrier between recover_masters and recover_locks. */ 4646 a barrier between recover_masters and recover_locks. */
5734 4647 log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
5735 log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", 4648 (unsigned long)r, r->res_name);
5736 lkid, rc->rc_header.h_nodeid, remid, result);
5737
5738 dlm_send_rcom_lock(r, lkb); 4649 dlm_send_rcom_lock(r, lkb);
5739 goto out; 4650 goto out;
5740 case -EEXIST: 4651 case -EEXIST:
4652 log_debug(ls, "master copy exists %x", lkb->lkb_id);
4653 /* fall through */
5741 case 0: 4654 case 0:
5742 lkb->lkb_remid = remid; 4655 lkb->lkb_remid = le32_to_cpu(rl->rl_remid);
5743 break; 4656 break;
5744 default: 4657 default:
5745 log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", 4658 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
5746 lkid, rc->rc_header.h_nodeid, remid, result); 4659 error, lkb->lkb_id);
5747 } 4660 }
5748 4661
5749 /* an ack for dlm_recover_locks() which waits for replies from 4662 /* an ack for dlm_recover_locks() which waits for replies from
@@ -6032,18 +4945,15 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6032 return error; 4945 return error;
6033} 4946}
6034 4947
6035/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't 4948/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
6036 granted. Regardless of what rsb queue the lock is on, it's removed and 4949 Regardless of what rsb queue the lock is on, it's removed and freed. */
6037 freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
6038 if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
6039 4950
6040static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 4951static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6041{ 4952{
6042 struct dlm_args args; 4953 struct dlm_args args;
6043 int error; 4954 int error;
6044 4955
6045 set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, 4956 set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args);
6046 lkb->lkb_ua, &args);
6047 4957
6048 error = unlock_lock(ls, lkb, &args); 4958 error = unlock_lock(ls, lkb, &args);
6049 if (error == -DLM_EUNLOCK) 4959 if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 5e0c72e36a9..265017a7c3e 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -14,10 +14,8 @@
14#define __LOCK_DOT_H__ 14#define __LOCK_DOT_H__
15 15
16void dlm_dump_rsb(struct dlm_rsb *r); 16void dlm_dump_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len);
18void dlm_print_lkb(struct dlm_lkb *lkb); 17void dlm_print_lkb(struct dlm_lkb *lkb);
19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, 18void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
20 uint32_t saved_seq);
21void dlm_receive_buffer(union dlm_packet *p, int nodeid); 19void dlm_receive_buffer(union dlm_packet *p, int nodeid);
22int dlm_modes_compat(int mode1, int mode2); 20int dlm_modes_compat(int mode1, int mode2);
23void dlm_put_rsb(struct dlm_rsb *r); 21void dlm_put_rsb(struct dlm_rsb *r);
@@ -29,15 +27,10 @@ void dlm_unlock_recovery(struct dlm_ls *ls);
29void dlm_scan_waiters(struct dlm_ls *ls); 27void dlm_scan_waiters(struct dlm_ls *ls);
30void dlm_scan_timeout(struct dlm_ls *ls); 28void dlm_scan_timeout(struct dlm_ls *ls);
31void dlm_adjust_timeouts(struct dlm_ls *ls); 29void dlm_adjust_timeouts(struct dlm_ls *ls);
32int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
33 unsigned int flags, int *r_nodeid, int *result);
34 30
35int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, 31int dlm_purge_locks(struct dlm_ls *ls);
36 struct dlm_rsb **r_ret);
37
38void dlm_recover_purge(struct dlm_ls *ls);
39void dlm_purge_mstcpy_locks(struct dlm_rsb *r); 32void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
40void dlm_recover_grant(struct dlm_ls *ls); 33void dlm_grant_after_purge(struct dlm_ls *ls);
41int dlm_recover_waiters_post(struct dlm_ls *ls); 34int dlm_recover_waiters_post(struct dlm_ls *ls);
42void dlm_recover_waiters_pre(struct dlm_ls *ls); 35void dlm_recover_waiters_pre(struct dlm_ls *ls);
43int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); 36int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 2e99fb0c973..a1d8f1af144 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -74,19 +74,6 @@ static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
74 return len; 74 return len;
75} 75}
76 76
77static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf)
78{
79 return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls));
80}
81
82static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len)
83{
84 int val = simple_strtoul(buf, NULL, 0);
85 if (val == 1)
86 set_bit(LSFL_NODIR, &ls->ls_flags);
87 return len;
88}
89
90static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) 77static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
91{ 78{
92 uint32_t status = dlm_recover_status(ls); 79 uint32_t status = dlm_recover_status(ls);
@@ -120,12 +107,6 @@ static struct dlm_attr dlm_attr_id = {
120 .store = dlm_id_store 107 .store = dlm_id_store
121}; 108};
122 109
123static struct dlm_attr dlm_attr_nodir = {
124 .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR},
125 .show = dlm_nodir_show,
126 .store = dlm_nodir_store
127};
128
129static struct dlm_attr dlm_attr_recover_status = { 110static struct dlm_attr dlm_attr_recover_status = {
130 .attr = {.name = "recover_status", .mode = S_IRUGO}, 111 .attr = {.name = "recover_status", .mode = S_IRUGO},
131 .show = dlm_recover_status_show 112 .show = dlm_recover_status_show
@@ -140,7 +121,6 @@ static struct attribute *dlm_attrs[] = {
140 &dlm_attr_control.attr, 121 &dlm_attr_control.attr,
141 &dlm_attr_event.attr, 122 &dlm_attr_event.attr,
142 &dlm_attr_id.attr, 123 &dlm_attr_id.attr,
143 &dlm_attr_nodir.attr,
144 &dlm_attr_recover_status.attr, 124 &dlm_attr_recover_status.attr,
145 &dlm_attr_recover_nodeid.attr, 125 &dlm_attr_recover_nodeid.attr,
146 NULL, 126 NULL,
@@ -406,15 +386,12 @@ static void threads_stop(void)
406 dlm_lowcomms_stop(); 386 dlm_lowcomms_stop();
407} 387}
408 388
409static int new_lockspace(const char *name, const char *cluster, 389static int new_lockspace(const char *name, int namelen, void **lockspace,
410 uint32_t flags, int lvblen, 390 uint32_t flags, int lvblen)
411 const struct dlm_lockspace_ops *ops, void *ops_arg,
412 int *ops_result, dlm_lockspace_t **lockspace)
413{ 391{
414 struct dlm_ls *ls; 392 struct dlm_ls *ls;
415 int i, size, error; 393 int i, size, error;
416 int do_unreg = 0; 394 int do_unreg = 0;
417 int namelen = strlen(name);
418 395
419 if (namelen > DLM_LOCKSPACE_LEN) 396 if (namelen > DLM_LOCKSPACE_LEN)
420 return -EINVAL; 397 return -EINVAL;
@@ -426,24 +403,8 @@ static int new_lockspace(const char *name, const char *cluster,
426 return -EINVAL; 403 return -EINVAL;
427 404
428 if (!dlm_user_daemon_available()) { 405 if (!dlm_user_daemon_available()) {
429 log_print("dlm user daemon not available"); 406 module_put(THIS_MODULE);
430 error = -EUNATCH; 407 return -EUNATCH;
431 goto out;
432 }
433
434 if (ops && ops_result) {
435 if (!dlm_config.ci_recover_callbacks)
436 *ops_result = -EOPNOTSUPP;
437 else
438 *ops_result = 0;
439 }
440
441 if (dlm_config.ci_recover_callbacks && cluster &&
442 strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
443 log_print("dlm cluster name %s mismatch %s",
444 dlm_config.ci_cluster_name, cluster);
445 error = -EBADR;
446 goto out;
447 } 408 }
448 409
449 error = 0; 410 error = 0;
@@ -481,11 +442,6 @@ static int new_lockspace(const char *name, const char *cluster,
481 ls->ls_flags = 0; 442 ls->ls_flags = 0;
482 ls->ls_scan_time = jiffies; 443 ls->ls_scan_time = jiffies;
483 444
484 if (ops && dlm_config.ci_recover_callbacks) {
485 ls->ls_ops = ops;
486 ls->ls_ops_arg = ops_arg;
487 }
488
489 if (flags & DLM_LSFL_TIMEWARN) 445 if (flags & DLM_LSFL_TIMEWARN)
490 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 446 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
491 447
@@ -501,23 +457,25 @@ static int new_lockspace(const char *name, const char *cluster,
501 if (!ls->ls_rsbtbl) 457 if (!ls->ls_rsbtbl)
502 goto out_lsfree; 458 goto out_lsfree;
503 for (i = 0; i < size; i++) { 459 for (i = 0; i < size; i++) {
504 ls->ls_rsbtbl[i].keep.rb_node = NULL; 460 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
505 ls->ls_rsbtbl[i].toss.rb_node = NULL; 461 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
506 spin_lock_init(&ls->ls_rsbtbl[i].lock); 462 spin_lock_init(&ls->ls_rsbtbl[i].lock);
507 } 463 }
508 464
509 spin_lock_init(&ls->ls_remove_spin);
510
511 for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
512 ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
513 GFP_KERNEL);
514 if (!ls->ls_remove_names[i])
515 goto out_rsbtbl;
516 }
517
518 idr_init(&ls->ls_lkbidr); 465 idr_init(&ls->ls_lkbidr);
519 spin_lock_init(&ls->ls_lkbidr_spin); 466 spin_lock_init(&ls->ls_lkbidr_spin);
520 467
468 size = dlm_config.ci_dirtbl_size;
469 ls->ls_dirtbl_size = size;
470
471 ls->ls_dirtbl = vmalloc(sizeof(struct dlm_dirtable) * size);
472 if (!ls->ls_dirtbl)
473 goto out_lkbfree;
474 for (i = 0; i < size; i++) {
475 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
476 spin_lock_init(&ls->ls_dirtbl[i].lock);
477 }
478
521 INIT_LIST_HEAD(&ls->ls_waiters); 479 INIT_LIST_HEAD(&ls->ls_waiters);
522 mutex_init(&ls->ls_waiters_mutex); 480 mutex_init(&ls->ls_waiters_mutex);
523 INIT_LIST_HEAD(&ls->ls_orphans); 481 INIT_LIST_HEAD(&ls->ls_orphans);
@@ -565,23 +523,18 @@ static int new_lockspace(const char *name, const char *cluster,
565 523
566 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); 524 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
567 if (!ls->ls_recover_buf) 525 if (!ls->ls_recover_buf)
568 goto out_lkbidr; 526 goto out_dirfree;
569
570 ls->ls_slot = 0;
571 ls->ls_num_slots = 0;
572 ls->ls_slots_size = 0;
573 ls->ls_slots = NULL;
574 527
575 INIT_LIST_HEAD(&ls->ls_recover_list); 528 INIT_LIST_HEAD(&ls->ls_recover_list);
576 spin_lock_init(&ls->ls_recover_list_lock); 529 spin_lock_init(&ls->ls_recover_list_lock);
577 idr_init(&ls->ls_recover_idr);
578 spin_lock_init(&ls->ls_recover_idr_lock);
579 ls->ls_recover_list_count = 0; 530 ls->ls_recover_list_count = 0;
580 ls->ls_local_handle = ls; 531 ls->ls_local_handle = ls;
581 init_waitqueue_head(&ls->ls_wait_general); 532 init_waitqueue_head(&ls->ls_wait_general);
582 INIT_LIST_HEAD(&ls->ls_root_list); 533 INIT_LIST_HEAD(&ls->ls_root_list);
583 init_rwsem(&ls->ls_root_sem); 534 init_rwsem(&ls->ls_root_sem);
584 535
536 down_write(&ls->ls_in_recovery);
537
585 spin_lock(&lslist_lock); 538 spin_lock(&lslist_lock);
586 ls->ls_create_count = 1; 539 ls->ls_create_count = 1;
587 list_add(&ls->ls_list, &lslist); 540 list_add(&ls->ls_list, &lslist);
@@ -595,24 +548,13 @@ static int new_lockspace(const char *name, const char *cluster,
595 } 548 }
596 } 549 }
597 550
598 init_waitqueue_head(&ls->ls_recover_lock_wait); 551 /* needs to find ls in lslist */
599
600 /*
601 * Once started, dlm_recoverd first looks for ls in lslist, then
602 * initializes ls_in_recovery as locked in "down" mode. We need
603 * to wait for the wakeup from dlm_recoverd because in_recovery
604 * has to start out in down mode.
605 */
606
607 error = dlm_recoverd_start(ls); 552 error = dlm_recoverd_start(ls);
608 if (error) { 553 if (error) {
609 log_error(ls, "can't start dlm_recoverd %d", error); 554 log_error(ls, "can't start dlm_recoverd %d", error);
610 goto out_callback; 555 goto out_callback;
611 } 556 }
612 557
613 wait_event(ls->ls_recover_lock_wait,
614 test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
615
616 ls->ls_kobj.kset = dlm_kset; 558 ls->ls_kobj.kset = dlm_kset;
617 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, 559 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
618 "%s", ls->ls_name); 560 "%s", ls->ls_name);
@@ -656,15 +598,11 @@ static int new_lockspace(const char *name, const char *cluster,
656 spin_lock(&lslist_lock); 598 spin_lock(&lslist_lock);
657 list_del(&ls->ls_list); 599 list_del(&ls->ls_list);
658 spin_unlock(&lslist_lock); 600 spin_unlock(&lslist_lock);
659 idr_destroy(&ls->ls_recover_idr);
660 kfree(ls->ls_recover_buf); 601 kfree(ls->ls_recover_buf);
661 out_lkbidr: 602 out_dirfree:
603 vfree(ls->ls_dirtbl);
604 out_lkbfree:
662 idr_destroy(&ls->ls_lkbidr); 605 idr_destroy(&ls->ls_lkbidr);
663 for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
664 if (ls->ls_remove_names[i])
665 kfree(ls->ls_remove_names[i]);
666 }
667 out_rsbtbl:
668 vfree(ls->ls_rsbtbl); 606 vfree(ls->ls_rsbtbl);
669 out_lsfree: 607 out_lsfree:
670 if (do_unreg) 608 if (do_unreg)
@@ -676,10 +614,8 @@ static int new_lockspace(const char *name, const char *cluster,
676 return error; 614 return error;
677} 615}
678 616
679int dlm_new_lockspace(const char *name, const char *cluster, 617int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
680 uint32_t flags, int lvblen, 618 uint32_t flags, int lvblen)
681 const struct dlm_lockspace_ops *ops, void *ops_arg,
682 int *ops_result, dlm_lockspace_t **lockspace)
683{ 619{
684 int error = 0; 620 int error = 0;
685 621
@@ -689,8 +625,7 @@ int dlm_new_lockspace(const char *name, const char *cluster,
689 if (error) 625 if (error)
690 goto out; 626 goto out;
691 627
692 error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, 628 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
693 ops_result, lockspace);
694 if (!error) 629 if (!error)
695 ls_count++; 630 ls_count++;
696 if (error > 0) 631 if (error > 0)
@@ -750,7 +685,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
750static int release_lockspace(struct dlm_ls *ls, int force) 685static int release_lockspace(struct dlm_ls *ls, int force)
751{ 686{
752 struct dlm_rsb *rsb; 687 struct dlm_rsb *rsb;
753 struct rb_node *n; 688 struct list_head *head;
754 int i, busy, rv; 689 int i, busy, rv;
755 690
756 busy = lockspace_busy(ls, force); 691 busy = lockspace_busy(ls, force);
@@ -792,6 +727,13 @@ static int release_lockspace(struct dlm_ls *ls, int force)
792 kfree(ls->ls_recover_buf); 727 kfree(ls->ls_recover_buf);
793 728
794 /* 729 /*
730 * Free direntry structs.
731 */
732
733 dlm_dir_clear(ls);
734 vfree(ls->ls_dirtbl);
735
736 /*
795 * Free all lkb's in idr 737 * Free all lkb's in idr
796 */ 738 */
797 739
@@ -804,24 +746,26 @@ static int release_lockspace(struct dlm_ls *ls, int force)
804 */ 746 */
805 747
806 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 748 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
807 while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { 749 head = &ls->ls_rsbtbl[i].list;
808 rsb = rb_entry(n, struct dlm_rsb, res_hashnode); 750 while (!list_empty(head)) {
809 rb_erase(n, &ls->ls_rsbtbl[i].keep); 751 rsb = list_entry(head->next, struct dlm_rsb,
752 res_hashchain);
753
754 list_del(&rsb->res_hashchain);
810 dlm_free_rsb(rsb); 755 dlm_free_rsb(rsb);
811 } 756 }
812 757
813 while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { 758 head = &ls->ls_rsbtbl[i].toss;
814 rsb = rb_entry(n, struct dlm_rsb, res_hashnode); 759 while (!list_empty(head)) {
815 rb_erase(n, &ls->ls_rsbtbl[i].toss); 760 rsb = list_entry(head->next, struct dlm_rsb,
761 res_hashchain);
762 list_del(&rsb->res_hashchain);
816 dlm_free_rsb(rsb); 763 dlm_free_rsb(rsb);
817 } 764 }
818 } 765 }
819 766
820 vfree(ls->ls_rsbtbl); 767 vfree(ls->ls_rsbtbl);
821 768
822 for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
823 kfree(ls->ls_remove_names[i]);
824
825 while (!list_empty(&ls->ls_new_rsb)) { 769 while (!list_empty(&ls->ls_new_rsb)) {
826 rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, 770 rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
827 res_hashchain); 771 res_hashchain);
@@ -835,6 +779,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
835 779
836 dlm_purge_requestqueue(ls); 780 dlm_purge_requestqueue(ls);
837 kfree(ls->ls_recover_args); 781 kfree(ls->ls_recover_args);
782 dlm_clear_free_entries(ls);
838 dlm_clear_members(ls); 783 dlm_clear_members(ls);
839 dlm_clear_members_gone(ls); 784 dlm_clear_members_gone(ls);
840 kfree(ls->ls_node_array); 785 kfree(ls->ls_node_array);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index dd87a31bcc2..990626e7da8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <net/sctp/sctp.h>
56#include <net/sctp/user.h> 55#include <net/sctp/user.h>
57#include <net/ipv6.h> 56#include <net/ipv6.h>
58 57
@@ -140,19 +139,8 @@ struct writequeue_entry {
140 struct connection *con; 139 struct connection *con;
141}; 140};
142 141
143struct dlm_node_addr {
144 struct list_head list;
145 int nodeid;
146 int addr_count;
147 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
148};
149
150static LIST_HEAD(dlm_node_addrs);
151static DEFINE_SPINLOCK(dlm_node_addrs_spin);
152
153static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; 142static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
154static int dlm_local_count; 143static int dlm_local_count;
155static int dlm_allow_conn;
156 144
157/* Work queues */ 145/* Work queues */
158static struct workqueue_struct *recv_workqueue; 146static struct workqueue_struct *recv_workqueue;
@@ -274,146 +262,31 @@ static struct connection *assoc2con(int assoc_id)
274 return NULL; 262 return NULL;
275} 263}
276 264
277static struct dlm_node_addr *find_node_addr(int nodeid) 265static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
278{
279 struct dlm_node_addr *na;
280
281 list_for_each_entry(na, &dlm_node_addrs, list) {
282 if (na->nodeid == nodeid)
283 return na;
284 }
285 return NULL;
286}
287
288static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
289{
290 switch (x->ss_family) {
291 case AF_INET: {
292 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
293 struct sockaddr_in *siny = (struct sockaddr_in *)y;
294 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
295 return 0;
296 if (sinx->sin_port != siny->sin_port)
297 return 0;
298 break;
299 }
300 case AF_INET6: {
301 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
302 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
303 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
304 return 0;
305 if (sinx->sin6_port != siny->sin6_port)
306 return 0;
307 break;
308 }
309 default:
310 return 0;
311 }
312 return 1;
313}
314
315static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
316 struct sockaddr *sa_out)
317{ 266{
318 struct sockaddr_storage sas; 267 struct sockaddr_storage addr;
319 struct dlm_node_addr *na; 268 int error;
320 269
321 if (!dlm_local_count) 270 if (!dlm_local_count)
322 return -1; 271 return -1;
323 272
324 spin_lock(&dlm_node_addrs_spin); 273 error = dlm_nodeid_to_addr(nodeid, &addr);
325 na = find_node_addr(nodeid); 274 if (error)
326 if (na && na->addr_count) 275 return error;
327 memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
328 spin_unlock(&dlm_node_addrs_spin);
329
330 if (!na)
331 return -EEXIST;
332
333 if (!na->addr_count)
334 return -ENOENT;
335
336 if (sas_out)
337 memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
338
339 if (!sa_out)
340 return 0;
341 276
342 if (dlm_local_addr[0]->ss_family == AF_INET) { 277 if (dlm_local_addr[0]->ss_family == AF_INET) {
343 struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; 278 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
344 struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; 279 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
345 ret4->sin_addr.s_addr = in4->sin_addr.s_addr; 280 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
346 } else { 281 } else {
347 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; 282 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
348 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; 283 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
349 ret6->sin6_addr = in6->sin6_addr; 284 ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
350 } 285 }
351 286
352 return 0; 287 return 0;
353} 288}
354 289
355static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
356{
357 struct dlm_node_addr *na;
358 int rv = -EEXIST;
359
360 spin_lock(&dlm_node_addrs_spin);
361 list_for_each_entry(na, &dlm_node_addrs, list) {
362 if (!na->addr_count)
363 continue;
364
365 if (!addr_compare(na->addr[0], addr))
366 continue;
367
368 *nodeid = na->nodeid;
369 rv = 0;
370 break;
371 }
372 spin_unlock(&dlm_node_addrs_spin);
373 return rv;
374}
375
376int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
377{
378 struct sockaddr_storage *new_addr;
379 struct dlm_node_addr *new_node, *na;
380
381 new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
382 if (!new_node)
383 return -ENOMEM;
384
385 new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
386 if (!new_addr) {
387 kfree(new_node);
388 return -ENOMEM;
389 }
390
391 memcpy(new_addr, addr, len);
392
393 spin_lock(&dlm_node_addrs_spin);
394 na = find_node_addr(nodeid);
395 if (!na) {
396 new_node->nodeid = nodeid;
397 new_node->addr[0] = new_addr;
398 new_node->addr_count = 1;
399 list_add(&new_node->list, &dlm_node_addrs);
400 spin_unlock(&dlm_node_addrs_spin);
401 return 0;
402 }
403
404 if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
405 spin_unlock(&dlm_node_addrs_spin);
406 kfree(new_addr);
407 kfree(new_node);
408 return -ENOSPC;
409 }
410
411 na->addr[na->addr_count++] = new_addr;
412 spin_unlock(&dlm_node_addrs_spin);
413 kfree(new_node);
414 return 0;
415}
416
417/* Data available on socket or listen socket received a connect */ 290/* Data available on socket or listen socket received a connect */
418static void lowcomms_data_ready(struct sock *sk, int count_unused) 291static void lowcomms_data_ready(struct sock *sk, int count_unused)
419{ 292{
@@ -473,7 +346,7 @@ int dlm_lowcomms_connect_node(int nodeid)
473} 346}
474 347
475/* Make a socket active */ 348/* Make a socket active */
476static void add_sock(struct socket *sock, struct connection *con) 349static int add_sock(struct socket *sock, struct connection *con)
477{ 350{
478 con->sock = sock; 351 con->sock = sock;
479 352
@@ -483,6 +356,7 @@ static void add_sock(struct socket *sock, struct connection *con)
483 con->sock->sk->sk_state_change = lowcomms_state_change; 356 con->sock->sk->sk_state_change = lowcomms_state_change;
484 con->sock->sk->sk_user_data = con; 357 con->sock->sk->sk_user_data = con;
485 con->sock->sk->sk_allocation = GFP_NOFS; 358 con->sock->sk->sk_allocation = GFP_NOFS;
359 return 0;
486} 360}
487 361
488/* Add the port number to an IPv6 or 4 sockaddr and return the address 362/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -600,6 +474,9 @@ static void process_sctp_notification(struct connection *con,
600 int prim_len, ret; 474 int prim_len, ret;
601 int addr_len; 475 int addr_len;
602 struct connection *new_con; 476 struct connection *new_con;
477 sctp_peeloff_arg_t parg;
478 int parglen = sizeof(parg);
479 int err;
603 480
604 /* 481 /*
605 * We get this before any data for an association. 482 * We get this before any data for an association.
@@ -634,7 +511,7 @@ static void process_sctp_notification(struct connection *con,
634 return; 511 return;
635 } 512 }
636 make_sockaddr(&prim.ssp_addr, 0, &addr_len); 513 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
637 if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { 514 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
638 unsigned char *b=(unsigned char *)&prim.ssp_addr; 515 unsigned char *b=(unsigned char *)&prim.ssp_addr;
639 log_print("reject connect from unknown addr"); 516 log_print("reject connect from unknown addr");
640 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 517 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
@@ -648,19 +525,23 @@ static void process_sctp_notification(struct connection *con,
648 return; 525 return;
649 526
650 /* Peel off a new sock */ 527 /* Peel off a new sock */
651 sctp_lock_sock(con->sock->sk); 528 parg.associd = sn->sn_assoc_change.sac_assoc_id;
652 ret = sctp_do_peeloff(con->sock->sk, 529 ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
653 sn->sn_assoc_change.sac_assoc_id, 530 SCTP_SOCKOPT_PEELOFF,
654 &new_con->sock); 531 (void *)&parg, &parglen);
655 sctp_release_sock(con->sock->sk);
656 if (ret < 0) { 532 if (ret < 0) {
657 log_print("Can't peel off a socket for " 533 log_print("Can't peel off a socket for "
658 "connection %d to node %d: err=%d", 534 "connection %d to node %d: err=%d",
659 (int)sn->sn_assoc_change.sac_assoc_id, 535 parg.associd, nodeid, ret);
660 nodeid, ret); 536 return;
537 }
538 new_con->sock = sockfd_lookup(parg.sd, &err);
539 if (!new_con->sock) {
540 log_print("sockfd_lookup error %d", err);
661 return; 541 return;
662 } 542 }
663 add_sock(new_con->sock, new_con); 543 add_sock(new_con->sock, new_con);
544 sockfd_put(new_con->sock);
664 545
665 log_print("connecting to %d sctp association %d", 546 log_print("connecting to %d sctp association %d",
666 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 547 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
@@ -835,13 +716,6 @@ static int tcp_accept_from_sock(struct connection *con)
835 struct connection *newcon; 716 struct connection *newcon;
836 struct connection *addcon; 717 struct connection *addcon;
837 718
838 mutex_lock(&connections_lock);
839 if (!dlm_allow_conn) {
840 mutex_unlock(&connections_lock);
841 return -1;
842 }
843 mutex_unlock(&connections_lock);
844
845 memset(&peeraddr, 0, sizeof(peeraddr)); 719 memset(&peeraddr, 0, sizeof(peeraddr));
846 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, 720 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
847 IPPROTO_TCP, &newsock); 721 IPPROTO_TCP, &newsock);
@@ -871,7 +745,7 @@ static int tcp_accept_from_sock(struct connection *con)
871 745
872 /* Get the new node's NODEID */ 746 /* Get the new node's NODEID */
873 make_sockaddr(&peeraddr, 0, &len); 747 make_sockaddr(&peeraddr, 0, &len);
874 if (addr_to_nodeid(&peeraddr, &nodeid)) { 748 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
875 unsigned char *b=(unsigned char *)&peeraddr; 749 unsigned char *b=(unsigned char *)&peeraddr;
876 log_print("connect from non cluster node"); 750 log_print("connect from non cluster node");
877 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, 751 print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
@@ -986,7 +860,7 @@ static void sctp_init_assoc(struct connection *con)
986 if (con->retries++ > MAX_CONNECT_RETRIES) 860 if (con->retries++ > MAX_CONNECT_RETRIES)
987 return; 861 return;
988 862
989 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { 863 if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
990 log_print("no address for nodeid %d", con->nodeid); 864 log_print("no address for nodeid %d", con->nodeid);
991 return; 865 return;
992 } 866 }
@@ -1052,11 +926,11 @@ static void sctp_init_assoc(struct connection *con)
1052/* Connect a new socket to its peer */ 926/* Connect a new socket to its peer */
1053static void tcp_connect_to_sock(struct connection *con) 927static void tcp_connect_to_sock(struct connection *con)
1054{ 928{
929 int result = -EHOSTUNREACH;
1055 struct sockaddr_storage saddr, src_addr; 930 struct sockaddr_storage saddr, src_addr;
1056 int addr_len; 931 int addr_len;
1057 struct socket *sock = NULL; 932 struct socket *sock = NULL;
1058 int one = 1; 933 int one = 1;
1059 int result;
1060 934
1061 if (con->nodeid == 0) { 935 if (con->nodeid == 0) {
1062 log_print("attempt to connect sock 0 foiled"); 936 log_print("attempt to connect sock 0 foiled");
@@ -1068,8 +942,10 @@ static void tcp_connect_to_sock(struct connection *con)
1068 goto out; 942 goto out;
1069 943
1070 /* Some odd races can cause double-connects, ignore them */ 944 /* Some odd races can cause double-connects, ignore them */
1071 if (con->sock) 945 if (con->sock) {
946 result = 0;
1072 goto out; 947 goto out;
948 }
1073 949
1074 /* Create a socket to communicate with */ 950 /* Create a socket to communicate with */
1075 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, 951 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
@@ -1078,11 +954,8 @@ static void tcp_connect_to_sock(struct connection *con)
1078 goto out_err; 954 goto out_err;
1079 955
1080 memset(&saddr, 0, sizeof(saddr)); 956 memset(&saddr, 0, sizeof(saddr));
1081 result = nodeid_to_addr(con->nodeid, &saddr, NULL); 957 if (dlm_nodeid_to_addr(con->nodeid, &saddr))
1082 if (result < 0) {
1083 log_print("no address for nodeid %d", con->nodeid);
1084 goto out_err; 958 goto out_err;
1085 }
1086 959
1087 sock->sk->sk_user_data = con; 960 sock->sk->sk_user_data = con;
1088 con->rx_action = receive_from_sock; 961 con->rx_action = receive_from_sock;
@@ -1108,7 +981,8 @@ static void tcp_connect_to_sock(struct connection *con)
1108 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, 981 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
1109 sizeof(one)); 982 sizeof(one));
1110 983
1111 result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 984 result =
985 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
1112 O_NONBLOCK); 986 O_NONBLOCK);
1113 if (result == -EINPROGRESS) 987 if (result == -EINPROGRESS)
1114 result = 0; 988 result = 0;
@@ -1126,17 +1000,11 @@ out_err:
1126 * Some errors are fatal and this list might need adjusting. For other 1000 * Some errors are fatal and this list might need adjusting. For other
1127 * errors we try again until the max number of retries is reached. 1001 * errors we try again until the max number of retries is reached.
1128 */ 1002 */
1129 if (result != -EHOSTUNREACH && 1003 if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
1130 result != -ENETUNREACH && 1004 result != -ENETDOWN && result != -EINVAL
1131 result != -ENETDOWN && 1005 && result != -EPROTONOSUPPORT) {
1132 result != -EINVAL &&
1133 result != -EPROTONOSUPPORT) {
1134 log_print("connect %d try %d error %d", con->nodeid,
1135 con->retries, result);
1136 mutex_unlock(&con->sock_mutex);
1137 msleep(1000);
1138 lowcomms_connect_sock(con); 1006 lowcomms_connect_sock(con);
1139 return; 1007 result = 0;
1140 } 1008 }
1141out: 1009out:
1142 mutex_unlock(&con->sock_mutex); 1010 mutex_unlock(&con->sock_mutex);
@@ -1174,8 +1042,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1174 if (result < 0) { 1042 if (result < 0) {
1175 log_print("Failed to set SO_REUSEADDR on socket: %d", result); 1043 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
1176 } 1044 }
1045 sock->sk->sk_user_data = con;
1177 con->rx_action = tcp_accept_from_sock; 1046 con->rx_action = tcp_accept_from_sock;
1178 con->connect_action = tcp_connect_to_sock; 1047 con->connect_action = tcp_connect_to_sock;
1048 con->sock = sock;
1179 1049
1180 /* Bind to our port */ 1050 /* Bind to our port */
1181 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); 1051 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
@@ -1212,7 +1082,7 @@ static void init_local(void)
1212 int i; 1082 int i;
1213 1083
1214 dlm_local_count = 0; 1084 dlm_local_count = 0;
1215 for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { 1085 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
1216 if (dlm_our_addr(&sas, i)) 1086 if (dlm_our_addr(&sas, i))
1217 break; 1087 break;
1218 1088
@@ -1385,6 +1255,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1385 struct connection *con; 1255 struct connection *con;
1386 struct writequeue_entry *e; 1256 struct writequeue_entry *e;
1387 int offset = 0; 1257 int offset = 0;
1258 int users = 0;
1388 1259
1389 con = nodeid2con(nodeid, allocation); 1260 con = nodeid2con(nodeid, allocation);
1390 if (!con) 1261 if (!con)
@@ -1398,7 +1269,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1398 } else { 1269 } else {
1399 offset = e->end; 1270 offset = e->end;
1400 e->end += len; 1271 e->end += len;
1401 e->users++; 1272 users = e->users++;
1402 } 1273 }
1403 spin_unlock(&con->writequeue_lock); 1274 spin_unlock(&con->writequeue_lock);
1404 1275
@@ -1413,7 +1284,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1413 spin_lock(&con->writequeue_lock); 1284 spin_lock(&con->writequeue_lock);
1414 offset = e->end; 1285 offset = e->end;
1415 e->end += len; 1286 e->end += len;
1416 e->users++; 1287 users = e->users++;
1417 list_add_tail(&e->list, &con->writequeue); 1288 list_add_tail(&e->list, &con->writequeue);
1418 spin_unlock(&con->writequeue_lock); 1289 spin_unlock(&con->writequeue_lock);
1419 goto got_one; 1290 goto got_one;
@@ -1485,7 +1356,8 @@ static void send_to_sock(struct connection *con)
1485 } 1356 }
1486 cond_resched(); 1357 cond_resched();
1487 goto out; 1358 goto out;
1488 } else if (ret < 0) 1359 }
1360 if (ret <= 0)
1489 goto send_error; 1361 goto send_error;
1490 } 1362 }
1491 1363
@@ -1502,6 +1374,7 @@ static void send_to_sock(struct connection *con)
1502 if (e->len == 0 && e->users == 0) { 1374 if (e->len == 0 && e->users == 0) {
1503 list_del(&e->list); 1375 list_del(&e->list);
1504 free_entry(e); 1376 free_entry(e);
1377 continue;
1505 } 1378 }
1506 } 1379 }
1507 spin_unlock(&con->writequeue_lock); 1380 spin_unlock(&con->writequeue_lock);
@@ -1519,6 +1392,7 @@ out_connect:
1519 mutex_unlock(&con->sock_mutex); 1392 mutex_unlock(&con->sock_mutex);
1520 if (!test_bit(CF_INIT_PENDING, &con->flags)) 1393 if (!test_bit(CF_INIT_PENDING, &con->flags))
1521 lowcomms_connect_sock(con); 1394 lowcomms_connect_sock(con);
1395 return;
1522} 1396}
1523 1397
1524static void clean_one_writequeue(struct connection *con) 1398static void clean_one_writequeue(struct connection *con)
@@ -1538,7 +1412,6 @@ static void clean_one_writequeue(struct connection *con)
1538int dlm_lowcomms_close(int nodeid) 1412int dlm_lowcomms_close(int nodeid)
1539{ 1413{
1540 struct connection *con; 1414 struct connection *con;
1541 struct dlm_node_addr *na;
1542 1415
1543 log_print("closing connection to node %d", nodeid); 1416 log_print("closing connection to node %d", nodeid);
1544 con = nodeid2con(nodeid, 0); 1417 con = nodeid2con(nodeid, 0);
@@ -1553,17 +1426,6 @@ int dlm_lowcomms_close(int nodeid)
1553 clean_one_writequeue(con); 1426 clean_one_writequeue(con);
1554 close_connection(con, true); 1427 close_connection(con, true);
1555 } 1428 }
1556
1557 spin_lock(&dlm_node_addrs_spin);
1558 na = find_node_addr(nodeid);
1559 if (na) {
1560 list_del(&na->list);
1561 while (na->addr_count--)
1562 kfree(na->addr[na->addr_count]);
1563 kfree(na);
1564 }
1565 spin_unlock(&dlm_node_addrs_spin);
1566
1567 return 0; 1429 return 0;
1568} 1430}
1569 1431
@@ -1647,7 +1509,6 @@ void dlm_lowcomms_stop(void)
1647 socket activity. 1509 socket activity.
1648 */ 1510 */
1649 mutex_lock(&connections_lock); 1511 mutex_lock(&connections_lock);
1650 dlm_allow_conn = 0;
1651 foreach_conn(stop_conn); 1512 foreach_conn(stop_conn);
1652 mutex_unlock(&connections_lock); 1513 mutex_unlock(&connections_lock);
1653 1514
@@ -1675,7 +1536,7 @@ int dlm_lowcomms_start(void)
1675 if (!dlm_local_count) { 1536 if (!dlm_local_count) {
1676 error = -ENOTCONN; 1537 error = -ENOTCONN;
1677 log_print("no local IP address has been set"); 1538 log_print("no local IP address has been set");
1678 goto fail; 1539 goto out;
1679 } 1540 }
1680 1541
1681 error = -ENOMEM; 1542 error = -ENOMEM;
@@ -1683,13 +1544,7 @@ int dlm_lowcomms_start(void)
1683 __alignof__(struct connection), 0, 1544 __alignof__(struct connection), 0,
1684 NULL); 1545 NULL);
1685 if (!con_cache) 1546 if (!con_cache)
1686 goto fail; 1547 goto out;
1687
1688 error = work_start();
1689 if (error)
1690 goto fail_destroy;
1691
1692 dlm_allow_conn = 1;
1693 1548
1694 /* Start listening */ 1549 /* Start listening */
1695 if (dlm_config.ci_protocol == 0) 1550 if (dlm_config.ci_protocol == 0)
@@ -1699,31 +1554,20 @@ int dlm_lowcomms_start(void)
1699 if (error) 1554 if (error)
1700 goto fail_unlisten; 1555 goto fail_unlisten;
1701 1556
1557 error = work_start();
1558 if (error)
1559 goto fail_unlisten;
1560
1702 return 0; 1561 return 0;
1703 1562
1704fail_unlisten: 1563fail_unlisten:
1705 dlm_allow_conn = 0;
1706 con = nodeid2con(0,0); 1564 con = nodeid2con(0,0);
1707 if (con) { 1565 if (con) {
1708 close_connection(con, false); 1566 close_connection(con, false);
1709 kmem_cache_free(con_cache, con); 1567 kmem_cache_free(con_cache, con);
1710 } 1568 }
1711fail_destroy:
1712 kmem_cache_destroy(con_cache); 1569 kmem_cache_destroy(con_cache);
1713fail:
1714 return error;
1715}
1716 1570
1717void dlm_lowcomms_exit(void) 1571out:
1718{ 1572 return error;
1719 struct dlm_node_addr *na, *safe;
1720
1721 spin_lock(&dlm_node_addrs_spin);
1722 list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
1723 list_del(&na->list);
1724 while (na->addr_count--)
1725 kfree(na->addr[na->addr_count]);
1726 kfree(na);
1727 }
1728 spin_unlock(&dlm_node_addrs_spin);
1729} 1573}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 67462e54fc2..1311e642628 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -16,12 +16,10 @@
16 16
17int dlm_lowcomms_start(void); 17int dlm_lowcomms_start(void);
18void dlm_lowcomms_stop(void); 18void dlm_lowcomms_stop(void);
19void dlm_lowcomms_exit(void);
20int dlm_lowcomms_close(int nodeid); 19int dlm_lowcomms_close(int nodeid);
21void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); 20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
22void dlm_lowcomms_commit_buffer(void *mh); 21void dlm_lowcomms_commit_buffer(void *mh);
23int dlm_lowcomms_connect_node(int nodeid); 22int dlm_lowcomms_connect_node(int nodeid);
24int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
25 23
26#endif /* __LOWCOMMS_DOT_H__ */ 24#endif /* __LOWCOMMS_DOT_H__ */
27 25
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 079c0bd71ab..5a59efa0bb4 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -17,7 +17,6 @@
17#include "user.h" 17#include "user.h"
18#include "memory.h" 18#include "memory.h"
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h"
21 20
22static int __init init_dlm(void) 21static int __init init_dlm(void)
23{ 22{
@@ -79,7 +78,6 @@ static void __exit exit_dlm(void)
79 dlm_config_exit(); 78 dlm_config_exit();
80 dlm_memory_exit(); 79 dlm_memory_exit();
81 dlm_lockspace_exit(); 80 dlm_lockspace_exit();
82 dlm_lowcomms_exit();
83 dlm_unregister_debugfs(); 81 dlm_unregister_debugfs();
84} 82}
85 83
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b5492..b12532e553f 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,280 +19,6 @@
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h" 20#include "lowcomms.h"
21 21
22int dlm_slots_version(struct dlm_header *h)
23{
24 if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
25 return 0;
26 return 1;
27}
28
29void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
30 struct dlm_member *memb)
31{
32 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
33
34 if (!dlm_slots_version(&rc->rc_header))
35 return;
36
37 memb->slot = le16_to_cpu(rf->rf_our_slot);
38 memb->generation = le32_to_cpu(rf->rf_generation);
39}
40
41void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
42{
43 struct dlm_slot *slot;
44 struct rcom_slot *ro;
45 int i;
46
47 ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
48
49 /* ls_slots array is sparse, but not rcom_slots */
50
51 for (i = 0; i < ls->ls_slots_size; i++) {
52 slot = &ls->ls_slots[i];
53 if (!slot->nodeid)
54 continue;
55 ro->ro_nodeid = cpu_to_le32(slot->nodeid);
56 ro->ro_slot = cpu_to_le16(slot->slot);
57 ro++;
58 }
59}
60
61#define SLOT_DEBUG_LINE 128
62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size)
66{
67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0;
70 int ret, i;
71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line));
76
77 if (array) {
78 for (i = 0; i < array_size; i++) {
79 if (!array[i].nodeid)
80 continue;
81
82 ret = snprintf(line + pos, len - pos, " %d:%d",
83 array[i].slot, array[i].nodeid);
84 if (ret >= len - pos)
85 break;
86 pos += ret;
87 }
88 } else if (ro0) {
89 for (i = 0; i < num_slots; i++) {
90 ret = snprintf(line + pos, len - pos, " %d:%d",
91 ro0[i].ro_slot, ro0[i].ro_nodeid);
92 if (ret >= len - pos)
93 break;
94 pos += ret;
95 }
96 }
97
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
99}
100
101int dlm_slots_copy_in(struct dlm_ls *ls)
102{
103 struct dlm_member *memb;
104 struct dlm_rcom *rc = ls->ls_recover_buf;
105 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
106 struct rcom_slot *ro0, *ro;
107 int our_nodeid = dlm_our_nodeid();
108 int i, num_slots;
109 uint32_t gen;
110
111 if (!dlm_slots_version(&rc->rc_header))
112 return -1;
113
114 gen = le32_to_cpu(rf->rf_generation);
115 if (gen <= ls->ls_generation) {
116 log_error(ls, "dlm_slots_copy_in gen %u old %u",
117 gen, ls->ls_generation);
118 }
119 ls->ls_generation = gen;
120
121 num_slots = le16_to_cpu(rf->rf_num_slots);
122 if (!num_slots)
123 return -1;
124
125 ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
126
127 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
128 ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
129 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 }
131
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
133
134 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
136 if (ro->ro_nodeid != memb->nodeid)
137 continue;
138 memb->slot = ro->ro_slot;
139 memb->slot_prev = memb->slot;
140 break;
141 }
142
143 if (memb->nodeid == our_nodeid) {
144 if (ls->ls_slot && ls->ls_slot != memb->slot) {
145 log_error(ls, "dlm_slots_copy_in our slot "
146 "changed %d %d", ls->ls_slot,
147 memb->slot);
148 return -1;
149 }
150
151 if (!ls->ls_slot)
152 ls->ls_slot = memb->slot;
153 }
154
155 if (!memb->slot) {
156 log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
157 memb->nodeid);
158 return -1;
159 }
160 }
161
162 return 0;
163}
164
165/* for any nodes that do not support slots, we will not have set memb->slot
166 in wait_status_all(), so memb->slot will remain -1, and we will not
167 assign slots or set ls_num_slots here */
168
169int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
170 struct dlm_slot **slots_out, uint32_t *gen_out)
171{
172 struct dlm_member *memb;
173 struct dlm_slot *array;
174 int our_nodeid = dlm_our_nodeid();
175 int array_size, max_slots, i;
176 int need = 0;
177 int max = 0;
178 int num = 0;
179 uint32_t gen = 0;
180
181 /* our own memb struct will have slot -1 gen 0 */
182
183 list_for_each_entry(memb, &ls->ls_nodes, list) {
184 if (memb->nodeid == our_nodeid) {
185 memb->slot = ls->ls_slot;
186 memb->generation = ls->ls_generation;
187 break;
188 }
189 }
190
191 list_for_each_entry(memb, &ls->ls_nodes, list) {
192 if (memb->generation > gen)
193 gen = memb->generation;
194
195 /* node doesn't support slots */
196
197 if (memb->slot == -1)
198 return -1;
199
200 /* node needs a slot assigned */
201
202 if (!memb->slot)
203 need++;
204
205 /* node has a slot assigned */
206
207 num++;
208
209 if (!max || max < memb->slot)
210 max = memb->slot;
211
212 /* sanity check, once slot is assigned it shouldn't change */
213
214 if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
215 log_error(ls, "nodeid %d slot changed %d %d",
216 memb->nodeid, memb->slot_prev, memb->slot);
217 return -1;
218 }
219 memb->slot_prev = memb->slot;
220 }
221
222 array_size = max + need;
223
224 array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
225 if (!array)
226 return -ENOMEM;
227
228 num = 0;
229
230 /* fill in slots (offsets) that are used */
231
232 list_for_each_entry(memb, &ls->ls_nodes, list) {
233 if (!memb->slot)
234 continue;
235
236 if (memb->slot > array_size) {
237 log_error(ls, "invalid slot number %d", memb->slot);
238 kfree(array);
239 return -1;
240 }
241
242 array[memb->slot - 1].nodeid = memb->nodeid;
243 array[memb->slot - 1].slot = memb->slot;
244 num++;
245 }
246
247 /* assign new slots from unused offsets */
248
249 list_for_each_entry(memb, &ls->ls_nodes, list) {
250 if (memb->slot)
251 continue;
252
253 for (i = 0; i < array_size; i++) {
254 if (array[i].nodeid)
255 continue;
256
257 memb->slot = i + 1;
258 memb->slot_prev = memb->slot;
259 array[i].nodeid = memb->nodeid;
260 array[i].slot = memb->slot;
261 num++;
262
263 if (!ls->ls_slot && memb->nodeid == our_nodeid)
264 ls->ls_slot = memb->slot;
265 break;
266 }
267
268 if (!memb->slot) {
269 log_error(ls, "no free slot found");
270 kfree(array);
271 return -1;
272 }
273 }
274
275 gen++;
276
277 log_debug_slots(ls, gen, num, NULL, array, array_size);
278
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
281
282 if (num > max_slots) {
283 log_error(ls, "num_slots %d exceeds max_slots %d",
284 num, max_slots);
285 kfree(array);
286 return -1;
287 }
288
289 *gen_out = gen;
290 *slots_out = array;
291 *slots_size = array_size;
292 *num_slots = num;
293 return 0;
294}
295
296static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
297{ 23{
298 struct dlm_member *memb = NULL; 24 struct dlm_member *memb = NULL;
@@ -317,51 +43,59 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
317 } 43 }
318} 44}
319 45
320static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) 46static int dlm_add_member(struct dlm_ls *ls, int nodeid)
321{ 47{
322 struct dlm_member *memb; 48 struct dlm_member *memb;
323 int error; 49 int w, error;
324 50
325 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); 51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
326 if (!memb) 52 if (!memb)
327 return -ENOMEM; 53 return -ENOMEM;
328 54
329 error = dlm_lowcomms_connect_node(node->nodeid); 55 w = dlm_node_weight(ls->ls_name, nodeid);
56 if (w < 0) {
57 kfree(memb);
58 return w;
59 }
60
61 error = dlm_lowcomms_connect_node(nodeid);
330 if (error < 0) { 62 if (error < 0) {
331 kfree(memb); 63 kfree(memb);
332 return error; 64 return error;
333 } 65 }
334 66
335 memb->nodeid = node->nodeid; 67 memb->nodeid = nodeid;
336 memb->weight = node->weight; 68 memb->weight = w;
337 memb->comm_seq = node->comm_seq;
338 add_ordered_member(ls, memb); 69 add_ordered_member(ls, memb);
339 ls->ls_num_nodes++; 70 ls->ls_num_nodes++;
340 return 0; 71 return 0;
341} 72}
342 73
343static struct dlm_member *find_memb(struct list_head *head, int nodeid) 74static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
344{ 75{
345 struct dlm_member *memb; 76 list_move(&memb->list, &ls->ls_nodes_gone);
346 77 ls->ls_num_nodes--;
347 list_for_each_entry(memb, head, list) {
348 if (memb->nodeid == nodeid)
349 return memb;
350 }
351 return NULL;
352} 78}
353 79
354int dlm_is_member(struct dlm_ls *ls, int nodeid) 80int dlm_is_member(struct dlm_ls *ls, int nodeid)
355{ 81{
356 if (find_memb(&ls->ls_nodes, nodeid)) 82 struct dlm_member *memb;
357 return 1; 83
84 list_for_each_entry(memb, &ls->ls_nodes, list) {
85 if (memb->nodeid == nodeid)
86 return 1;
87 }
358 return 0; 88 return 0;
359} 89}
360 90
361int dlm_is_removed(struct dlm_ls *ls, int nodeid) 91int dlm_is_removed(struct dlm_ls *ls, int nodeid)
362{ 92{
363 if (find_memb(&ls->ls_nodes_gone, nodeid)) 93 struct dlm_member *memb;
364 return 1; 94
95 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
96 if (memb->nodeid == nodeid)
97 return 1;
98 }
365 return 0; 99 return 0;
366} 100}
367 101
@@ -442,7 +176,7 @@ static int ping_members(struct dlm_ls *ls)
442 error = dlm_recovery_stopped(ls); 176 error = dlm_recovery_stopped(ls);
443 if (error) 177 if (error)
444 break; 178 break;
445 error = dlm_rcom_status(ls, memb->nodeid, 0); 179 error = dlm_rcom_status(ls, memb->nodeid);
446 if (error) 180 if (error)
447 break; 181 break;
448 } 182 }
@@ -452,88 +186,10 @@ static int ping_members(struct dlm_ls *ls)
452 return error; 186 return error;
453} 187}
454 188
455static void dlm_lsop_recover_prep(struct dlm_ls *ls)
456{
457 if (!ls->ls_ops || !ls->ls_ops->recover_prep)
458 return;
459 ls->ls_ops->recover_prep(ls->ls_ops_arg);
460}
461
462static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
463{
464 struct dlm_slot slot;
465 uint32_t seq;
466 int error;
467
468 if (!ls->ls_ops || !ls->ls_ops->recover_slot)
469 return;
470
471 /* if there is no comms connection with this node
472 or the present comms connection is newer
473 than the one when this member was added, then
474 we consider the node to have failed (versus
475 being removed due to dlm_release_lockspace) */
476
477 error = dlm_comm_seq(memb->nodeid, &seq);
478
479 if (!error && seq == memb->comm_seq)
480 return;
481
482 slot.nodeid = memb->nodeid;
483 slot.slot = memb->slot;
484
485 ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
486}
487
488void dlm_lsop_recover_done(struct dlm_ls *ls)
489{
490 struct dlm_member *memb;
491 struct dlm_slot *slots;
492 int i, num;
493
494 if (!ls->ls_ops || !ls->ls_ops->recover_done)
495 return;
496
497 num = ls->ls_num_nodes;
498
499 slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
500 if (!slots)
501 return;
502
503 i = 0;
504 list_for_each_entry(memb, &ls->ls_nodes, list) {
505 if (i == num) {
506 log_error(ls, "dlm_lsop_recover_done bad num %d", num);
507 goto out;
508 }
509 slots[i].nodeid = memb->nodeid;
510 slots[i].slot = memb->slot;
511 i++;
512 }
513
514 ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
515 ls->ls_slot, ls->ls_generation);
516 out:
517 kfree(slots);
518}
519
520static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
521 int nodeid)
522{
523 int i;
524
525 for (i = 0; i < rv->nodes_count; i++) {
526 if (rv->nodes[i].nodeid == nodeid)
527 return &rv->nodes[i];
528 }
529 return NULL;
530}
531
532int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) 189int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
533{ 190{
534 struct dlm_member *memb, *safe; 191 struct dlm_member *memb, *safe;
535 struct dlm_config_node *node; 192 int i, error, found, pos = 0, neg = 0, low = -1;
536 int i, error, neg = 0, low = -1;
537 193
538 /* previously removed members that we've not finished removing need to 194 /* previously removed members that we've not finished removing need to
539 count as a negative change so the "neg" recovery steps will happen */ 195 count as a negative change so the "neg" recovery steps will happen */
@@ -546,32 +202,46 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
546 /* move departed members from ls_nodes to ls_nodes_gone */ 202 /* move departed members from ls_nodes to ls_nodes_gone */
547 203
548 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { 204 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
549 node = find_config_node(rv, memb->nodeid); 205 found = 0;
550 if (node && !node->new) 206 for (i = 0; i < rv->node_count; i++) {
551 continue; 207 if (memb->nodeid == rv->nodeids[i]) {
208 found = 1;
209 break;
210 }
211 }
552 212
553 if (!node) { 213 if (!found) {
214 neg++;
215 dlm_remove_member(ls, memb);
554 log_debug(ls, "remove member %d", memb->nodeid); 216 log_debug(ls, "remove member %d", memb->nodeid);
555 } else {
556 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq);
559 } 217 }
218 }
560 219
220 /* Add an entry to ls_nodes_gone for members that were removed and
221 then added again, so that previous state for these nodes will be
222 cleared during recovery. */
223
224 for (i = 0; i < rv->new_count; i++) {
225 if (!dlm_is_member(ls, rv->new[i]))
226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228
229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb)
231 return -ENOMEM;
232 memb->nodeid = rv->new[i];
233 list_add_tail(&memb->list, &ls->ls_nodes_gone);
561 neg++; 234 neg++;
562 list_move(&memb->list, &ls->ls_nodes_gone);
563 ls->ls_num_nodes--;
564 dlm_lsop_recover_slot(ls, memb);
565 } 235 }
566 236
567 /* add new members to ls_nodes */ 237 /* add new members to ls_nodes */
568 238
569 for (i = 0; i < rv->nodes_count; i++) { 239 for (i = 0; i < rv->node_count; i++) {
570 node = &rv->nodes[i]; 240 if (dlm_is_member(ls, rv->nodeids[i]))
571 if (dlm_is_member(ls, node->nodeid))
572 continue; 241 continue;
573 dlm_add_member(ls, node); 242 dlm_add_member(ls, rv->nodeids[i]);
574 log_debug(ls, "add member %d", node->nodeid); 243 pos++;
244 log_debug(ls, "add member %d", rv->nodeids[i]);
575 } 245 }
576 246
577 list_for_each_entry(memb, &ls->ls_nodes, list) { 247 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -581,6 +251,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
581 ls->ls_low_nodeid = low; 251 ls->ls_low_nodeid = low;
582 252
583 make_member_array(ls); 253 make_member_array(ls);
254 dlm_set_recover_status(ls, DLM_RS_NODES);
584 *neg_out = neg; 255 *neg_out = neg;
585 256
586 error = ping_members(ls); 257 error = ping_members(ls);
@@ -590,8 +261,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
590 ls->ls_members_result = error; 261 ls->ls_members_result = error;
591 complete(&ls->ls_members_done); 262 complete(&ls->ls_members_done);
592 } 263 }
264 if (error)
265 goto out;
593 266
594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); 267 error = dlm_recover_members_wait(ls);
268 out:
269 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
595 return error; 270 return error;
596} 271}
597 272
@@ -616,13 +291,13 @@ int dlm_ls_stop(struct dlm_ls *ls)
616 down_write(&ls->ls_recv_active); 291 down_write(&ls->ls_recv_active);
617 292
618 /* 293 /*
619 * Abort any recovery that's in progress (see RECOVER_STOP, 294 * Abort any recovery that's in progress (see RECOVERY_STOP,
620 * dlm_recovery_stopped()) and tell any other threads running in the 295 * dlm_recovery_stopped()) and tell any other threads running in the
621 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). 296 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
622 */ 297 */
623 298
624 spin_lock(&ls->ls_recover_lock); 299 spin_lock(&ls->ls_recover_lock);
625 set_bit(LSFL_RECOVER_STOP, &ls->ls_flags); 300 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
626 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); 301 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
627 ls->ls_recover_seq++; 302 ls->ls_recover_seq++;
628 spin_unlock(&ls->ls_recover_lock); 303 spin_unlock(&ls->ls_recover_lock);
@@ -642,49 +317,36 @@ int dlm_ls_stop(struct dlm_ls *ls)
642 * when recovery is complete. 317 * when recovery is complete.
643 */ 318 */
644 319
645 if (new) { 320 if (new)
646 set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags); 321 down_write(&ls->ls_in_recovery);
647 wake_up_process(ls->ls_recoverd_task);
648 wait_event(ls->ls_recover_lock_wait,
649 test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
650 }
651 322
652 /* 323 /*
653 * The recoverd suspend/resume makes sure that dlm_recoverd (if 324 * The recoverd suspend/resume makes sure that dlm_recoverd (if
654 * running) has noticed RECOVER_STOP above and quit processing the 325 * running) has noticed RECOVERY_STOP above and quit processing the
655 * previous recovery. 326 * previous recovery.
656 */ 327 */
657 328
658 dlm_recoverd_suspend(ls); 329 dlm_recoverd_suspend(ls);
659
660 spin_lock(&ls->ls_recover_lock);
661 kfree(ls->ls_slots);
662 ls->ls_slots = NULL;
663 ls->ls_num_slots = 0;
664 ls->ls_slots_size = 0;
665 ls->ls_recover_status = 0; 330 ls->ls_recover_status = 0;
666 spin_unlock(&ls->ls_recover_lock);
667
668 dlm_recoverd_resume(ls); 331 dlm_recoverd_resume(ls);
669 332
670 if (!ls->ls_recover_begin) 333 if (!ls->ls_recover_begin)
671 ls->ls_recover_begin = jiffies; 334 ls->ls_recover_begin = jiffies;
672
673 dlm_lsop_recover_prep(ls);
674 return 0; 335 return 0;
675} 336}
676 337
677int dlm_ls_start(struct dlm_ls *ls) 338int dlm_ls_start(struct dlm_ls *ls)
678{ 339{
679 struct dlm_recover *rv = NULL, *rv_old; 340 struct dlm_recover *rv = NULL, *rv_old;
680 struct dlm_config_node *nodes; 341 int *ids = NULL, *new = NULL;
681 int error, count; 342 int error, ids_count = 0, new_count = 0;
682 343
683 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); 344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
684 if (!rv) 345 if (!rv)
685 return -ENOMEM; 346 return -ENOMEM;
686 347
687 error = dlm_config_nodes(ls->ls_name, &nodes, &count); 348 error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
349 &new, &new_count);
688 if (error < 0) 350 if (error < 0)
689 goto fail; 351 goto fail;
690 352
@@ -699,8 +361,10 @@ int dlm_ls_start(struct dlm_ls *ls)
699 goto fail; 361 goto fail;
700 } 362 }
701 363
702 rv->nodes = nodes; 364 rv->nodeids = ids;
703 rv->nodes_count = count; 365 rv->node_count = ids_count;
366 rv->new = new;
367 rv->new_count = new_count;
704 rv->seq = ++ls->ls_recover_seq; 368 rv->seq = ++ls->ls_recover_seq;
705 rv_old = ls->ls_recover_args; 369 rv_old = ls->ls_recover_args;
706 ls->ls_recover_args = rv; 370 ls->ls_recover_args = rv;
@@ -708,18 +372,19 @@ int dlm_ls_start(struct dlm_ls *ls)
708 372
709 if (rv_old) { 373 if (rv_old) {
710 log_error(ls, "unused recovery %llx %d", 374 log_error(ls, "unused recovery %llx %d",
711 (unsigned long long)rv_old->seq, rv_old->nodes_count); 375 (unsigned long long)rv_old->seq, rv_old->node_count);
712 kfree(rv_old->nodes); 376 kfree(rv_old->nodeids);
377 kfree(rv_old->new);
713 kfree(rv_old); 378 kfree(rv_old);
714 } 379 }
715 380
716 set_bit(LSFL_RECOVER_WORK, &ls->ls_flags); 381 dlm_recoverd_kick(ls);
717 wake_up_process(ls->ls_recoverd_task);
718 return 0; 382 return 0;
719 383
720 fail: 384 fail:
721 kfree(rv); 385 kfree(rv);
722 kfree(nodes); 386 kfree(ids);
387 kfree(new);
723 return error; 388 return error;
724} 389}
725 390
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 3deb70661c6..7a26fca1e0b 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,14 +20,6 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid); 22int dlm_is_member(struct dlm_ls *ls, int nodeid);
23int dlm_slots_version(struct dlm_header *h);
24void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
25 struct dlm_member *memb);
26void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
27int dlm_slots_copy_in(struct dlm_ls *ls);
28int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
29 struct dlm_slot **slots_out, uint32_t *gen_out);
30void dlm_lsop_recover_done(struct dlm_ls *ls);
31 23
32#endif /* __MEMBER_DOT_H__ */ 24#endif /* __MEMBER_DOT_H__ */
33 25
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 7cd24bccd4f..da64df7576e 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -21,19 +21,21 @@ static struct kmem_cache *rsb_cache;
21 21
22int __init dlm_memory_init(void) 22int __init dlm_memory_init(void)
23{ 23{
24 int ret = 0;
25
24 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), 26 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
25 __alignof__(struct dlm_lkb), 0, NULL); 27 __alignof__(struct dlm_lkb), 0, NULL);
26 if (!lkb_cache) 28 if (!lkb_cache)
27 return -ENOMEM; 29 ret = -ENOMEM;
28 30
29 rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), 31 rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
30 __alignof__(struct dlm_rsb), 0, NULL); 32 __alignof__(struct dlm_rsb), 0, NULL);
31 if (!rsb_cache) { 33 if (!rsb_cache) {
32 kmem_cache_destroy(lkb_cache); 34 kmem_cache_destroy(lkb_cache);
33 return -ENOMEM; 35 ret = -ENOMEM;
34 } 36 }
35 37
36 return 0; 38 return ret;
37} 39}
38 40
39void dlm_memory_exit(void) 41void dlm_memory_exit(void)
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 60a327863b1..ef17e0169da 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -14,7 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15 15
16static uint32_t dlm_nl_seqnum; 16static uint32_t dlm_nl_seqnum;
17static uint32_t listener_nlportid; 17static uint32_t listener_nlpid;
18 18
19static struct genl_family family = { 19static struct genl_family family = {
20 .id = GENL_ID_GENERATE, 20 .id = GENL_ID_GENERATE,
@@ -64,13 +64,13 @@ static int send_data(struct sk_buff *skb)
64 return rv; 64 return rv;
65 } 65 }
66 66
67 return genlmsg_unicast(&init_net, skb, listener_nlportid); 67 return genlmsg_unicast(&init_net, skb, listener_nlpid);
68} 68}
69 69
70static int user_cmd(struct sk_buff *skb, struct genl_info *info) 70static int user_cmd(struct sk_buff *skb, struct genl_info *info)
71{ 71{
72 listener_nlportid = info->snd_portid; 72 listener_nlpid = info->snd_pid;
73 printk("user_cmd nlpid %u\n", listener_nlportid); 73 printk("user_cmd nlpid %u\n", listener_nlpid);
74 return 0; 74 return 0;
75} 75}
76 76
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 9d61947d473..f10a50f24e8 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -24,6 +24,7 @@
24#include "lock.h" 24#include "lock.h"
25#include "util.h" 25#include "util.h"
26 26
27
27static int rcom_response(struct dlm_ls *ls) 28static int rcom_response(struct dlm_ls *ls)
28{ 29{
29 return test_bit(LSFL_RCOM_READY, &ls->ls_flags); 30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
@@ -71,30 +72,20 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
71 dlm_lowcomms_commit_buffer(mh); 72 dlm_lowcomms_commit_buffer(mh);
72} 73}
73 74
74static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
75 uint32_t flags)
76{
77 rs->rs_flags = cpu_to_le32(flags);
78}
79
80/* When replying to a status request, a node also sends back its 75/* When replying to a status request, a node also sends back its
81 configuration values. The requesting node then checks that the remote 76 configuration values. The requesting node then checks that the remote
82 node is configured the same way as itself. */ 77 node is configured the same way as itself. */
83 78
84static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, 79static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
85 uint32_t num_slots)
86{ 80{
87 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); 81 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
88 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); 82 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
89
90 rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
91 rf->rf_num_slots = cpu_to_le16(num_slots);
92 rf->rf_generation = cpu_to_le32(ls->ls_generation);
93} 83}
94 84
95static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) 85static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
96{ 86{
97 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; 87 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
88 size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
98 89
99 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { 90 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
100 log_error(ls, "version mismatch: %x nodeid %d: %x", 91 log_error(ls, "version mismatch: %x nodeid %d: %x",
@@ -103,6 +94,12 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
103 return -EPROTO; 94 return -EPROTO;
104 } 95 }
105 96
97 if (rc->rc_header.h_length < conf_size) {
98 log_error(ls, "config too short: %d nodeid %d",
99 rc->rc_header.h_length, nodeid);
100 return -EPROTO;
101 }
102
106 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || 103 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
107 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { 104 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
108 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 105 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -130,18 +127,7 @@ static void disallow_sync_reply(struct dlm_ls *ls)
130 spin_unlock(&ls->ls_rcom_spin); 127 spin_unlock(&ls->ls_rcom_spin);
131} 128}
132 129
133/* 130int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
134 * low nodeid gathers one slot value at a time from each node.
135 * it sets need_slots=0, and saves rf_our_slot returned from each
136 * rcom_config.
137 *
138 * other nodes gather all slot values at once from the low nodeid.
139 * they set need_slots=1, and ignore the rf_our_slot returned from each
140 * rcom_config. they use the rf_num_slots returned from the low
141 * node's rcom_config.
142 */
143
144int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
145{ 131{
146 struct dlm_rcom *rc; 132 struct dlm_rcom *rc;
147 struct dlm_mhandle *mh; 133 struct dlm_mhandle *mh;
@@ -155,13 +141,10 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
155 goto out; 141 goto out;
156 } 142 }
157 143
158 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 144 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
159 sizeof(struct rcom_status), &rc, &mh);
160 if (error) 145 if (error)
161 goto out; 146 goto out;
162 147
163 set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
164
165 allow_sync_reply(ls, &rc->rc_id); 148 allow_sync_reply(ls, &rc->rc_id);
166 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); 149 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
167 150
@@ -178,11 +161,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
178 /* we pretend the remote lockspace exists with 0 status */ 161 /* we pretend the remote lockspace exists with 0 status */
179 log_debug(ls, "remote node %d not ready", nodeid); 162 log_debug(ls, "remote node %d not ready", nodeid);
180 rc->rc_result = 0; 163 rc->rc_result = 0;
181 error = 0; 164 } else
182 } else { 165 error = check_config(ls, rc, nodeid);
183 error = check_rcom_config(ls, rc, nodeid);
184 }
185
186 /* the caller looks at rc_result for the remote recovery status */ 166 /* the caller looks at rc_result for the remote recovery status */
187 out: 167 out:
188 return error; 168 return error;
@@ -192,60 +172,17 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
192{ 172{
193 struct dlm_rcom *rc; 173 struct dlm_rcom *rc;
194 struct dlm_mhandle *mh; 174 struct dlm_mhandle *mh;
195 struct rcom_status *rs; 175 int error, nodeid = rc_in->rc_header.h_nodeid;
196 uint32_t status;
197 int nodeid = rc_in->rc_header.h_nodeid;
198 int len = sizeof(struct rcom_config);
199 int num_slots = 0;
200 int error;
201
202 if (!dlm_slots_version(&rc_in->rc_header)) {
203 status = dlm_recover_status(ls);
204 goto do_create;
205 }
206
207 rs = (struct rcom_status *)rc_in->rc_buf;
208
209 if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
210 status = dlm_recover_status(ls);
211 goto do_create;
212 }
213
214 spin_lock(&ls->ls_recover_lock);
215 status = ls->ls_recover_status;
216 num_slots = ls->ls_num_slots;
217 spin_unlock(&ls->ls_recover_lock);
218 len += num_slots * sizeof(struct rcom_slot);
219 176
220 do_create:
221 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, 177 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
222 len, &rc, &mh); 178 sizeof(struct rcom_config), &rc, &mh);
223 if (error) 179 if (error)
224 return; 180 return;
225
226 rc->rc_id = rc_in->rc_id; 181 rc->rc_id = rc_in->rc_id;
227 rc->rc_seq_reply = rc_in->rc_seq; 182 rc->rc_seq_reply = rc_in->rc_seq;
228 rc->rc_result = status; 183 rc->rc_result = dlm_recover_status(ls);
184 make_config(ls, (struct rcom_config *) rc->rc_buf);
229 185
230 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
231
232 if (!num_slots)
233 goto do_send;
234
235 spin_lock(&ls->ls_recover_lock);
236 if (ls->ls_num_slots != num_slots) {
237 spin_unlock(&ls->ls_recover_lock);
238 log_debug(ls, "receive_rcom_status num_slots %d to %d",
239 num_slots, ls->ls_num_slots);
240 rc->rc_result = 0;
241 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
242 goto do_send;
243 }
244
245 dlm_slots_copy_out(ls, rc);
246 spin_unlock(&ls->ls_recover_lock);
247
248 do_send:
249 send_rcom(ls, mh, rc); 186 send_rcom(ls, mh, rc);
250} 187}
251 188
@@ -273,9 +210,19 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
273 struct dlm_rcom *rc; 210 struct dlm_rcom *rc;
274 struct dlm_mhandle *mh; 211 struct dlm_mhandle *mh;
275 int error = 0; 212 int error = 0;
213 int max_size = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
276 214
277 ls->ls_recover_nodeid = nodeid; 215 ls->ls_recover_nodeid = nodeid;
278 216
217 if (nodeid == dlm_our_nodeid()) {
218 ls->ls_recover_buf->rc_header.h_length =
219 dlm_config.ci_buffer_size;
220 dlm_copy_master_names(ls, last_name, last_len,
221 ls->ls_recover_buf->rc_buf,
222 max_size, nodeid);
223 goto out;
224 }
225
279 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); 226 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
280 if (error) 227 if (error)
281 goto out; 228 goto out;
@@ -325,26 +272,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
325 if (error) 272 if (error)
326 goto out; 273 goto out;
327 memcpy(rc->rc_buf, r->res_name, r->res_length); 274 memcpy(rc->rc_buf, r->res_name, r->res_length);
328 rc->rc_id = (unsigned long) r->res_id; 275 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 struct dlm_ls *ls = r->res_ls;
340 int error;
341
342 error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length,
343 &rc, &mh);
344 if (error)
345 goto out;
346 memcpy(rc->rc_buf, r->res_name, r->res_length);
347 rc->rc_id = 0xFFFFFFFF;
348 276
349 send_rcom(ls, mh, rc); 277 send_rcom(ls, mh, rc);
350 out: 278 out:
@@ -362,14 +290,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
362 if (error) 290 if (error)
363 return; 291 return;
364 292
365 if (rc_in->rc_id == 0xFFFFFFFF) { 293 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
366 log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
367 dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
368 return;
369 }
370
371 error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len,
372 DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
373 if (error) 294 if (error)
374 ret_nodeid = error; 295 ret_nodeid = error;
375 rc->rc_result = ret_nodeid; 296 rc->rc_result = ret_nodeid;
@@ -500,102 +421,46 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
500 return 0; 421 return 0;
501} 422}
502 423
503/* 424static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
504 * Ignore messages for stage Y before we set
505 * recover_status bit for stage X:
506 *
507 * recover_status = 0
508 *
509 * dlm_recover_members()
510 * - send nothing
511 * - recv nothing
512 * - ignore NAMES, NAMES_REPLY
513 * - ignore LOOKUP, LOOKUP_REPLY
514 * - ignore LOCK, LOCK_REPLY
515 *
516 * recover_status |= NODES
517 *
518 * dlm_recover_members_wait()
519 *
520 * dlm_recover_directory()
521 * - send NAMES
522 * - recv NAMES_REPLY
523 * - ignore LOOKUP, LOOKUP_REPLY
524 * - ignore LOCK, LOCK_REPLY
525 *
526 * recover_status |= DIR
527 *
528 * dlm_recover_directory_wait()
529 *
530 * dlm_recover_masters()
531 * - send LOOKUP
532 * - recv LOOKUP_REPLY
533 *
534 * dlm_recover_locks()
535 * - send LOCKS
536 * - recv LOCKS_REPLY
537 *
538 * recover_status |= LOCKS
539 *
540 * dlm_recover_locks_wait()
541 *
542 * recover_status |= DONE
543 */
544
545/* Called by dlm_recv; corresponds to dlm_receive_message() but special
546 recovery-only comms are sent through here. */
547
548void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
549{ 425{
550 int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
551 int stop, reply = 0, names = 0, lookup = 0, lock = 0;
552 uint32_t status;
553 uint64_t seq; 426 uint64_t seq;
427 int rv = 0;
554 428
555 switch (rc->rc_type) { 429 switch (rc->rc_type) {
556 case DLM_RCOM_STATUS_REPLY: 430 case DLM_RCOM_STATUS_REPLY:
557 reply = 1;
558 break;
559 case DLM_RCOM_NAMES:
560 names = 1;
561 break;
562 case DLM_RCOM_NAMES_REPLY: 431 case DLM_RCOM_NAMES_REPLY:
563 names = 1;
564 reply = 1;
565 break;
566 case DLM_RCOM_LOOKUP:
567 lookup = 1;
568 break;
569 case DLM_RCOM_LOOKUP_REPLY: 432 case DLM_RCOM_LOOKUP_REPLY:
570 lookup = 1;
571 reply = 1;
572 break;
573 case DLM_RCOM_LOCK:
574 lock = 1;
575 break;
576 case DLM_RCOM_LOCK_REPLY: 433 case DLM_RCOM_LOCK_REPLY:
577 lock = 1; 434 spin_lock(&ls->ls_recover_lock);
578 reply = 1; 435 seq = ls->ls_recover_seq;
579 break; 436 spin_unlock(&ls->ls_recover_lock);
580 }; 437 if (rc->rc_seq_reply != seq) {
581 438 log_debug(ls, "ignoring old reply %x from %d "
582 spin_lock(&ls->ls_recover_lock); 439 "seq_reply %llx expect %llx",
583 status = ls->ls_recover_status; 440 rc->rc_type, rc->rc_header.h_nodeid,
584 stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); 441 (unsigned long long)rc->rc_seq_reply,
585 seq = ls->ls_recover_seq; 442 (unsigned long long)seq);
586 spin_unlock(&ls->ls_recover_lock); 443 rv = 1;
444 }
445 }
446 return rv;
447}
587 448
588 if (stop && (rc->rc_type != DLM_RCOM_STATUS)) 449/* Called by dlm_recv; corresponds to dlm_receive_message() but special
589 goto ignore; 450 recovery-only comms are sent through here. */
590 451
591 if (reply && (rc->rc_seq_reply != seq)) 452void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
592 goto ignore; 453{
454 int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
593 455
594 if (!(status & DLM_RS_NODES) && (names || lookup || lock)) 456 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
595 goto ignore; 457 log_debug(ls, "ignoring recovery message %x from %d",
458 rc->rc_type, nodeid);
459 goto out;
460 }
596 461
597 if (!(status & DLM_RS_DIR) && (lookup || lock)) 462 if (is_old_reply(ls, rc))
598 goto ignore; 463 goto out;
599 464
600 switch (rc->rc_type) { 465 switch (rc->rc_type) {
601 case DLM_RCOM_STATUS: 466 case DLM_RCOM_STATUS:
@@ -637,20 +502,10 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
637 default: 502 default:
638 log_error(ls, "receive_rcom bad type %d", rc->rc_type); 503 log_error(ls, "receive_rcom bad type %d", rc->rc_type);
639 } 504 }
640 return; 505out:
641
642ignore:
643 log_limit(ls, "dlm_receive_rcom ignore msg %d "
644 "from %d %llu %llu recover seq %llu sts %x gen %u",
645 rc->rc_type,
646 nodeid,
647 (unsigned long long)rc->rc_seq,
648 (unsigned long long)rc->rc_seq_reply,
649 (unsigned long long)seq,
650 status, ls->ls_generation);
651 return; 506 return;
652Eshort: 507Eshort:
653 log_error(ls, "recovery message %d from %d is too short", 508 log_error(ls, "recovery message %x from %d is too short",
654 rc->rc_type, nodeid); 509 rc->rc_type, nodeid);
655} 510}
656 511
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index f8e243463c1..b09abd29ba3 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -14,10 +14,9 @@
14#ifndef __RCOM_DOT_H__ 14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__ 15#define __RCOM_DOT_H__
16 16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); 17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid);
21int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
22void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); 21void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
23int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); 22int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index aedea28a86a..14638235f7b 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -36,23 +36,30 @@
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another 36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake 37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the 38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timeout. This uses a timeout so it can check periodically if the wait 39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * should abort due to node failure (which doesn't cause a wake_up). 40 * to see if we should abort due to a node failure. This should only be called
41 * This should only be called by the dlm_recoverd thread. 41 * by the dlm_recoverd thread.
42 */ 42 */
43 43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
44int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) 51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
45{ 52{
46 int error = 0; 53 int error = 0;
47 int rv;
48 54
49 while (1) { 55 init_timer(&ls->ls_timer);
50 rv = wait_event_timeout(ls->ls_wait_general, 56 ls->ls_timer.function = dlm_wait_timer_fn;
51 testfn(ls) || dlm_recovery_stopped(ls), 57 ls->ls_timer.data = (long) ls;
52 dlm_config.ci_recover_timer * HZ); 58 ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ);
53 if (rv) 59 add_timer(&ls->ls_timer);
54 break; 60
55 } 61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
56 63
57 if (dlm_recovery_stopped(ls)) { 64 if (dlm_recovery_stopped(ls)) {
58 log_debug(ls, "dlm_wait_function aborted"); 65 log_debug(ls, "dlm_wait_function aborted");
@@ -78,20 +85,14 @@ uint32_t dlm_recover_status(struct dlm_ls *ls)
78 return status; 85 return status;
79} 86}
80 87
81static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
82{
83 ls->ls_recover_status |= status;
84}
85
86void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) 88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
87{ 89{
88 spin_lock(&ls->ls_recover_lock); 90 spin_lock(&ls->ls_recover_lock);
89 _set_recover_status(ls, status); 91 ls->ls_recover_status |= status;
90 spin_unlock(&ls->ls_recover_lock); 92 spin_unlock(&ls->ls_recover_lock);
91} 93}
92 94
93static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, 95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
94 int save_slots)
95{ 96{
96 struct dlm_rcom *rc = ls->ls_recover_buf; 97 struct dlm_rcom *rc = ls->ls_recover_buf;
97 struct dlm_member *memb; 98 struct dlm_member *memb;
@@ -105,13 +106,10 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
105 goto out; 106 goto out;
106 } 107 }
107 108
108 error = dlm_rcom_status(ls, memb->nodeid, 0); 109 error = dlm_rcom_status(ls, memb->nodeid);
109 if (error) 110 if (error)
110 goto out; 111 goto out;
111 112
112 if (save_slots)
113 dlm_slot_save(ls, rc, memb);
114
115 if (rc->rc_result & wait_status) 113 if (rc->rc_result & wait_status)
116 break; 114 break;
117 if (delay < 1000) 115 if (delay < 1000)
@@ -123,8 +121,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
123 return error; 121 return error;
124} 122}
125 123
126static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, 124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
127 uint32_t status_flags)
128{ 125{
129 struct dlm_rcom *rc = ls->ls_recover_buf; 126 struct dlm_rcom *rc = ls->ls_recover_buf;
130 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; 127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -135,7 +132,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
135 goto out; 132 goto out;
136 } 133 }
137 134
138 error = dlm_rcom_status(ls, nodeid, status_flags); 135 error = dlm_rcom_status(ls, nodeid);
139 if (error) 136 if (error)
140 break; 137 break;
141 138
@@ -155,56 +152,18 @@ static int wait_status(struct dlm_ls *ls, uint32_t status)
155 int error; 152 int error;
156 153
157 if (ls->ls_low_nodeid == dlm_our_nodeid()) { 154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
158 error = wait_status_all(ls, status, 0); 155 error = wait_status_all(ls, status);
159 if (!error) 156 if (!error)
160 dlm_set_recover_status(ls, status_all); 157 dlm_set_recover_status(ls, status_all);
161 } else 158 } else
162 error = wait_status_low(ls, status_all, 0); 159 error = wait_status_low(ls, status_all);
163 160
164 return error; 161 return error;
165} 162}
166 163
167int dlm_recover_members_wait(struct dlm_ls *ls) 164int dlm_recover_members_wait(struct dlm_ls *ls)
168{ 165{
169 struct dlm_member *memb; 166 return wait_status(ls, DLM_RS_NODES);
170 struct dlm_slot *slots;
171 int num_slots, slots_size;
172 int error, rv;
173 uint32_t gen;
174
175 list_for_each_entry(memb, &ls->ls_nodes, list) {
176 memb->slot = -1;
177 memb->generation = 0;
178 }
179
180 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
181 error = wait_status_all(ls, DLM_RS_NODES, 1);
182 if (error)
183 goto out;
184
185 /* slots array is sparse, slots_size may be > num_slots */
186
187 rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
188 if (!rv) {
189 spin_lock(&ls->ls_recover_lock);
190 _set_recover_status(ls, DLM_RS_NODES_ALL);
191 ls->ls_num_slots = num_slots;
192 ls->ls_slots_size = slots_size;
193 ls->ls_slots = slots;
194 ls->ls_generation = gen;
195 spin_unlock(&ls->ls_recover_lock);
196 } else {
197 dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
198 }
199 } else {
200 error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
201 if (error)
202 goto out;
203
204 dlm_slots_copy_in(ls);
205 }
206 out:
207 return error;
208} 167}
209 168
210int dlm_recover_directory_wait(struct dlm_ls *ls) 169int dlm_recover_directory_wait(struct dlm_ls *ls)
@@ -270,6 +229,22 @@ static void recover_list_del(struct dlm_rsb *r)
270 dlm_put_rsb(r); 229 dlm_put_rsb(r);
271} 230}
272 231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
273static void recover_list_clear(struct dlm_ls *ls) 248static void recover_list_clear(struct dlm_ls *ls)
274{ 249{
275 struct dlm_rsb *r, *s; 250 struct dlm_rsb *r, *s;
@@ -290,94 +265,6 @@ static void recover_list_clear(struct dlm_ls *ls)
290 spin_unlock(&ls->ls_recover_list_lock); 265 spin_unlock(&ls->ls_recover_list_lock);
291} 266}
292 267
293static int recover_idr_empty(struct dlm_ls *ls)
294{
295 int empty = 1;
296
297 spin_lock(&ls->ls_recover_idr_lock);
298 if (ls->ls_recover_list_count)
299 empty = 0;
300 spin_unlock(&ls->ls_recover_idr_lock);
301
302 return empty;
303}
304
305static int recover_idr_add(struct dlm_rsb *r)
306{
307 struct dlm_ls *ls = r->res_ls;
308 int rv, id;
309
310 rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS);
311 if (!rv)
312 return -ENOMEM;
313
314 spin_lock(&ls->ls_recover_idr_lock);
315 if (r->res_id) {
316 spin_unlock(&ls->ls_recover_idr_lock);
317 return -1;
318 }
319 rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id);
320 if (rv) {
321 spin_unlock(&ls->ls_recover_idr_lock);
322 return rv;
323 }
324 r->res_id = id;
325 ls->ls_recover_list_count++;
326 dlm_hold_rsb(r);
327 spin_unlock(&ls->ls_recover_idr_lock);
328 return 0;
329}
330
331static void recover_idr_del(struct dlm_rsb *r)
332{
333 struct dlm_ls *ls = r->res_ls;
334
335 spin_lock(&ls->ls_recover_idr_lock);
336 idr_remove(&ls->ls_recover_idr, r->res_id);
337 r->res_id = 0;
338 ls->ls_recover_list_count--;
339 spin_unlock(&ls->ls_recover_idr_lock);
340
341 dlm_put_rsb(r);
342}
343
344static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
345{
346 struct dlm_rsb *r;
347
348 spin_lock(&ls->ls_recover_idr_lock);
349 r = idr_find(&ls->ls_recover_idr, (int)id);
350 spin_unlock(&ls->ls_recover_idr_lock);
351 return r;
352}
353
354static int recover_idr_clear_rsb(int id, void *p, void *data)
355{
356 struct dlm_ls *ls = data;
357 struct dlm_rsb *r = p;
358
359 r->res_id = 0;
360 r->res_recover_locks_count = 0;
361 ls->ls_recover_list_count--;
362
363 dlm_put_rsb(r);
364 return 0;
365}
366
367static void recover_idr_clear(struct dlm_ls *ls)
368{
369 spin_lock(&ls->ls_recover_idr_lock);
370 idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls);
371 idr_remove_all(&ls->ls_recover_idr);
372
373 if (ls->ls_recover_list_count != 0) {
374 log_error(ls, "warning: recover_list_count %d",
375 ls->ls_recover_list_count);
376 ls->ls_recover_list_count = 0;
377 }
378 spin_unlock(&ls->ls_recover_idr_lock);
379}
380
381 268
382/* Master recovery: find new master node for rsb's that were 269/* Master recovery: find new master node for rsb's that were
383 mastered on nodes that have been removed. 270 mastered on nodes that have been removed.
@@ -404,12 +291,9 @@ static void set_lock_master(struct list_head *queue, int nodeid)
404{ 291{
405 struct dlm_lkb *lkb; 292 struct dlm_lkb *lkb;
406 293
407 list_for_each_entry(lkb, queue, lkb_statequeue) { 294 list_for_each_entry(lkb, queue, lkb_statequeue)
408 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { 295 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
409 lkb->lkb_nodeid = nodeid; 296 lkb->lkb_nodeid = nodeid;
410 lkb->lkb_remid = 0;
411 }
412 }
413} 297}
414 298
415static void set_master_lkbs(struct dlm_rsb *r) 299static void set_master_lkbs(struct dlm_rsb *r)
@@ -422,93 +306,67 @@ static void set_master_lkbs(struct dlm_rsb *r)
422/* 306/*
423 * Propagate the new master nodeid to locks 307 * Propagate the new master nodeid to locks
424 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. 308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
425 * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which 309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
426 * rsb's to consider. 310 * rsb's to consider.
427 */ 311 */
428 312
429static void set_new_master(struct dlm_rsb *r) 313static void set_new_master(struct dlm_rsb *r, int nodeid)
430{ 314{
315 lock_rsb(r);
316 r->res_nodeid = nodeid;
431 set_master_lkbs(r); 317 set_master_lkbs(r);
432 rsb_set_flag(r, RSB_NEW_MASTER); 318 rsb_set_flag(r, RSB_NEW_MASTER);
433 rsb_set_flag(r, RSB_NEW_MASTER2); 319 rsb_set_flag(r, RSB_NEW_MASTER2);
320 unlock_rsb(r);
434} 321}
435 322
436/* 323/*
437 * We do async lookups on rsb's that need new masters. The rsb's 324 * We do async lookups on rsb's that need new masters. The rsb's
438 * waiting for a lookup reply are kept on the recover_list. 325 * waiting for a lookup reply are kept on the recover_list.
439 *
440 * Another node recovering the master may have sent us a rcom lookup,
441 * and our dlm_master_lookup() set it as the new master, along with
442 * NEW_MASTER so that we'll recover it here (this implies dir_nodeid
443 * equals our_nodeid below).
444 */ 326 */
445 327
446static int recover_master(struct dlm_rsb *r, unsigned int *count) 328static int recover_master(struct dlm_rsb *r)
447{ 329{
448 struct dlm_ls *ls = r->res_ls; 330 struct dlm_ls *ls = r->res_ls;
449 int our_nodeid, dir_nodeid; 331 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
450 int is_removed = 0;
451 int error;
452
453 if (is_master(r))
454 return 0;
455
456 is_removed = dlm_is_removed(ls, r->res_nodeid);
457 332
458 if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER))
459 return 0;
460
461 our_nodeid = dlm_our_nodeid();
462 dir_nodeid = dlm_dir_nodeid(r); 333 dir_nodeid = dlm_dir_nodeid(r);
463 334
464 if (dir_nodeid == our_nodeid) { 335 if (dir_nodeid == our_nodeid) {
465 if (is_removed) { 336 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
466 r->res_master_nodeid = our_nodeid; 337 r->res_length, &ret_nodeid);
467 r->res_nodeid = 0; 338 if (error)
468 } 339 log_error(ls, "recover dir lookup error %d", error);
469 340
470 /* set master of lkbs to ourself when is_removed, or to 341 if (ret_nodeid == our_nodeid)
471 another new master which we set along with NEW_MASTER 342 ret_nodeid = 0;
472 in dlm_master_lookup */ 343 set_new_master(r, ret_nodeid);
473 set_new_master(r);
474 error = 0;
475 } else { 344 } else {
476 recover_idr_add(r); 345 recover_list_add(r);
477 error = dlm_send_rcom_lookup(r, dir_nodeid); 346 error = dlm_send_rcom_lookup(r, dir_nodeid);
478 } 347 }
479 348
480 (*count)++;
481 return error; 349 return error;
482} 350}
483 351
484/* 352/*
485 * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. 353 * When not using a directory, most resource names will hash to a new static
486 * This is necessary because recovery can be started, aborted and restarted, 354 * master nodeid and the resource will need to be remastered.
487 * causing the master nodeid to briefly change during the aborted recovery, and
488 * change back to the original value in the second recovery. The MSTCPY locks
489 * may or may not have been purged during the aborted recovery. Another node
490 * with an outstanding request in waiters list and a request reply saved in the
491 * requestqueue, cannot know whether it should ignore the reply and resend the
492 * request, or accept the reply and complete the request. It must do the
493 * former if the remote node purged MSTCPY locks, and it must do the later if
494 * the remote node did not. This is solved by always purging MSTCPY locks, in
495 * which case, the request reply would always be ignored and the request
496 * resent.
497 */ 355 */
498 356
499static int recover_master_static(struct dlm_rsb *r, unsigned int *count) 357static int recover_master_static(struct dlm_rsb *r)
500{ 358{
501 int dir_nodeid = dlm_dir_nodeid(r); 359 int master = dlm_dir_nodeid(r);
502 int new_master = dir_nodeid;
503 360
504 if (dir_nodeid == dlm_our_nodeid()) 361 if (master == dlm_our_nodeid())
505 new_master = 0; 362 master = 0;
506 363
507 dlm_purge_mstcpy_locks(r); 364 if (r->res_nodeid != master) {
508 r->res_master_nodeid = dir_nodeid; 365 if (is_master(r))
509 r->res_nodeid = new_master; 366 dlm_purge_mstcpy_locks(r);
510 set_new_master(r); 367 set_new_master(r, master);
511 (*count)++; 368 return 1;
369 }
512 return 0; 370 return 0;
513} 371}
514 372
@@ -525,10 +383,7 @@ static int recover_master_static(struct dlm_rsb *r, unsigned int *count)
525int dlm_recover_masters(struct dlm_ls *ls) 383int dlm_recover_masters(struct dlm_ls *ls)
526{ 384{
527 struct dlm_rsb *r; 385 struct dlm_rsb *r;
528 unsigned int total = 0; 386 int error = 0, count = 0;
529 unsigned int count = 0;
530 int nodir = dlm_no_directory(ls);
531 int error;
532 387
533 log_debug(ls, "dlm_recover_masters"); 388 log_debug(ls, "dlm_recover_masters");
534 389
@@ -540,58 +395,48 @@ int dlm_recover_masters(struct dlm_ls *ls)
540 goto out; 395 goto out;
541 } 396 }
542 397
543 lock_rsb(r); 398 if (dlm_no_directory(ls))
544 if (nodir) 399 count += recover_master_static(r);
545 error = recover_master_static(r, &count); 400 else if (!is_master(r) &&
546 else 401 (dlm_is_removed(ls, r->res_nodeid) ||
547 error = recover_master(r, &count); 402 rsb_flag(r, RSB_NEW_MASTER))) {
548 unlock_rsb(r); 403 recover_master(r);
549 cond_resched(); 404 count++;
550 total++;
551
552 if (error) {
553 up_read(&ls->ls_root_sem);
554 goto out;
555 } 405 }
406
407 schedule();
556 } 408 }
557 up_read(&ls->ls_root_sem); 409 up_read(&ls->ls_root_sem);
558 410
559 log_debug(ls, "dlm_recover_masters %u of %u", count, total); 411 log_debug(ls, "dlm_recover_masters %d resources", count);
560 412
561 error = dlm_wait_function(ls, &recover_idr_empty); 413 error = dlm_wait_function(ls, &recover_list_empty);
562 out: 414 out:
563 if (error) 415 if (error)
564 recover_idr_clear(ls); 416 recover_list_clear(ls);
565 return error; 417 return error;
566} 418}
567 419
568int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) 420int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
569{ 421{
570 struct dlm_rsb *r; 422 struct dlm_rsb *r;
571 int ret_nodeid, new_master; 423 int nodeid;
572 424
573 r = recover_idr_find(ls, rc->rc_id); 425 r = recover_list_find(ls, rc->rc_id);
574 if (!r) { 426 if (!r) {
575 log_error(ls, "dlm_recover_master_reply no id %llx", 427 log_error(ls, "dlm_recover_master_reply no id %llx",
576 (unsigned long long)rc->rc_id); 428 (unsigned long long)rc->rc_id);
577 goto out; 429 goto out;
578 } 430 }
579 431
580 ret_nodeid = rc->rc_result; 432 nodeid = rc->rc_result;
581 433 if (nodeid == dlm_our_nodeid())
582 if (ret_nodeid == dlm_our_nodeid()) 434 nodeid = 0;
583 new_master = 0;
584 else
585 new_master = ret_nodeid;
586 435
587 lock_rsb(r); 436 set_new_master(r, nodeid);
588 r->res_master_nodeid = ret_nodeid; 437 recover_list_del(r);
589 r->res_nodeid = new_master;
590 set_new_master(r);
591 unlock_rsb(r);
592 recover_idr_del(r);
593 438
594 if (recover_idr_empty(ls)) 439 if (recover_list_empty(ls))
595 wake_up(&ls->ls_wait_general); 440 wake_up(&ls->ls_wait_general);
596 out: 441 out:
597 return 0; 442 return 0;
@@ -663,6 +508,8 @@ int dlm_recover_locks(struct dlm_ls *ls)
663 struct dlm_rsb *r; 508 struct dlm_rsb *r;
664 int error, count = 0; 509 int error, count = 0;
665 510
511 log_debug(ls, "dlm_recover_locks");
512
666 down_read(&ls->ls_root_sem); 513 down_read(&ls->ls_root_sem);
667 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 514 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
668 if (is_master(r)) { 515 if (is_master(r)) {
@@ -689,12 +536,14 @@ int dlm_recover_locks(struct dlm_ls *ls)
689 } 536 }
690 up_read(&ls->ls_root_sem); 537 up_read(&ls->ls_root_sem);
691 538
692 log_debug(ls, "dlm_recover_locks %d out", count); 539 log_debug(ls, "dlm_recover_locks %d locks", count);
693 540
694 error = dlm_wait_function(ls, &recover_list_empty); 541 error = dlm_wait_function(ls, &recover_list_empty);
695 out: 542 out:
696 if (error) 543 if (error)
697 recover_list_clear(ls); 544 recover_list_clear(ls);
545 else
546 dlm_set_recover_status(ls, DLM_RS_LOCKS);
698 return error; 547 return error;
699} 548}
700 549
@@ -717,14 +566,8 @@ void dlm_recovered_lock(struct dlm_rsb *r)
717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents 566 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
718 * based on the lvb's of the locks held on the rsb. 567 * based on the lvb's of the locks held on the rsb.
719 * 568 *
720 * RSB_VALNOTVALID is set in two cases: 569 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
721 * 570 * was already set prior to recovery, it's not cleared, regardless of locks.
722 * 1. we are master, but not new, and we purged an EX/PW lock held by a
723 * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
724 *
725 * 2. we are a new master, and there are only NL/CR locks left.
726 * (We could probably improve this by only invaliding in this way when
727 * the previous master left uncleanly. VMS docs mention that.)
728 * 571 *
729 * The LVB contents are only considered for changing when this is a new master 572 * The LVB contents are only considered for changing when this is a new master
730 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with 573 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
@@ -740,19 +583,6 @@ static void recover_lvb(struct dlm_rsb *r)
740 int big_lock_exists = 0; 583 int big_lock_exists = 0;
741 int lvblen = r->res_ls->ls_lvblen; 584 int lvblen = r->res_ls->ls_lvblen;
742 585
743 if (!rsb_flag(r, RSB_NEW_MASTER2) &&
744 rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
745 /* case 1 above */
746 rsb_set_flag(r, RSB_VALNOTVALID);
747 return;
748 }
749
750 if (!rsb_flag(r, RSB_NEW_MASTER2))
751 return;
752
753 /* we are the new master, so figure out if VALNOTVALID should
754 be set, and set the rsb lvb from the best lkb available. */
755
756 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { 586 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
757 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) 587 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
758 continue; 588 continue;
@@ -791,10 +621,13 @@ static void recover_lvb(struct dlm_rsb *r)
791 if (!lock_lvb_exists) 621 if (!lock_lvb_exists)
792 goto out; 622 goto out;
793 623
794 /* lvb is invalidated if only NL/CR locks remain */
795 if (!big_lock_exists) 624 if (!big_lock_exists)
796 rsb_set_flag(r, RSB_VALNOTVALID); 625 rsb_set_flag(r, RSB_VALNOTVALID);
797 626
627 /* don't mess with the lvb unless we're the new master */
628 if (!rsb_flag(r, RSB_NEW_MASTER2))
629 goto out;
630
798 if (!r->res_lvbptr) { 631 if (!r->res_lvbptr) {
799 r->res_lvbptr = dlm_allocate_lvb(r->res_ls); 632 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
800 if (!r->res_lvbptr) 633 if (!r->res_lvbptr)
@@ -820,7 +653,6 @@ static void recover_lvb(struct dlm_rsb *r)
820 653
821static void recover_conversion(struct dlm_rsb *r) 654static void recover_conversion(struct dlm_rsb *r)
822{ 655{
823 struct dlm_ls *ls = r->res_ls;
824 struct dlm_lkb *lkb; 656 struct dlm_lkb *lkb;
825 int grmode = -1; 657 int grmode = -1;
826 658
@@ -835,32 +667,29 @@ static void recover_conversion(struct dlm_rsb *r)
835 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { 667 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
836 if (lkb->lkb_grmode != DLM_LOCK_IV) 668 if (lkb->lkb_grmode != DLM_LOCK_IV)
837 continue; 669 continue;
838 if (grmode == -1) { 670 if (grmode == -1)
839 log_debug(ls, "recover_conversion %x set gr to rq %d",
840 lkb->lkb_id, lkb->lkb_rqmode);
841 lkb->lkb_grmode = lkb->lkb_rqmode; 671 lkb->lkb_grmode = lkb->lkb_rqmode;
842 } else { 672 else
843 log_debug(ls, "recover_conversion %x set gr %d",
844 lkb->lkb_id, grmode);
845 lkb->lkb_grmode = grmode; 673 lkb->lkb_grmode = grmode;
846 }
847 } 674 }
848} 675}
849 676
850/* We've become the new master for this rsb and waiting/converting locks may 677/* We've become the new master for this rsb and waiting/converting locks may
851 need to be granted in dlm_recover_grant() due to locks that may have 678 need to be granted in dlm_grant_after_purge() due to locks that may have
852 existed from a removed node. */ 679 existed from a removed node. */
853 680
854static void recover_grant(struct dlm_rsb *r) 681static void set_locks_purged(struct dlm_rsb *r)
855{ 682{
856 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) 683 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
857 rsb_set_flag(r, RSB_RECOVER_GRANT); 684 rsb_set_flag(r, RSB_LOCKS_PURGED);
858} 685}
859 686
860void dlm_recover_rsbs(struct dlm_ls *ls) 687void dlm_recover_rsbs(struct dlm_ls *ls)
861{ 688{
862 struct dlm_rsb *r; 689 struct dlm_rsb *r;
863 unsigned int count = 0; 690 int count = 0;
691
692 log_debug(ls, "dlm_recover_rsbs");
864 693
865 down_read(&ls->ls_root_sem); 694 down_read(&ls->ls_root_sem);
866 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 695 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -868,33 +697,24 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
868 if (is_master(r)) { 697 if (is_master(r)) {
869 if (rsb_flag(r, RSB_RECOVER_CONVERT)) 698 if (rsb_flag(r, RSB_RECOVER_CONVERT))
870 recover_conversion(r); 699 recover_conversion(r);
871
872 /* recover lvb before granting locks so the updated
873 lvb/VALNOTVALID is presented in the completion */
874 recover_lvb(r);
875
876 if (rsb_flag(r, RSB_NEW_MASTER2)) 700 if (rsb_flag(r, RSB_NEW_MASTER2))
877 recover_grant(r); 701 set_locks_purged(r);
702 recover_lvb(r);
878 count++; 703 count++;
879 } else {
880 rsb_clear_flag(r, RSB_VALNOTVALID);
881 } 704 }
882 rsb_clear_flag(r, RSB_RECOVER_CONVERT); 705 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
883 rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
884 rsb_clear_flag(r, RSB_NEW_MASTER2); 706 rsb_clear_flag(r, RSB_NEW_MASTER2);
885 unlock_rsb(r); 707 unlock_rsb(r);
886 } 708 }
887 up_read(&ls->ls_root_sem); 709 up_read(&ls->ls_root_sem);
888 710
889 if (count) 711 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
890 log_debug(ls, "dlm_recover_rsbs %d done", count);
891} 712}
892 713
893/* Create a single list of all root rsb's to be used during recovery */ 714/* Create a single list of all root rsb's to be used during recovery */
894 715
895int dlm_create_root_list(struct dlm_ls *ls) 716int dlm_create_root_list(struct dlm_ls *ls)
896{ 717{
897 struct rb_node *n;
898 struct dlm_rsb *r; 718 struct dlm_rsb *r;
899 int i, error = 0; 719 int i, error = 0;
900 720
@@ -907,14 +727,24 @@ int dlm_create_root_list(struct dlm_ls *ls)
907 727
908 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 728 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
909 spin_lock(&ls->ls_rsbtbl[i].lock); 729 spin_lock(&ls->ls_rsbtbl[i].lock);
910 for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { 730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
911 r = rb_entry(n, struct dlm_rsb, res_hashnode);
912 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
913 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
914 } 733 }
915 734
916 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss)) 735 /* If we're using a directory, add tossed rsbs to the root
917 log_error(ls, "dlm_create_root_list toss not empty"); 736 list; they'll have entries created in the new directory,
737 but no other recovery steps should do anything with them. */
738
739 if (dlm_no_directory(ls)) {
740 spin_unlock(&ls->ls_rsbtbl[i].lock);
741 continue;
742 }
743
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r);
747 }
918 spin_unlock(&ls->ls_rsbtbl[i].lock); 748 spin_unlock(&ls->ls_rsbtbl[i].lock);
919 } 749 }
920 out: 750 out:
@@ -934,26 +764,26 @@ void dlm_release_root_list(struct dlm_ls *ls)
934 up_write(&ls->ls_root_sem); 764 up_write(&ls->ls_root_sem);
935} 765}
936 766
937void dlm_clear_toss(struct dlm_ls *ls) 767/* If not using a directory, clear the entire toss list, there's no benefit to
768 caching the master value since it's fixed. If we are using a dir, keep the
769 rsb's we're the master of. Recovery will add them to the root list and from
770 there they'll be entered in the rebuilt directory. */
771
772void dlm_clear_toss_list(struct dlm_ls *ls)
938{ 773{
939 struct rb_node *n, *next; 774 struct dlm_rsb *r, *safe;
940 struct dlm_rsb *r;
941 unsigned int count = 0;
942 int i; 775 int i;
943 776
944 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 777 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
945 spin_lock(&ls->ls_rsbtbl[i].lock); 778 spin_lock(&ls->ls_rsbtbl[i].lock);
946 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
947 next = rb_next(n); 780 res_hashchain) {
948 r = rb_entry(n, struct dlm_rsb, res_hashnode); 781 if (dlm_no_directory(ls) || !is_master(r)) {
949 rb_erase(n, &ls->ls_rsbtbl[i].toss); 782 list_del(&r->res_hashchain);
950 dlm_free_rsb(r); 783 dlm_free_rsb(r);
951 count++; 784 }
952 } 785 }
953 spin_unlock(&ls->ls_rsbtbl[i].lock); 786 spin_unlock(&ls->ls_rsbtbl[i].lock);
954 } 787 }
955
956 if (count)
957 log_debug(ls, "dlm_clear_toss %u done", count);
958} 788}
959 789
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
index d8c8738c70e..ebd0363f1e0 100644
--- a/fs/dlm/recover.h
+++ b/fs/dlm/recover.h
@@ -27,7 +27,7 @@ int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r); 27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls); 28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls); 29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss(struct dlm_ls *ls); 30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls); 31void dlm_recover_rsbs(struct dlm_ls *ls);
32 32
33#endif /* __RECOVER_DOT_H__ */ 33#endif /* __RECOVER_DOT_H__ */
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec..774da3cf92c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -41,7 +41,6 @@ static int enable_locking(struct dlm_ls *ls, uint64_t seq)
41 set_bit(LSFL_RUNNING, &ls->ls_flags); 41 set_bit(LSFL_RUNNING, &ls->ls_flags);
42 /* unblocks processes waiting to enter the dlm */ 42 /* unblocks processes waiting to enter the dlm */
43 up_write(&ls->ls_in_recovery); 43 up_write(&ls->ls_in_recovery);
44 clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
45 error = 0; 44 error = 0;
46 } 45 }
47 spin_unlock(&ls->ls_recover_lock); 46 spin_unlock(&ls->ls_recover_lock);
@@ -55,13 +54,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
55 unsigned long start; 54 unsigned long start;
56 int error, neg = 0; 55 int error, neg = 0;
57 56
58 log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); 57 log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
59 58
60 mutex_lock(&ls->ls_recoverd_active); 59 mutex_lock(&ls->ls_recoverd_active);
61 60
62 dlm_callback_suspend(ls); 61 dlm_callback_suspend(ls);
63 62
64 dlm_clear_toss(ls); 63 /*
64 * Free non-master tossed rsb's. Master rsb's are kept on toss
65 * list and put on root list to be included in resdir recovery.
66 */
67
68 dlm_clear_toss_list(ls);
65 69
66 /* 70 /*
67 * This list of root rsb's will be the basis of most of the recovery 71 * This list of root rsb's will be the basis of most of the recovery
@@ -72,28 +76,14 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
72 76
73 /* 77 /*
74 * Add or remove nodes from the lockspace's ls_nodes list. 78 * Add or remove nodes from the lockspace's ls_nodes list.
79 * Also waits for all nodes to complete dlm_recover_members.
75 */ 80 */
76 81
77 error = dlm_recover_members(ls, rv, &neg); 82 error = dlm_recover_members(ls, rv, &neg);
78 if (error) { 83 if (error) {
79 log_debug(ls, "dlm_recover_members error %d", error); 84 log_debug(ls, "recover_members failed %d", error);
80 goto fail; 85 goto fail;
81 } 86 }
82
83 dlm_recover_dir_nodeid(ls);
84
85 ls->ls_recover_dir_sent_res = 0;
86 ls->ls_recover_dir_sent_msg = 0;
87 ls->ls_recover_locks_in = 0;
88
89 dlm_set_recover_status(ls, DLM_RS_NODES);
90
91 error = dlm_recover_members_wait(ls);
92 if (error) {
93 log_debug(ls, "dlm_recover_members_wait error %d", error);
94 goto fail;
95 }
96
97 start = jiffies; 87 start = jiffies;
98 88
99 /* 89 /*
@@ -103,21 +93,20 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
103 93
104 error = dlm_recover_directory(ls); 94 error = dlm_recover_directory(ls);
105 if (error) { 95 if (error) {
106 log_debug(ls, "dlm_recover_directory error %d", error); 96 log_debug(ls, "recover_directory failed %d", error);
107 goto fail; 97 goto fail;
108 } 98 }
109 99
110 dlm_set_recover_status(ls, DLM_RS_DIR); 100 /*
101 * Wait for all nodes to complete directory rebuild.
102 */
111 103
112 error = dlm_recover_directory_wait(ls); 104 error = dlm_recover_directory_wait(ls);
113 if (error) { 105 if (error) {
114 log_debug(ls, "dlm_recover_directory_wait error %d", error); 106 log_debug(ls, "recover_directory_wait failed %d", error);
115 goto fail; 107 goto fail;
116 } 108 }
117 109
118 log_debug(ls, "dlm_recover_directory %u out %u messages",
119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
120
121 /* 110 /*
122 * We may have outstanding operations that are waiting for a reply from 111 * We may have outstanding operations that are waiting for a reply from
123 * a failed node. Mark these to be resent after recovery. Unlock and 112 * a failed node. Mark these to be resent after recovery. Unlock and
@@ -135,7 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
135 * Clear lkb's for departed nodes. 124 * Clear lkb's for departed nodes.
136 */ 125 */
137 126
138 dlm_recover_purge(ls); 127 dlm_purge_locks(ls);
139 128
140 /* 129 /*
141 * Get new master nodeid's for rsb's that were mastered on 130 * Get new master nodeid's for rsb's that were mastered on
@@ -144,7 +133,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
144 133
145 error = dlm_recover_masters(ls); 134 error = dlm_recover_masters(ls);
146 if (error) { 135 if (error) {
147 log_debug(ls, "dlm_recover_masters error %d", error); 136 log_debug(ls, "recover_masters failed %d", error);
148 goto fail; 137 goto fail;
149 } 138 }
150 139
@@ -154,21 +143,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
154 143
155 error = dlm_recover_locks(ls); 144 error = dlm_recover_locks(ls);
156 if (error) { 145 if (error) {
157 log_debug(ls, "dlm_recover_locks error %d", error); 146 log_debug(ls, "recover_locks failed %d", error);
158 goto fail; 147 goto fail;
159 } 148 }
160 149
161 dlm_set_recover_status(ls, DLM_RS_LOCKS);
162
163 error = dlm_recover_locks_wait(ls); 150 error = dlm_recover_locks_wait(ls);
164 if (error) { 151 if (error) {
165 log_debug(ls, "dlm_recover_locks_wait error %d", error); 152 log_debug(ls, "recover_locks_wait failed %d", error);
166 goto fail; 153 goto fail;
167 } 154 }
168 155
169 log_debug(ls, "dlm_recover_locks %u in",
170 ls->ls_recover_locks_in);
171
172 /* 156 /*
173 * Finalize state in master rsb's now that all locks can be 157 * Finalize state in master rsb's now that all locks can be
174 * checked. This includes conversion resolution and lvb 158 * checked. This includes conversion resolution and lvb
@@ -186,7 +170,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 170
187 error = dlm_recover_locks_wait(ls); 171 error = dlm_recover_locks_wait(ls);
188 if (error) { 172 if (error) {
189 log_debug(ls, "dlm_recover_locks_wait error %d", error); 173 log_debug(ls, "recover_locks_wait failed %d", error);
190 goto fail; 174 goto fail;
191 } 175 }
192 } 176 }
@@ -202,10 +186,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
202 dlm_purge_requestqueue(ls); 186 dlm_purge_requestqueue(ls);
203 187
204 dlm_set_recover_status(ls, DLM_RS_DONE); 188 dlm_set_recover_status(ls, DLM_RS_DONE);
205
206 error = dlm_recover_done_wait(ls); 189 error = dlm_recover_done_wait(ls);
207 if (error) { 190 if (error) {
208 log_debug(ls, "dlm_recover_done_wait error %d", error); 191 log_debug(ls, "recover_done_wait failed %d", error);
209 goto fail; 192 goto fail;
210 } 193 }
211 194
@@ -217,35 +200,34 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
217 200
218 error = enable_locking(ls, rv->seq); 201 error = enable_locking(ls, rv->seq);
219 if (error) { 202 if (error) {
220 log_debug(ls, "enable_locking error %d", error); 203 log_debug(ls, "enable_locking failed %d", error);
221 goto fail; 204 goto fail;
222 } 205 }
223 206
224 error = dlm_process_requestqueue(ls); 207 error = dlm_process_requestqueue(ls);
225 if (error) { 208 if (error) {
226 log_debug(ls, "dlm_process_requestqueue error %d", error); 209 log_debug(ls, "process_requestqueue failed %d", error);
227 goto fail; 210 goto fail;
228 } 211 }
229 212
230 error = dlm_recover_waiters_post(ls); 213 error = dlm_recover_waiters_post(ls);
231 if (error) { 214 if (error) {
232 log_debug(ls, "dlm_recover_waiters_post error %d", error); 215 log_debug(ls, "recover_waiters_post failed %d", error);
233 goto fail; 216 goto fail;
234 } 217 }
235 218
236 dlm_recover_grant(ls); 219 dlm_grant_after_purge(ls);
237 220
238 log_debug(ls, "dlm_recover %llu generation %u done: %u ms", 221 log_debug(ls, "recover %llx done: %u ms",
239 (unsigned long long)rv->seq, ls->ls_generation, 222 (unsigned long long)rv->seq,
240 jiffies_to_msecs(jiffies - start)); 223 jiffies_to_msecs(jiffies - start));
241 mutex_unlock(&ls->ls_recoverd_active); 224 mutex_unlock(&ls->ls_recoverd_active);
242 225
243 dlm_lsop_recover_done(ls);
244 return 0; 226 return 0;
245 227
246 fail: 228 fail:
247 dlm_release_root_list(ls); 229 dlm_release_root_list(ls);
248 log_debug(ls, "dlm_recover %llu error %d", 230 log_debug(ls, "recover %llx error %d",
249 (unsigned long long)rv->seq, error); 231 (unsigned long long)rv->seq, error);
250 mutex_unlock(&ls->ls_recoverd_active); 232 mutex_unlock(&ls->ls_recoverd_active);
251 return error; 233 return error;
@@ -263,12 +245,13 @@ static void do_ls_recovery(struct dlm_ls *ls)
263 rv = ls->ls_recover_args; 245 rv = ls->ls_recover_args;
264 ls->ls_recover_args = NULL; 246 ls->ls_recover_args = NULL;
265 if (rv && ls->ls_recover_seq == rv->seq) 247 if (rv && ls->ls_recover_seq == rv->seq)
266 clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags); 248 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
267 spin_unlock(&ls->ls_recover_lock); 249 spin_unlock(&ls->ls_recover_lock);
268 250
269 if (rv) { 251 if (rv) {
270 ls_recover(ls, rv); 252 ls_recover(ls, rv);
271 kfree(rv->nodes); 253 kfree(rv->nodeids);
254 kfree(rv->new);
272 kfree(rv); 255 kfree(rv);
273 } 256 }
274} 257}
@@ -283,34 +266,26 @@ static int dlm_recoverd(void *arg)
283 return -1; 266 return -1;
284 } 267 }
285 268
286 down_write(&ls->ls_in_recovery);
287 set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
288 wake_up(&ls->ls_recover_lock_wait);
289
290 while (!kthread_should_stop()) { 269 while (!kthread_should_stop()) {
291 set_current_state(TASK_INTERRUPTIBLE); 270 set_current_state(TASK_INTERRUPTIBLE);
292 if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && 271 if (!test_bit(LSFL_WORK, &ls->ls_flags))
293 !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
294 schedule(); 272 schedule();
295 set_current_state(TASK_RUNNING); 273 set_current_state(TASK_RUNNING);
296 274
297 if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { 275 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
298 down_write(&ls->ls_in_recovery);
299 set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
300 wake_up(&ls->ls_recover_lock_wait);
301 }
302
303 if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
304 do_ls_recovery(ls); 276 do_ls_recovery(ls);
305 } 277 }
306 278
307 if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
308 up_write(&ls->ls_in_recovery);
309
310 dlm_put_lockspace(ls); 279 dlm_put_lockspace(ls);
311 return 0; 280 return 0;
312} 281}
313 282
283void dlm_recoverd_kick(struct dlm_ls *ls)
284{
285 set_bit(LSFL_WORK, &ls->ls_flags);
286 wake_up_process(ls->ls_recoverd_task);
287}
288
314int dlm_recoverd_start(struct dlm_ls *ls) 289int dlm_recoverd_start(struct dlm_ls *ls)
315{ 290{
316 struct task_struct *p; 291 struct task_struct *p;
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
index 8856079733f..866657c5d69 100644
--- a/fs/dlm/recoverd.h
+++ b/fs/dlm/recoverd.h
@@ -14,6 +14,7 @@
14#ifndef __RECOVERD_DOT_H__ 14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__ 15#define __RECOVERD_DOT_H__
16 16
17void dlm_recoverd_kick(struct dlm_ls *ls);
17void dlm_recoverd_stop(struct dlm_ls *ls); 18void dlm_recoverd_stop(struct dlm_ls *ls);
18int dlm_recoverd_start(struct dlm_ls *ls); 19int dlm_recoverd_start(struct dlm_ls *ls);
19void dlm_recoverd_suspend(struct dlm_ls *ls); 20void dlm_recoverd_suspend(struct dlm_ls *ls);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 1695f1b0dd4..a44fa22890e 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -19,7 +19,6 @@
19 19
20struct rq_entry { 20struct rq_entry {
21 struct list_head list; 21 struct list_head list;
22 uint32_t recover_seq;
23 int nodeid; 22 int nodeid;
24 struct dlm_message request; 23 struct dlm_message request;
25}; 24};
@@ -42,7 +41,6 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
42 return; 41 return;
43 } 42 }
44 43
45 e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
46 e->nodeid = nodeid; 44 e->nodeid = nodeid;
47 memcpy(&e->request, ms, ms->m_header.h_length); 45 memcpy(&e->request, ms, ms->m_header.h_length);
48 46
@@ -65,7 +63,6 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
65int dlm_process_requestqueue(struct dlm_ls *ls) 63int dlm_process_requestqueue(struct dlm_ls *ls)
66{ 64{
67 struct rq_entry *e; 65 struct rq_entry *e;
68 struct dlm_message *ms;
69 int error = 0; 66 int error = 0;
70 67
71 mutex_lock(&ls->ls_requestqueue_mutex); 68 mutex_lock(&ls->ls_requestqueue_mutex);
@@ -79,15 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
79 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); 76 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
80 mutex_unlock(&ls->ls_requestqueue_mutex); 77 mutex_unlock(&ls->ls_requestqueue_mutex);
81 78
82 ms = &e->request; 79 dlm_receive_message_saved(ls, &e->request);
83
84 log_limit(ls, "dlm_process_requestqueue msg %d from %d "
85 "lkid %x remid %x result %d seq %u",
86 ms->m_type, ms->m_header.h_nodeid,
87 ms->m_lkid, ms->m_remid, ms->m_result,
88 e->recover_seq);
89
90 dlm_receive_message_saved(ls, &e->request, e->recover_seq);
91 80
92 mutex_lock(&ls->ls_requestqueue_mutex); 81 mutex_lock(&ls->ls_requestqueue_mutex);
93 list_del(&e->list); 82 list_del(&e->list);
@@ -149,7 +138,35 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
149 if (!dlm_no_directory(ls)) 138 if (!dlm_no_directory(ls))
150 return 0; 139 return 0;
151 140
152 return 1; 141 /* with no directory, the master is likely to change as a part of
142 recovery; requests to/from the defunct master need to be purged */
143
144 switch (type) {
145 case DLM_MSG_REQUEST:
146 case DLM_MSG_CONVERT:
147 case DLM_MSG_UNLOCK:
148 case DLM_MSG_CANCEL:
149 /* we're no longer the master of this resource, the sender
150 will resend to the new master (see waiter_needs_recovery) */
151
152 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
153 return 1;
154 break;
155
156 case DLM_MSG_REQUEST_REPLY:
157 case DLM_MSG_CONVERT_REPLY:
158 case DLM_MSG_UNLOCK_REPLY:
159 case DLM_MSG_CANCEL_REPLY:
160 case DLM_MSG_GRANT:
161 /* this reply is from the former master of the resource,
162 we'll resend to the new master if needed */
163
164 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
165 return 1;
166 break;
167 }
168
169 return 0;
153} 170}
154 171
155void dlm_purge_requestqueue(struct dlm_ls *ls) 172void dlm_purge_requestqueue(struct dlm_ls *ls)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 7ff49852b0c..d8ea6075640 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -392,9 +392,8 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
392 if (!capable(CAP_SYS_ADMIN)) 392 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM; 393 return -EPERM;
394 394
395 error = dlm_new_lockspace(params->name, NULL, params->flags, 395 error = dlm_new_lockspace(params->name, strlen(params->name),
396 DLM_USER_LVB_LEN, NULL, NULL, NULL, 396 &lockspace, params->flags, DLM_USER_LVB_LEN);
397 &lockspace);
398 if (error) 397 if (error)
399 return error; 398 return error;
400 399
@@ -503,13 +502,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
503#endif 502#endif
504 return -EINVAL; 503 return -EINVAL;
505 504
506#ifdef CONFIG_COMPAT
507 if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN)
508#else
509 if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
510#endif
511 return -EINVAL;
512
513 kbuf = kzalloc(count + 1, GFP_NOFS); 505 kbuf = kzalloc(count + 1, GFP_NOFS);
514 if (!kbuf) 506 if (!kbuf)
515 return -ENOMEM; 507 return -ENOMEM;