aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/cifs/CHANGES2
-rw-r--r--fs/cifs/cifsfs.c10
-rw-r--r--fs/cifs/file.c12
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/smbdes.c10
-rw-r--r--fs/configfs/file.c9
-rw-r--r--fs/dlm/Kconfig18
-rw-r--r--fs/dlm/config.c154
-rw-r--r--fs/dlm/config.h17
-rw-r--r--fs/dlm/dlm_internal.h20
-rw-r--r--fs/dlm/lock.c87
-rw-r--r--fs/dlm/lockspace.c10
-rw-r--r--fs/dlm/lowcomms-sctp.c151
-rw-r--r--fs/dlm/lowcomms-tcp.c361
-rw-r--r--fs/dlm/midcomms.c4
-rw-r--r--fs/dlm/rcom.c85
-rw-r--r--fs/dlm/recover.c8
-rw-r--r--fs/dlm/recoverd.c22
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/dlm/util.c4
-rw-r--r--fs/gfs2/Kconfig47
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/dir.c25
-rw-r--r--fs/gfs2/dir.h21
-rw-r--r--fs/gfs2/eattr.c8
-rw-r--r--fs/gfs2/glock.c316
-rw-r--r--fs/gfs2/glock.h11
-rw-r--r--fs/gfs2/glops.c136
-rw-r--r--fs/gfs2/incore.h18
-rw-r--r--fs/gfs2/inode.c61
-rw-r--r--fs/gfs2/lm.c8
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h2
-rw-r--r--fs/gfs2/locking/dlm/main.c6
-rw-r--r--fs/gfs2/locking/dlm/mount.c6
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/lops.c14
-rw-r--r--fs/gfs2/ops_address.c134
-rw-r--r--fs/gfs2/ops_dentry.c16
-rw-r--r--fs/gfs2/ops_export.c15
-rw-r--r--fs/gfs2/ops_file.c52
-rw-r--r--fs/gfs2/ops_inode.c55
-rw-r--r--fs/gfs2/ops_super.c11
-rw-r--r--fs/gfs2/ops_vm.c24
-rw-r--r--fs/gfs2/super.c16
-rw-r--r--fs/gfs2/sys.c10
-rw-r--r--fs/jfs/inode.c6
-rw-r--r--fs/jfs/jfs_debug.h5
-rw-r--r--fs/jfs/jfs_dmap.c16
-rw-r--r--fs/jfs/jfs_imap.c16
-rw-r--r--fs/jfs/jfs_incore.h29
-rw-r--r--fs/jfs/jfs_lock.h2
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c15
-rw-r--r--fs/jfs/namei.c48
-rw-r--r--fs/ocfs2/cluster/heartbeat.c158
-rw-r--r--fs/ocfs2/cluster/tcp.c35
-rw-r--r--fs/ocfs2/cluster/tcp.h6
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h12
-rw-r--r--fs/ocfs2/dlm/dlmast.c14
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h130
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c40
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c253
-rw-r--r--fs/ocfs2/dlm/dlmlock.c7
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c579
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c182
-rw-r--r--fs/ocfs2/dlm/dlmthread.c200
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c15
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/vote.c8
-rw-r--r--fs/sysfs/bin.c6
-rw-r--r--fs/sysfs/dir.c214
-rw-r--r--fs/sysfs/file.c82
-rw-r--r--fs/sysfs/group.c2
-rw-r--r--fs/sysfs/inode.c36
-rw-r--r--fs/sysfs/mount.c11
-rw-r--r--fs/sysfs/symlink.c1
-rw-r--r--fs/sysfs/sysfs.h21
80 files changed, 2508 insertions, 1714 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 8cd2417a14db..5e8e9d9ccb33 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -426,7 +426,6 @@ config OCFS2_FS
426 select CONFIGFS_FS 426 select CONFIGFS_FS
427 select JBD 427 select JBD
428 select CRC32 428 select CRC32
429 select INET
430 help 429 help
431 OCFS2 is a general purpose extent based shared disk cluster file 430 OCFS2 is a general purpose extent based shared disk cluster file
432 system with many similarities to ext3. It supports 64 bit inode 431 system with many similarities to ext3. It supports 64 bit inode
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index d04d2f7448d9..85e3850bf2c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,8 @@
1Version 1.47 1Version 1.47
2------------ 2------------
3Fix oops in list_del during mount caused by unaligned string. 3Fix oops in list_del during mount caused by unaligned string.
4Seek to SEEK_END forces check for update of file size for non-cached
5files.
4 6
5Version 1.46 7Version 1.46
6------------ 8------------
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 10c90294cd18..93ef09971d2f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -511,7 +511,15 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
511{ 511{
512 /* origin == SEEK_END => we must revalidate the cached file length */ 512 /* origin == SEEK_END => we must revalidate the cached file length */
513 if (origin == SEEK_END) { 513 if (origin == SEEK_END) {
514 int retval = cifs_revalidate(file->f_path.dentry); 514 int retval;
515
516 /* some applications poll for the file length in this strange
517 way so we must seek to end on non-oplocked files by
518 setting the revalidate time to zero */
519 if(file->f_path.dentry->d_inode)
520 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
521
522 retval = cifs_revalidate(file->f_path.dentry);
515 if (retval < 0) 523 if (retval < 0)
516 return (loff_t)retval; 524 return (loff_t)retval;
517 } 525 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8a49b2e77d37..e9dcf5ee29a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1146,7 +1146,7 @@ static int cifs_writepages(struct address_space *mapping,
1146 pgoff_t end; 1146 pgoff_t end;
1147 pgoff_t index; 1147 pgoff_t index;
1148 int range_whole = 0; 1148 int range_whole = 0;
1149 struct kvec iov[32]; 1149 struct kvec * iov;
1150 int len; 1150 int len;
1151 int n_iov = 0; 1151 int n_iov = 0;
1152 pgoff_t next; 1152 pgoff_t next;
@@ -1171,15 +1171,21 @@ static int cifs_writepages(struct address_space *mapping,
1171 if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) 1171 if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1172 if(cifs_sb->tcon->ses->server->secMode & 1172 if(cifs_sb->tcon->ses->server->secMode &
1173 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 1173 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1174 if(!experimEnabled) 1174 if(!experimEnabled)
1175 return generic_writepages(mapping, wbc); 1175 return generic_writepages(mapping, wbc);
1176 1176
1177 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1178 if(iov == NULL)
1179 return generic_writepages(mapping, wbc);
1180
1181
1177 /* 1182 /*
1178 * BB: Is this meaningful for a non-block-device file system? 1183 * BB: Is this meaningful for a non-block-device file system?
1179 * If it is, we should test it again after we do I/O 1184 * If it is, we should test it again after we do I/O
1180 */ 1185 */
1181 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1186 if (wbc->nonblocking && bdi_write_congested(bdi)) {
1182 wbc->encountered_congestion = 1; 1187 wbc->encountered_congestion = 1;
1188 kfree(iov);
1183 return 0; 1189 return 0;
1184 } 1190 }
1185 1191
@@ -1345,7 +1351,7 @@ retry:
1345 mapping->writeback_index = index; 1351 mapping->writeback_index = index;
1346 1352
1347 FreeXid(xid); 1353 FreeXid(xid);
1348 1354 kfree(iov);
1349 return rc; 1355 return rc;
1350} 1356}
1351 1357
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 99dfb5337e31..782940be550f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -156,9 +156,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
156 tmp_inode->i_atime = cnvrtDosUnixTm( 156 tmp_inode->i_atime = cnvrtDosUnixTm(
157 le16_to_cpu(pfindData->LastAccessDate), 157 le16_to_cpu(pfindData->LastAccessDate),
158 le16_to_cpu(pfindData->LastAccessTime)); 158 le16_to_cpu(pfindData->LastAccessTime));
159 tmp_inode->i_ctime = cnvrtDosUnixTm( 159 tmp_inode->i_ctime = cnvrtDosUnixTm(
160 le16_to_cpu(pfindData->LastWriteDate), 160 le16_to_cpu(pfindData->LastWriteDate),
161 le16_to_cpu(pfindData->LastWriteTime)); 161 le16_to_cpu(pfindData->LastWriteTime));
162 AdjustForTZ(cifs_sb->tcon, tmp_inode); 162 AdjustForTZ(cifs_sb->tcon, tmp_inode);
163 attr = le16_to_cpu(pfindData->Attributes); 163 attr = le16_to_cpu(pfindData->Attributes);
164 allocation_size = le32_to_cpu(pfindData->AllocationSize); 164 allocation_size = le32_to_cpu(pfindData->AllocationSize);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 7a1b2b961ec8..1b1daf63f062 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -196,7 +196,7 @@ dohash(char *out, char *in, char *key, int forw)
196 char c[28]; 196 char c[28];
197 char d[28]; 197 char d[28];
198 char *cd; 198 char *cd;
199 char ki[16][48]; 199 char (*ki)[48];
200 char *pd1; 200 char *pd1;
201 char l[32], r[32]; 201 char l[32], r[32];
202 char *rl; 202 char *rl;
@@ -206,6 +206,12 @@ dohash(char *out, char *in, char *key, int forw)
206 if(pk1 == NULL) 206 if(pk1 == NULL)
207 return; 207 return;
208 208
209 ki = kmalloc(16*48, GFP_KERNEL);
210 if(ki == NULL) {
211 kfree(pk1);
212 return;
213 }
214
209 cd = pk1 + 56; 215 cd = pk1 + 56;
210 pd1= cd + 56; 216 pd1= cd + 56;
211 rl = pd1 + 64; 217 rl = pd1 + 64;
@@ -243,6 +249,7 @@ dohash(char *out, char *in, char *key, int forw)
243 er = kmalloc(48+48+32+32+32, GFP_KERNEL); 249 er = kmalloc(48+48+32+32+32, GFP_KERNEL);
244 if(er == NULL) { 250 if(er == NULL) {
245 kfree(pk1); 251 kfree(pk1);
252 kfree(ki);
246 return; 253 return;
247 } 254 }
248 erk = er+48; 255 erk = er+48;
@@ -290,6 +297,7 @@ dohash(char *out, char *in, char *key, int forw)
290 297
291 permute(out, rl, perm6, 64); 298 permute(out, rl, perm6, 64);
292 kfree(pk1); 299 kfree(pk1);
300 kfree(ki);
293} 301}
294 302
295static void 303static void
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2a7cb086e80c..d98be5e01328 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -162,14 +162,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
162 int error; 162 int error;
163 163
164 if (!buffer->page) 164 if (!buffer->page)
165 buffer->page = (char *)get_zeroed_page(GFP_KERNEL); 165 buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
166 if (!buffer->page) 166 if (!buffer->page)
167 return -ENOMEM; 167 return -ENOMEM;
168 168
169 if (count > PAGE_SIZE) 169 if (count >= PAGE_SIZE)
170 count = PAGE_SIZE; 170 count = PAGE_SIZE - 1;
171 error = copy_from_user(buffer->page,buf,count); 171 error = copy_from_user(buffer->page,buf,count);
172 buffer->needs_read_fill = 1; 172 buffer->needs_read_fill = 1;
173 /* if buf is assumed to contain a string, terminate it by \0,
174 * so e.g. sscanf() can scan the string easily */
175 buffer->page[count] = 0;
173 return error ? -EFAULT : count; 176 return error ? -EFAULT : count;
174} 177}
175 178
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index b5654a284fef..6fa7b0d5c043 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,21 +3,21 @@ menu "Distributed Lock Manager"
3 3
4config DLM 4config DLM
5 tristate "Distributed Lock Manager (DLM)" 5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n 6 depends on SYSFS && (IPV6 || IPV6=n)
7 select CONFIGFS_FS 7 select CONFIGFS_FS
8 select IP_SCTP if DLM_SCTP 8 select IP_SCTP if DLM_SCTP
9 help 9 help
10 A general purpose distributed lock manager for kernel or userspace 10 A general purpose distributed lock manager for kernel or userspace
11 applications. 11 applications.
12 12
13choice 13choice
14 prompt "Select DLM communications protocol" 14 prompt "Select DLM communications protocol"
15 depends on DLM 15 depends on DLM
16 default DLM_TCP 16 default DLM_TCP
17 help 17 help
18 The DLM Can use TCP or SCTP for it's network communications. 18 The DLM Can use TCP or SCTP for it's network communications.
19 SCTP supports multi-homed operations whereas TCP doesn't. 19 SCTP supports multi-homed operations whereas TCP doesn't.
20 However, SCTP seems to have stability problems at the moment. 20 However, SCTP seems to have stability problems at the moment.
21 21
22config DLM_TCP 22config DLM_TCP
23 bool "TCP/IP" 23 bool "TCP/IP"
@@ -31,8 +31,8 @@ config DLM_DEBUG
31 bool "DLM debugging" 31 bool "DLM debugging"
32 depends on DLM 32 depends on DLM
33 help 33 help
34 Under the debugfs mount point, the name of each lockspace will 34 Under the debugfs mount point, the name of each lockspace will
35 appear as a file in the "dlm" directory. The output is the 35 appear as a file in the "dlm" directory. The output is the
36 list of resource and locks the local node knows about. 36 list of resource and locks the local node knows about.
37 37
38endmenu 38endmenu
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 88553054bbfa..8665c88e5af2 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -54,6 +54,11 @@ static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *); 54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *); 55static void release_node(struct config_item *);
56 56
57static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_cluster(struct config_item *i,
60 struct configfs_attribute *a,
61 const char *buf, size_t len);
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 62static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf); 63 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, 64static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
@@ -73,6 +78,101 @@ static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf); 78static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); 79static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75 80
81struct cluster {
82 struct config_group group;
83 unsigned int cl_tcp_port;
84 unsigned int cl_buffer_size;
85 unsigned int cl_rsbtbl_size;
86 unsigned int cl_lkbtbl_size;
87 unsigned int cl_dirtbl_size;
88 unsigned int cl_recover_timer;
89 unsigned int cl_toss_secs;
90 unsigned int cl_scan_secs;
91 unsigned int cl_log_debug;
92};
93
94enum {
95 CLUSTER_ATTR_TCP_PORT = 0,
96 CLUSTER_ATTR_BUFFER_SIZE,
97 CLUSTER_ATTR_RSBTBL_SIZE,
98 CLUSTER_ATTR_LKBTBL_SIZE,
99 CLUSTER_ATTR_DIRTBL_SIZE,
100 CLUSTER_ATTR_RECOVER_TIMER,
101 CLUSTER_ATTR_TOSS_SECS,
102 CLUSTER_ATTR_SCAN_SECS,
103 CLUSTER_ATTR_LOG_DEBUG,
104};
105
106struct cluster_attribute {
107 struct configfs_attribute attr;
108 ssize_t (*show)(struct cluster *, char *);
109 ssize_t (*store)(struct cluster *, const char *, size_t);
110};
111
112static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
113 unsigned int *info_field, int check_zero,
114 const char *buf, size_t len)
115{
116 unsigned int x;
117
118 if (!capable(CAP_SYS_ADMIN))
119 return -EACCES;
120
121 x = simple_strtoul(buf, NULL, 0);
122
123 if (check_zero && !x)
124 return -EINVAL;
125
126 *cl_field = x;
127 *info_field = x;
128
129 return len;
130}
131
132#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
133 .attr = { .ca_name = __stringify(_name), \
134 .ca_mode = _mode, \
135 .ca_owner = THIS_MODULE }, \
136 .show = _read, \
137 .store = _write, \
138}
139
140#define CLUSTER_ATTR(name, check_zero) \
141static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
142{ \
143 return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
144 check_zero, buf, len); \
145} \
146static ssize_t name##_read(struct cluster *cl, char *buf) \
147{ \
148 return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
149} \
150static struct cluster_attribute cluster_attr_##name = \
151__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
152
153CLUSTER_ATTR(tcp_port, 1);
154CLUSTER_ATTR(buffer_size, 1);
155CLUSTER_ATTR(rsbtbl_size, 1);
156CLUSTER_ATTR(lkbtbl_size, 1);
157CLUSTER_ATTR(dirtbl_size, 1);
158CLUSTER_ATTR(recover_timer, 1);
159CLUSTER_ATTR(toss_secs, 1);
160CLUSTER_ATTR(scan_secs, 1);
161CLUSTER_ATTR(log_debug, 0);
162
163static struct configfs_attribute *cluster_attrs[] = {
164 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
165 [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
166 [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
167 [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr,
168 [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr,
169 [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
170 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
171 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
172 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
173 NULL,
174};
175
76enum { 176enum {
77 COMM_ATTR_NODEID = 0, 177 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL, 178 COMM_ATTR_LOCAL,
@@ -152,10 +252,6 @@ struct clusters {
152 struct configfs_subsystem subsys; 252 struct configfs_subsystem subsys;
153}; 253};
154 254
155struct cluster {
156 struct config_group group;
157};
158
159struct spaces { 255struct spaces {
160 struct config_group ss_group; 256 struct config_group ss_group;
161}; 257};
@@ -197,6 +293,8 @@ static struct configfs_group_operations clusters_ops = {
197 293
198static struct configfs_item_operations cluster_ops = { 294static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster, 295 .release = release_cluster,
296 .show_attribute = show_cluster,
297 .store_attribute = store_cluster,
200}; 298};
201 299
202static struct configfs_group_operations spaces_ops = { 300static struct configfs_group_operations spaces_ops = {
@@ -237,6 +335,7 @@ static struct config_item_type clusters_type = {
237 335
238static struct config_item_type cluster_type = { 336static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops, 337 .ct_item_ops = &cluster_ops,
338 .ct_attrs = cluster_attrs,
240 .ct_owner = THIS_MODULE, 339 .ct_owner = THIS_MODULE,
241}; 340};
242 341
@@ -317,6 +416,16 @@ static struct config_group *make_cluster(struct config_group *g,
317 cl->group.default_groups[1] = &cms->cs_group; 416 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL; 417 cl->group.default_groups[2] = NULL;
319 418
419 cl->cl_tcp_port = dlm_config.ci_tcp_port;
420 cl->cl_buffer_size = dlm_config.ci_buffer_size;
421 cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
422 cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size;
423 cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size;
424 cl->cl_recover_timer = dlm_config.ci_recover_timer;
425 cl->cl_toss_secs = dlm_config.ci_toss_secs;
426 cl->cl_scan_secs = dlm_config.ci_scan_secs;
427 cl->cl_log_debug = dlm_config.ci_log_debug;
428
320 space_list = &sps->ss_group; 429 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group; 430 comm_list = &cms->cs_group;
322 return &cl->group; 431 return &cl->group;
@@ -509,6 +618,25 @@ void dlm_config_exit(void)
509 * Functions for user space to read/write attributes 618 * Functions for user space to read/write attributes
510 */ 619 */
511 620
621static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
622 char *buf)
623{
624 struct cluster *cl = to_cluster(i);
625 struct cluster_attribute *cla =
626 container_of(a, struct cluster_attribute, attr);
627 return cla->show ? cla->show(cl, buf) : 0;
628}
629
630static ssize_t store_cluster(struct config_item *i,
631 struct configfs_attribute *a,
632 const char *buf, size_t len)
633{
634 struct cluster *cl = to_cluster(i);
635 struct cluster_attribute *cla =
636 container_of(a, struct cluster_attribute, attr);
637 return cla->store ? cla->store(cl, buf, len) : -EINVAL;
638}
639
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 640static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf) 641 char *buf)
514{ 642{
@@ -775,15 +903,17 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
775#define DEFAULT_RECOVER_TIMER 5 903#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10 904#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5 905#define DEFAULT_SCAN_SECS 5
906#define DEFAULT_LOG_DEBUG 0
778 907
779struct dlm_config_info dlm_config = { 908struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT, 909 .ci_tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE, 910 .ci_buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE, 911 .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE, 912 .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE, 913 .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER, 914 .ci_recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS, 915 .ci_toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS 916 .ci_scan_secs = DEFAULT_SCAN_SECS,
917 .ci_log_debug = DEFAULT_LOG_DEBUG
788}; 918};
789 919
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 9da7839958a9..1e978611a96e 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,14 +17,15 @@
17#define DLM_MAX_ADDR_COUNT 3 17#define DLM_MAX_ADDR_COUNT 3
18 18
19struct dlm_config_info { 19struct dlm_config_info {
20 int tcp_port; 20 int ci_tcp_port;
21 int buffer_size; 21 int ci_buffer_size;
22 int rsbtbl_size; 22 int ci_rsbtbl_size;
23 int lkbtbl_size; 23 int ci_lkbtbl_size;
24 int dirtbl_size; 24 int ci_dirtbl_size;
25 int recover_timer; 25 int ci_recover_timer;
26 int toss_secs; 26 int ci_toss_secs;
27 int scan_secs; 27 int ci_scan_secs;
28 int ci_log_debug;
28}; 29};
29 30
30extern struct dlm_config_info dlm_config; 31extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 1ee8195e6fc0..61d93201e1b2 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -41,6 +41,7 @@
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42 42
43#include <linux/dlm.h> 43#include <linux/dlm.h>
44#include "config.h"
44 45
45#define DLM_LOCKSPACE_LEN 64 46#define DLM_LOCKSPACE_LEN 64
46 47
@@ -69,12 +70,12 @@ struct dlm_mhandle;
69#define log_error(ls, fmt, args...) \ 70#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) 71 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71 72
72#define DLM_LOG_DEBUG 73#define log_debug(ls, fmt, args...) \
73#ifdef DLM_LOG_DEBUG 74do { \
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args) 75 if (dlm_config.ci_log_debug) \
75#else 76 printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
76#define log_debug(ls, fmt, args...) 77 (ls)->ls_name , ##args); \
77#endif 78} while (0)
78 79
79#define DLM_ASSERT(x, do) \ 80#define DLM_ASSERT(x, do) \
80{ \ 81{ \
@@ -309,8 +310,8 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
309 310
310/* dlm_header is first element of all structs sent between nodes */ 311/* dlm_header is first element of all structs sent between nodes */
311 312
312#define DLM_HEADER_MAJOR 0x00020000 313#define DLM_HEADER_MAJOR 0x00030000
313#define DLM_HEADER_MINOR 0x00000001 314#define DLM_HEADER_MINOR 0x00000000
314 315
315#define DLM_MSG 1 316#define DLM_MSG 1
316#define DLM_RCOM 2 317#define DLM_RCOM 2
@@ -386,6 +387,8 @@ struct dlm_rcom {
386 uint32_t rc_type; /* DLM_RCOM_ */ 387 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */ 388 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */ 389 uint64_t rc_id; /* match reply with request */
390 uint64_t rc_seq; /* sender's ls_recover_seq */
391 uint64_t rc_seq_reply; /* remote ls_recover_seq */
389 char rc_buf[0]; 392 char rc_buf[0];
390}; 393};
391 394
@@ -523,6 +526,7 @@ struct dlm_user_proc {
523 spinlock_t asts_spin; 526 spinlock_t asts_spin;
524 struct list_head locks; 527 struct list_head locks;
525 spinlock_t locks_spin; 528 spinlock_t locks_spin;
529 struct list_head unlocking;
526 wait_queue_head_t wait; 530 wait_queue_head_t wait;
527}; 531};
528 532
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 30878defaeb6..e725005fafd0 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -754,6 +754,11 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
754 mutex_unlock(&ls->ls_waiters_mutex); 754 mutex_unlock(&ls->ls_waiters_mutex);
755} 755}
756 756
757/* We clear the RESEND flag because we might be taking an lkb off the waiters
758 list as part of process_requestqueue (e.g. a lookup that has an optimized
759 request reply on the requestqueue) between dlm_recover_waiters_pre() which
760 set RESEND and dlm_recover_waiters_post() */
761
757static int _remove_from_waiters(struct dlm_lkb *lkb) 762static int _remove_from_waiters(struct dlm_lkb *lkb)
758{ 763{
759 int error = 0; 764 int error = 0;
@@ -764,6 +769,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb)
764 goto out; 769 goto out;
765 } 770 }
766 lkb->lkb_wait_type = 0; 771 lkb->lkb_wait_type = 0;
772 lkb->lkb_flags &= ~DLM_IFL_RESEND;
767 list_del(&lkb->lkb_wait_reply); 773 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb); 774 unhold_lkb(lkb);
769 out: 775 out:
@@ -810,7 +816,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, 816 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) { 817 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time + 818 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ)) 819 dlm_config.ci_toss_secs * HZ))
814 continue; 820 continue;
815 found = 1; 821 found = 1;
816 break; 822 break;
@@ -2144,12 +2150,24 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2144 if (lkb->lkb_astaddr) 2150 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP; 2151 ms->m_asts |= AST_COMP;
2146 2152
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP) 2153 /* compare with switch in create_message; send_remove() doesn't
2148 memcpy(ms->m_extra, r->res_name, r->res_length); 2154 use send_args() */
2149 2155
2150 else if (lkb->lkb_lvbptr) 2156 switch (ms->m_type) {
2157 case DLM_MSG_REQUEST:
2158 case DLM_MSG_LOOKUP:
2159 memcpy(ms->m_extra, r->res_name, r->res_length);
2160 break;
2161 case DLM_MSG_CONVERT:
2162 case DLM_MSG_UNLOCK:
2163 case DLM_MSG_REQUEST_REPLY:
2164 case DLM_MSG_CONVERT_REPLY:
2165 case DLM_MSG_GRANT:
2166 if (!lkb->lkb_lvbptr)
2167 break;
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); 2168 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2152 2169 break;
2170 }
2153} 2171}
2154 2172
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) 2173static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
@@ -2418,8 +2436,12 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2418 2436
2419 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); 2437 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2420 2438
2421 if (receive_lvb(ls, lkb, ms)) 2439 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2422 return -ENOMEM; 2440 /* lkb was just created so there won't be an lvb yet */
2441 lkb->lkb_lvbptr = allocate_lvb(ls);
2442 if (!lkb->lkb_lvbptr)
2443 return -ENOMEM;
2444 }
2423 2445
2424 return 0; 2446 return 0;
2425} 2447}
@@ -3002,7 +3024,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3002{ 3024{
3003 struct dlm_message *ms = (struct dlm_message *) hd; 3025 struct dlm_message *ms = (struct dlm_message *) hd;
3004 struct dlm_ls *ls; 3026 struct dlm_ls *ls;
3005 int error; 3027 int error = 0;
3006 3028
3007 if (!recovery) 3029 if (!recovery)
3008 dlm_message_in(ms); 3030 dlm_message_in(ms);
@@ -3119,7 +3141,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3119 out: 3141 out:
3120 dlm_put_lockspace(ls); 3142 dlm_put_lockspace(ls);
3121 dlm_astd_wake(); 3143 dlm_astd_wake();
3122 return 0; 3144 return error;
3123} 3145}
3124 3146
3125 3147
@@ -3132,6 +3154,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3132 if (middle_conversion(lkb)) { 3154 if (middle_conversion(lkb)) {
3133 hold_lkb(lkb); 3155 hold_lkb(lkb);
3134 ls->ls_stub_ms.m_result = -EINPROGRESS; 3156 ls->ls_stub_ms.m_result = -EINPROGRESS;
3157 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3135 _remove_from_waiters(lkb); 3158 _remove_from_waiters(lkb);
3136 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3159 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3137 3160
@@ -3205,6 +3228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3205 case DLM_MSG_UNLOCK: 3228 case DLM_MSG_UNLOCK:
3206 hold_lkb(lkb); 3229 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3230 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3231 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3208 _remove_from_waiters(lkb); 3232 _remove_from_waiters(lkb);
3209 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3233 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb); 3234 dlm_put_lkb(lkb);
@@ -3213,6 +3237,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3213 case DLM_MSG_CANCEL: 3237 case DLM_MSG_CANCEL:
3214 hold_lkb(lkb); 3238 hold_lkb(lkb);
3215 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3239 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3240 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3216 _remove_from_waiters(lkb); 3241 _remove_from_waiters(lkb);
3217 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3242 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3218 dlm_put_lkb(lkb); 3243 dlm_put_lkb(lkb);
@@ -3571,6 +3596,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3571 lock_rsb(r); 3596 lock_rsb(r);
3572 3597
3573 switch (error) { 3598 switch (error) {
3599 case -EBADR:
3600 /* There's a chance the new master received our lock before
3601 dlm_recover_master_reply(), this wouldn't happen if we did
3602 a barrier between recover_masters and recover_locks. */
3603 log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
3604 (unsigned long)r, r->res_name);
3605 dlm_send_rcom_lock(r, lkb);
3606 goto out;
3574 case -EEXIST: 3607 case -EEXIST:
3575 log_debug(ls, "master copy exists %x", lkb->lkb_id); 3608 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3576 /* fall through */ 3609 /* fall through */
@@ -3585,7 +3618,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3585 /* an ack for dlm_recover_locks() which waits for replies from 3618 /* an ack for dlm_recover_locks() which waits for replies from
3586 all the locks it sends to new masters */ 3619 all the locks it sends to new masters */
3587 dlm_recovered_lock(r); 3620 dlm_recovered_lock(r);
3588 3621 out:
3589 unlock_rsb(r); 3622 unlock_rsb(r);
3590 put_rsb(r); 3623 put_rsb(r);
3591 dlm_put_lkb(lkb); 3624 dlm_put_lkb(lkb);
@@ -3610,7 +3643,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3610 } 3643 }
3611 3644
3612 if (flags & DLM_LKF_VALBLK) { 3645 if (flags & DLM_LKF_VALBLK) {
3613 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 3646 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3614 if (!ua->lksb.sb_lvbptr) { 3647 if (!ua->lksb.sb_lvbptr) {
3615 kfree(ua); 3648 kfree(ua);
3616 __put_lkb(ls, lkb); 3649 __put_lkb(ls, lkb);
@@ -3679,7 +3712,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3679 ua = (struct dlm_user_args *)lkb->lkb_astparam; 3712 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3680 3713
3681 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { 3714 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3682 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 3715 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3683 if (!ua->lksb.sb_lvbptr) { 3716 if (!ua->lksb.sb_lvbptr) {
3684 error = -ENOMEM; 3717 error = -ENOMEM;
3685 goto out_put; 3718 goto out_put;
@@ -3745,12 +3778,10 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3745 goto out_put; 3778 goto out_put;
3746 3779
3747 spin_lock(&ua->proc->locks_spin); 3780 spin_lock(&ua->proc->locks_spin);
3748 list_del_init(&lkb->lkb_ownqueue); 3781 /* dlm_user_add_ast() may have already taken lkb off the proc list */
3782 if (!list_empty(&lkb->lkb_ownqueue))
3783 list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
3749 spin_unlock(&ua->proc->locks_spin); 3784 spin_unlock(&ua->proc->locks_spin);
3750
3751 /* this removes the reference for the proc->locks list added by
3752 dlm_user_request */
3753 unhold_lkb(lkb);
3754 out_put: 3785 out_put:
3755 dlm_put_lkb(lkb); 3786 dlm_put_lkb(lkb);
3756 out: 3787 out:
@@ -3790,9 +3821,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3790 /* this lkb was removed from the WAITING queue */ 3821 /* this lkb was removed from the WAITING queue */
3791 if (lkb->lkb_grmode == DLM_LOCK_IV) { 3822 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3792 spin_lock(&ua->proc->locks_spin); 3823 spin_lock(&ua->proc->locks_spin);
3793 list_del_init(&lkb->lkb_ownqueue); 3824 list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
3794 spin_unlock(&ua->proc->locks_spin); 3825 spin_unlock(&ua->proc->locks_spin);
3795 unhold_lkb(lkb);
3796 } 3826 }
3797 out_put: 3827 out_put:
3798 dlm_put_lkb(lkb); 3828 dlm_put_lkb(lkb);
@@ -3853,11 +3883,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3853 mutex_lock(&ls->ls_clear_proc_locks); 3883 mutex_lock(&ls->ls_clear_proc_locks);
3854 3884
3855 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) { 3885 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3856 if (lkb->lkb_ast_type) {
3857 list_del(&lkb->lkb_astqueue);
3858 unhold_lkb(lkb);
3859 }
3860
3861 list_del_init(&lkb->lkb_ownqueue); 3886 list_del_init(&lkb->lkb_ownqueue);
3862 3887
3863 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) { 3888 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
@@ -3874,6 +3899,20 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3874 3899
3875 dlm_put_lkb(lkb); 3900 dlm_put_lkb(lkb);
3876 } 3901 }
3902
3903 /* in-progress unlocks */
3904 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
3905 list_del_init(&lkb->lkb_ownqueue);
3906 lkb->lkb_flags |= DLM_IFL_DEAD;
3907 dlm_put_lkb(lkb);
3908 }
3909
3910 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
3911 list_del(&lkb->lkb_astqueue);
3912 dlm_put_lkb(lkb);
3913 }
3914
3877 mutex_unlock(&ls->ls_clear_proc_locks); 3915 mutex_unlock(&ls->ls_clear_proc_locks);
3878 unlock_recovery(ls); 3916 unlock_recovery(ls);
3879} 3917}
3918
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 59012b089e8d..f40817b53c6f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -236,7 +236,7 @@ static int dlm_scand(void *data)
236 while (!kthread_should_stop()) { 236 while (!kthread_should_stop()) {
237 list_for_each_entry(ls, &lslist, ls_list) 237 list_for_each_entry(ls, &lslist, ls_list)
238 dlm_scan_rsbs(ls); 238 dlm_scan_rsbs(ls);
239 schedule_timeout_interruptible(dlm_config.scan_secs * HZ); 239 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
240 } 240 }
241 return 0; 241 return 0;
242} 242}
@@ -422,7 +422,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
422 ls->ls_count = 0; 422 ls->ls_count = 0;
423 ls->ls_flags = 0; 423 ls->ls_flags = 0;
424 424
425 size = dlm_config.rsbtbl_size; 425 size = dlm_config.ci_rsbtbl_size;
426 ls->ls_rsbtbl_size = size; 426 ls->ls_rsbtbl_size = size;
427 427
428 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); 428 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
@@ -434,7 +434,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
434 rwlock_init(&ls->ls_rsbtbl[i].lock); 434 rwlock_init(&ls->ls_rsbtbl[i].lock);
435 } 435 }
436 436
437 size = dlm_config.lkbtbl_size; 437 size = dlm_config.ci_lkbtbl_size;
438 ls->ls_lkbtbl_size = size; 438 ls->ls_lkbtbl_size = size;
439 439
440 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); 440 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
@@ -446,7 +446,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
446 ls->ls_lkbtbl[i].counter = 1; 446 ls->ls_lkbtbl[i].counter = 1;
447 } 447 }
448 448
449 size = dlm_config.dirtbl_size; 449 size = dlm_config.ci_dirtbl_size;
450 ls->ls_dirtbl_size = size; 450 ls->ls_dirtbl_size = size;
451 451
452 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); 452 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
@@ -489,7 +489,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
489 mutex_init(&ls->ls_requestqueue_mutex); 489 mutex_init(&ls->ls_requestqueue_mutex);
490 mutex_init(&ls->ls_clear_proc_locks); 490 mutex_init(&ls->ls_clear_proc_locks);
491 491
492 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL); 492 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
493 if (!ls->ls_recover_buf) 493 if (!ls->ls_recover_buf)
494 goto out_dirfree; 494 goto out_dirfree;
495 495
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
index fe158d7a9285..dc83a9d979b5 100644
--- a/fs/dlm/lowcomms-sctp.c
+++ b/fs/dlm/lowcomms-sctp.c
@@ -72,6 +72,8 @@ struct nodeinfo {
72 struct list_head writequeue; /* outgoing writequeue_entries */ 72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock; 73 spinlock_t writequeue_lock;
74 int nodeid; 74 int nodeid;
75 struct work_struct swork; /* Send workqueue */
76 struct work_struct lwork; /* Locking workqueue */
75}; 77};
76 78
77static DEFINE_IDR(nodeinfo_idr); 79static DEFINE_IDR(nodeinfo_idr);
@@ -96,6 +98,7 @@ struct connection {
96 atomic_t waiting_requests; 98 atomic_t waiting_requests;
97 struct cbuf cb; 99 struct cbuf cb;
98 int eagain_flag; 100 int eagain_flag;
101 struct work_struct work; /* Send workqueue */
99}; 102};
100 103
101/* An entry waiting to be sent */ 104/* An entry waiting to be sent */
@@ -137,19 +140,23 @@ static void cbuf_eat(struct cbuf *cb, int n)
137static LIST_HEAD(write_nodes); 140static LIST_HEAD(write_nodes);
138static DEFINE_SPINLOCK(write_nodes_lock); 141static DEFINE_SPINLOCK(write_nodes_lock);
139 142
143
140/* Maximum number of incoming messages to process before 144/* Maximum number of incoming messages to process before
141 * doing a schedule() 145 * doing a schedule()
142 */ 146 */
143#define MAX_RX_MSG_COUNT 25 147#define MAX_RX_MSG_COUNT 25
144 148
145/* Manage daemons */ 149/* Work queues */
146static struct task_struct *recv_task; 150static struct workqueue_struct *recv_workqueue;
147static struct task_struct *send_task; 151static struct workqueue_struct *send_workqueue;
148static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait); 152static struct workqueue_struct *lock_workqueue;
149 153
150/* The SCTP connection */ 154/* The SCTP connection */
151static struct connection sctp_con; 155static struct connection sctp_con;
152 156
157static void process_send_sockets(struct work_struct *work);
158static void process_recv_sockets(struct work_struct *work);
159static void process_lock_request(struct work_struct *work);
153 160
154static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) 161static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
155{ 162{
@@ -222,6 +229,8 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
222 spin_lock_init(&ni->lock); 229 spin_lock_init(&ni->lock);
223 INIT_LIST_HEAD(&ni->writequeue); 230 INIT_LIST_HEAD(&ni->writequeue);
224 spin_lock_init(&ni->writequeue_lock); 231 spin_lock_init(&ni->writequeue_lock);
232 INIT_WORK(&ni->lwork, process_lock_request);
233 INIT_WORK(&ni->swork, process_send_sockets);
225 ni->nodeid = nodeid; 234 ni->nodeid = nodeid;
226 235
227 if (nodeid > max_nodeid) 236 if (nodeid > max_nodeid)
@@ -249,11 +258,8 @@ static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
249/* Data or notification available on socket */ 258/* Data or notification available on socket */
250static void lowcomms_data_ready(struct sock *sk, int count_unused) 259static void lowcomms_data_ready(struct sock *sk, int count_unused)
251{ 260{
252 atomic_inc(&sctp_con.waiting_requests);
253 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) 261 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
254 return; 262 queue_work(recv_workqueue, &sctp_con.work);
255
256 wake_up_interruptible(&lowcomms_recv_wait);
257} 263}
258 264
259 265
@@ -361,10 +367,10 @@ static void init_failed(void)
361 spin_lock_bh(&write_nodes_lock); 367 spin_lock_bh(&write_nodes_lock);
362 list_add_tail(&ni->write_list, &write_nodes); 368 list_add_tail(&ni->write_list, &write_nodes);
363 spin_unlock_bh(&write_nodes_lock); 369 spin_unlock_bh(&write_nodes_lock);
370 queue_work(send_workqueue, &ni->swork);
364 } 371 }
365 } 372 }
366 } 373 }
367 wake_up_process(send_task);
368} 374}
369 375
370/* Something happened to an association */ 376/* Something happened to an association */
@@ -446,8 +452,8 @@ static void process_sctp_notification(struct msghdr *msg, char *buf)
446 spin_lock_bh(&write_nodes_lock); 452 spin_lock_bh(&write_nodes_lock);
447 list_add_tail(&ni->write_list, &write_nodes); 453 list_add_tail(&ni->write_list, &write_nodes);
448 spin_unlock_bh(&write_nodes_lock); 454 spin_unlock_bh(&write_nodes_lock);
455 queue_work(send_workqueue, &ni->swork);
449 } 456 }
450 wake_up_process(send_task);
451 } 457 }
452 break; 458 break;
453 459
@@ -580,8 +586,8 @@ static int receive_from_sock(void)
580 spin_lock_bh(&write_nodes_lock); 586 spin_lock_bh(&write_nodes_lock);
581 list_add_tail(&ni->write_list, &write_nodes); 587 list_add_tail(&ni->write_list, &write_nodes);
582 spin_unlock_bh(&write_nodes_lock); 588 spin_unlock_bh(&write_nodes_lock);
589 queue_work(send_workqueue, &ni->swork);
583 } 590 }
584 wake_up_process(send_task);
585 } 591 }
586 } 592 }
587 593
@@ -590,6 +596,7 @@ static int receive_from_sock(void)
590 return 0; 596 return 0;
591 597
592 cbuf_add(&sctp_con.cb, ret); 598 cbuf_add(&sctp_con.cb, ret);
599 // PJC: TODO: Add to node's workqueue....can we ??
593 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), 600 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
594 page_address(sctp_con.rx_page), 601 page_address(sctp_con.rx_page),
595 sctp_con.cb.base, sctp_con.cb.len, 602 sctp_con.cb.base, sctp_con.cb.len,
@@ -635,7 +642,7 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
635 642
636 if (result < 0) 643 if (result < 0)
637 log_print("Can't bind to port %d addr number %d", 644 log_print("Can't bind to port %d addr number %d",
638 dlm_config.tcp_port, num); 645 dlm_config.ci_tcp_port, num);
639 646
640 return result; 647 return result;
641} 648}
@@ -711,7 +718,7 @@ static int init_sock(void)
711 /* Bind to all interfaces. */ 718 /* Bind to all interfaces. */
712 for (i = 0; i < dlm_local_count; i++) { 719 for (i = 0; i < dlm_local_count; i++) {
713 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); 720 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
714 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); 721 make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
715 722
716 result = add_bind_addr(&localaddr, addr_len, num); 723 result = add_bind_addr(&localaddr, addr_len, num);
717 if (result) 724 if (result)
@@ -820,7 +827,8 @@ void dlm_lowcomms_commit_buffer(void *arg)
820 spin_lock_bh(&write_nodes_lock); 827 spin_lock_bh(&write_nodes_lock);
821 list_add_tail(&ni->write_list, &write_nodes); 828 list_add_tail(&ni->write_list, &write_nodes);
822 spin_unlock_bh(&write_nodes_lock); 829 spin_unlock_bh(&write_nodes_lock);
823 wake_up_process(send_task); 830
831 queue_work(send_workqueue, &ni->swork);
824 } 832 }
825 return; 833 return;
826 834
@@ -863,7 +871,7 @@ static void initiate_association(int nodeid)
863 return; 871 return;
864 } 872 }
865 873
866 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); 874 make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
867 875
868 outmessage.msg_name = &rem_addr; 876 outmessage.msg_name = &rem_addr;
869 outmessage.msg_namelen = addrlen; 877 outmessage.msg_namelen = addrlen;
@@ -1088,101 +1096,75 @@ int dlm_lowcomms_close(int nodeid)
1088 return 0; 1096 return 0;
1089} 1097}
1090 1098
1091static int write_list_empty(void) 1099// PJC: The work queue function for receiving.
1100static void process_recv_sockets(struct work_struct *work)
1092{ 1101{
1093 int status; 1102 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1094 1103 int ret;
1095 spin_lock_bh(&write_nodes_lock);
1096 status = list_empty(&write_nodes);
1097 spin_unlock_bh(&write_nodes_lock);
1098
1099 return status;
1100}
1101
1102static int dlm_recvd(void *data)
1103{
1104 DECLARE_WAITQUEUE(wait, current);
1105
1106 while (!kthread_should_stop()) {
1107 int count = 0; 1104 int count = 0;
1108 1105
1109 set_current_state(TASK_INTERRUPTIBLE); 1106 do {
1110 add_wait_queue(&lowcomms_recv_wait, &wait); 1107 ret = receive_from_sock();
1111 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1112 cond_resched();
1113 remove_wait_queue(&lowcomms_recv_wait, &wait);
1114 set_current_state(TASK_RUNNING);
1115
1116 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1117 int ret;
1118
1119 do {
1120 ret = receive_from_sock();
1121 1108
1122 /* Don't starve out everyone else */ 1109 /* Don't starve out everyone else */
1123 if (++count >= MAX_RX_MSG_COUNT) { 1110 if (++count >= MAX_RX_MSG_COUNT) {
1124 cond_resched(); 1111 cond_resched();
1125 count = 0; 1112 count = 0;
1126 } 1113 }
1127 } while (!kthread_should_stop() && ret >=0); 1114 } while (!kthread_should_stop() && ret >=0);
1128 }
1129 cond_resched();
1130 } 1115 }
1131 1116 cond_resched();
1132 return 0;
1133} 1117}
1134 1118
1135static int dlm_sendd(void *data) 1119// PJC: the work queue function for sending
1120static void process_send_sockets(struct work_struct *work)
1136{ 1121{
1137 DECLARE_WAITQUEUE(wait, current); 1122 if (sctp_con.eagain_flag) {
1138 1123 sctp_con.eagain_flag = 0;
1139 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); 1124 refill_write_queue();
1140
1141 while (!kthread_should_stop()) {
1142 set_current_state(TASK_INTERRUPTIBLE);
1143 if (write_list_empty())
1144 cond_resched();
1145 set_current_state(TASK_RUNNING);
1146
1147 if (sctp_con.eagain_flag) {
1148 sctp_con.eagain_flag = 0;
1149 refill_write_queue();
1150 }
1151 process_output_queue();
1152 } 1125 }
1126 process_output_queue();
1127}
1153 1128
1154 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); 1129// PJC: Process lock requests from a particular node.
1155 1130// TODO: can we optimise this out on UP ??
1156 return 0; 1131static void process_lock_request(struct work_struct *work)
1132{
1157} 1133}
1158 1134
1159static void daemons_stop(void) 1135static void daemons_stop(void)
1160{ 1136{
1161 kthread_stop(recv_task); 1137 destroy_workqueue(recv_workqueue);
1162 kthread_stop(send_task); 1138 destroy_workqueue(send_workqueue);
1139 destroy_workqueue(lock_workqueue);
1163} 1140}
1164 1141
1165static int daemons_start(void) 1142static int daemons_start(void)
1166{ 1143{
1167 struct task_struct *p;
1168 int error; 1144 int error;
1145 recv_workqueue = create_workqueue("dlm_recv");
1146 error = IS_ERR(recv_workqueue);
1147 if (error) {
1148 log_print("can't start dlm_recv %d", error);
1149 return error;
1150 }
1169 1151
1170 p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); 1152 send_workqueue = create_singlethread_workqueue("dlm_send");
1171 error = IS_ERR(p); 1153 error = IS_ERR(send_workqueue);
1172 if (error) { 1154 if (error) {
1173 log_print("can't start dlm_recvd %d", error); 1155 log_print("can't start dlm_send %d", error);
1156 destroy_workqueue(recv_workqueue);
1174 return error; 1157 return error;
1175 } 1158 }
1176 recv_task = p;
1177 1159
1178 p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); 1160 lock_workqueue = create_workqueue("dlm_rlock");
1179 error = IS_ERR(p); 1161 error = IS_ERR(lock_workqueue);
1180 if (error) { 1162 if (error) {
1181 log_print("can't start dlm_sendd %d", error); 1163 log_print("can't start dlm_rlock %d", error);
1182 kthread_stop(recv_task); 1164 destroy_workqueue(send_workqueue);
1165 destroy_workqueue(recv_workqueue);
1183 return error; 1166 return error;
1184 } 1167 }
1185 send_task = p;
1186 1168
1187 return 0; 1169 return 0;
1188} 1170}
@@ -1194,6 +1176,8 @@ int dlm_lowcomms_start(void)
1194{ 1176{
1195 int error; 1177 int error;
1196 1178
1179 INIT_WORK(&sctp_con.work, process_recv_sockets);
1180
1197 error = init_sock(); 1181 error = init_sock();
1198 if (error) 1182 if (error)
1199 goto fail_sock; 1183 goto fail_sock;
@@ -1224,4 +1208,3 @@ void dlm_lowcomms_stop(void)
1224 for (i = 0; i < dlm_local_count; i++) 1208 for (i = 0; i < dlm_local_count; i++)
1225 kfree(dlm_local_addr[i]); 1209 kfree(dlm_local_addr[i]);
1226} 1210}
1227
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
index 9be3a440c42a..f1efd17b2614 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms-tcp.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -96,10 +96,7 @@ static bool cbuf_empty(struct cbuf *cb)
96struct connection { 96struct connection {
97 struct socket *sock; /* NULL if not connected */ 97 struct socket *sock; /* NULL if not connected */
98 uint32_t nodeid; /* So we know who we are in the list */ 98 uint32_t nodeid; /* So we know who we are in the list */
99 struct rw_semaphore sock_sem; /* Stop connect races */ 99 struct mutex sock_mutex;
100 struct list_head read_list; /* On this list when ready for reading */
101 struct list_head write_list; /* On this list when ready for writing */
102 struct list_head state_list; /* On this list when ready to connect */
103 unsigned long flags; /* bit 1,2 = We are on the read/write lists */ 100 unsigned long flags; /* bit 1,2 = We are on the read/write lists */
104#define CF_READ_PENDING 1 101#define CF_READ_PENDING 1
105#define CF_WRITE_PENDING 2 102#define CF_WRITE_PENDING 2
@@ -112,9 +109,10 @@ struct connection {
112 struct page *rx_page; 109 struct page *rx_page;
113 struct cbuf cb; 110 struct cbuf cb;
114 int retries; 111 int retries;
115 atomic_t waiting_requests;
116#define MAX_CONNECT_RETRIES 3 112#define MAX_CONNECT_RETRIES 3
117 struct connection *othercon; 113 struct connection *othercon;
114 struct work_struct rwork; /* Receive workqueue */
115 struct work_struct swork; /* Send workqueue */
118}; 116};
119#define sock2con(x) ((struct connection *)(x)->sk_user_data) 117#define sock2con(x) ((struct connection *)(x)->sk_user_data)
120 118
@@ -131,14 +129,9 @@ struct writequeue_entry {
131 129
132static struct sockaddr_storage dlm_local_addr; 130static struct sockaddr_storage dlm_local_addr;
133 131
134/* Manage daemons */ 132/* Work queues */
135static struct task_struct *recv_task; 133static struct workqueue_struct *recv_workqueue;
136static struct task_struct *send_task; 134static struct workqueue_struct *send_workqueue;
137
138static wait_queue_t lowcomms_send_waitq_head;
139static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq);
140static wait_queue_t lowcomms_recv_waitq_head;
141static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq);
142 135
143/* An array of pointers to connections, indexed by NODEID */ 136/* An array of pointers to connections, indexed by NODEID */
144static struct connection **connections; 137static struct connection **connections;
@@ -146,17 +139,8 @@ static DECLARE_MUTEX(connections_lock);
146static struct kmem_cache *con_cache; 139static struct kmem_cache *con_cache;
147static int conn_array_size; 140static int conn_array_size;
148 141
149/* List of sockets that have reads pending */ 142static void process_recv_sockets(struct work_struct *work);
150static LIST_HEAD(read_sockets); 143static void process_send_sockets(struct work_struct *work);
151static DEFINE_SPINLOCK(read_sockets_lock);
152
153/* List of sockets which have writes pending */
154static LIST_HEAD(write_sockets);
155static DEFINE_SPINLOCK(write_sockets_lock);
156
157/* List of sockets which have connects pending */
158static LIST_HEAD(state_sockets);
159static DEFINE_SPINLOCK(state_sockets_lock);
160 144
161static struct connection *nodeid2con(int nodeid, gfp_t allocation) 145static struct connection *nodeid2con(int nodeid, gfp_t allocation)
162{ 146{
@@ -186,9 +170,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
186 goto finish; 170 goto finish;
187 171
188 con->nodeid = nodeid; 172 con->nodeid = nodeid;
189 init_rwsem(&con->sock_sem); 173 mutex_init(&con->sock_mutex);
190 INIT_LIST_HEAD(&con->writequeue); 174 INIT_LIST_HEAD(&con->writequeue);
191 spin_lock_init(&con->writequeue_lock); 175 spin_lock_init(&con->writequeue_lock);
176 INIT_WORK(&con->swork, process_send_sockets);
177 INIT_WORK(&con->rwork, process_recv_sockets);
192 178
193 connections[nodeid] = con; 179 connections[nodeid] = con;
194 } 180 }
@@ -203,41 +189,22 @@ static void lowcomms_data_ready(struct sock *sk, int count_unused)
203{ 189{
204 struct connection *con = sock2con(sk); 190 struct connection *con = sock2con(sk);
205 191
206 atomic_inc(&con->waiting_requests); 192 if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
207 if (test_and_set_bit(CF_READ_PENDING, &con->flags)) 193 queue_work(recv_workqueue, &con->rwork);
208 return;
209
210 spin_lock_bh(&read_sockets_lock);
211 list_add_tail(&con->read_list, &read_sockets);
212 spin_unlock_bh(&read_sockets_lock);
213
214 wake_up_interruptible(&lowcomms_recv_waitq);
215} 194}
216 195
217static void lowcomms_write_space(struct sock *sk) 196static void lowcomms_write_space(struct sock *sk)
218{ 197{
219 struct connection *con = sock2con(sk); 198 struct connection *con = sock2con(sk);
220 199
221 if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 200 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
222 return; 201 queue_work(send_workqueue, &con->swork);
223
224 spin_lock_bh(&write_sockets_lock);
225 list_add_tail(&con->write_list, &write_sockets);
226 spin_unlock_bh(&write_sockets_lock);
227
228 wake_up_interruptible(&lowcomms_send_waitq);
229} 202}
230 203
231static inline void lowcomms_connect_sock(struct connection *con) 204static inline void lowcomms_connect_sock(struct connection *con)
232{ 205{
233 if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) 206 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
234 return; 207 queue_work(send_workqueue, &con->swork);
235
236 spin_lock_bh(&state_sockets_lock);
237 list_add_tail(&con->state_list, &state_sockets);
238 spin_unlock_bh(&state_sockets_lock);
239
240 wake_up_interruptible(&lowcomms_send_waitq);
241} 208}
242 209
243static void lowcomms_state_change(struct sock *sk) 210static void lowcomms_state_change(struct sock *sk)
@@ -279,7 +246,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
279/* Close a remote connection and tidy up */ 246/* Close a remote connection and tidy up */
280static void close_connection(struct connection *con, bool and_other) 247static void close_connection(struct connection *con, bool and_other)
281{ 248{
282 down_write(&con->sock_sem); 249 mutex_lock(&con->sock_mutex);
283 250
284 if (con->sock) { 251 if (con->sock) {
285 sock_release(con->sock); 252 sock_release(con->sock);
@@ -294,7 +261,7 @@ static void close_connection(struct connection *con, bool and_other)
294 con->rx_page = NULL; 261 con->rx_page = NULL;
295 } 262 }
296 con->retries = 0; 263 con->retries = 0;
297 up_write(&con->sock_sem); 264 mutex_unlock(&con->sock_mutex);
298} 265}
299 266
300/* Data received from remote end */ 267/* Data received from remote end */
@@ -308,10 +275,13 @@ static int receive_from_sock(struct connection *con)
308 int r; 275 int r;
309 int call_again_soon = 0; 276 int call_again_soon = 0;
310 277
311 down_read(&con->sock_sem); 278 mutex_lock(&con->sock_mutex);
279
280 if (con->sock == NULL) {
281 ret = -EAGAIN;
282 goto out_close;
283 }
312 284
313 if (con->sock == NULL)
314 goto out;
315 if (con->rx_page == NULL) { 285 if (con->rx_page == NULL) {
316 /* 286 /*
317 * This doesn't need to be atomic, but I think it should 287 * This doesn't need to be atomic, but I think it should
@@ -359,6 +329,9 @@ static int receive_from_sock(struct connection *con)
359 329
360 if (ret <= 0) 330 if (ret <= 0)
361 goto out_close; 331 goto out_close;
332 if (ret == -EAGAIN)
333 goto out_resched;
334
362 if (ret == len) 335 if (ret == len)
363 call_again_soon = 1; 336 call_again_soon = 1;
364 cbuf_add(&con->cb, ret); 337 cbuf_add(&con->cb, ret);
@@ -381,24 +354,26 @@ static int receive_from_sock(struct connection *con)
381 con->rx_page = NULL; 354 con->rx_page = NULL;
382 } 355 }
383 356
384out:
385 if (call_again_soon) 357 if (call_again_soon)
386 goto out_resched; 358 goto out_resched;
387 up_read(&con->sock_sem); 359 mutex_unlock(&con->sock_mutex);
388 return 0; 360 return 0;
389 361
390out_resched: 362out_resched:
391 lowcomms_data_ready(con->sock->sk, 0); 363 if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
392 up_read(&con->sock_sem); 364 queue_work(recv_workqueue, &con->rwork);
393 cond_resched(); 365 mutex_unlock(&con->sock_mutex);
394 return 0; 366 return -EAGAIN;
395 367
396out_close: 368out_close:
397 up_read(&con->sock_sem); 369 mutex_unlock(&con->sock_mutex);
398 if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { 370 if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
399 close_connection(con, false); 371 close_connection(con, false);
400 /* Reconnect when there is something to send */ 372 /* Reconnect when there is something to send */
401 } 373 }
374 /* Don't return success if we really got EOF */
375 if (ret == 0)
376 ret = -EAGAIN;
402 377
403 return ret; 378 return ret;
404} 379}
@@ -412,6 +387,7 @@ static int accept_from_sock(struct connection *con)
412 int len; 387 int len;
413 int nodeid; 388 int nodeid;
414 struct connection *newcon; 389 struct connection *newcon;
390 struct connection *addcon;
415 391
416 memset(&peeraddr, 0, sizeof(peeraddr)); 392 memset(&peeraddr, 0, sizeof(peeraddr));
417 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, 393 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
@@ -419,7 +395,7 @@ static int accept_from_sock(struct connection *con)
419 if (result < 0) 395 if (result < 0)
420 return -ENOMEM; 396 return -ENOMEM;
421 397
422 down_read(&con->sock_sem); 398 mutex_lock_nested(&con->sock_mutex, 0);
423 399
424 result = -ENOTCONN; 400 result = -ENOTCONN;
425 if (con->sock == NULL) 401 if (con->sock == NULL)
@@ -445,7 +421,7 @@ static int accept_from_sock(struct connection *con)
445 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { 421 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
446 printk("dlm: connect from non cluster node\n"); 422 printk("dlm: connect from non cluster node\n");
447 sock_release(newsock); 423 sock_release(newsock);
448 up_read(&con->sock_sem); 424 mutex_unlock(&con->sock_mutex);
449 return -1; 425 return -1;
450 } 426 }
451 427
@@ -462,7 +438,7 @@ static int accept_from_sock(struct connection *con)
462 result = -ENOMEM; 438 result = -ENOMEM;
463 goto accept_err; 439 goto accept_err;
464 } 440 }
465 down_write(&newcon->sock_sem); 441 mutex_lock_nested(&newcon->sock_mutex, 1);
466 if (newcon->sock) { 442 if (newcon->sock) {
467 struct connection *othercon = newcon->othercon; 443 struct connection *othercon = newcon->othercon;
468 444
@@ -470,41 +446,45 @@ static int accept_from_sock(struct connection *con)
470 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 446 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
471 if (!othercon) { 447 if (!othercon) {
472 printk("dlm: failed to allocate incoming socket\n"); 448 printk("dlm: failed to allocate incoming socket\n");
473 up_write(&newcon->sock_sem); 449 mutex_unlock(&newcon->sock_mutex);
474 result = -ENOMEM; 450 result = -ENOMEM;
475 goto accept_err; 451 goto accept_err;
476 } 452 }
477 othercon->nodeid = nodeid; 453 othercon->nodeid = nodeid;
478 othercon->rx_action = receive_from_sock; 454 othercon->rx_action = receive_from_sock;
479 init_rwsem(&othercon->sock_sem); 455 mutex_init(&othercon->sock_mutex);
456 INIT_WORK(&othercon->swork, process_send_sockets);
457 INIT_WORK(&othercon->rwork, process_recv_sockets);
480 set_bit(CF_IS_OTHERCON, &othercon->flags); 458 set_bit(CF_IS_OTHERCON, &othercon->flags);
481 newcon->othercon = othercon; 459 newcon->othercon = othercon;
482 } 460 }
483 othercon->sock = newsock; 461 othercon->sock = newsock;
484 newsock->sk->sk_user_data = othercon; 462 newsock->sk->sk_user_data = othercon;
485 add_sock(newsock, othercon); 463 add_sock(newsock, othercon);
464 addcon = othercon;
486 } 465 }
487 else { 466 else {
488 newsock->sk->sk_user_data = newcon; 467 newsock->sk->sk_user_data = newcon;
489 newcon->rx_action = receive_from_sock; 468 newcon->rx_action = receive_from_sock;
490 add_sock(newsock, newcon); 469 add_sock(newsock, newcon);
491 470 addcon = newcon;
492 } 471 }
493 472
494 up_write(&newcon->sock_sem); 473 mutex_unlock(&newcon->sock_mutex);
495 474
496 /* 475 /*
497 * Add it to the active queue in case we got data 476 * Add it to the active queue in case we got data
498 * beween processing the accept adding the socket 477 * beween processing the accept adding the socket
499 * to the read_sockets list 478 * to the read_sockets list
500 */ 479 */
501 lowcomms_data_ready(newsock->sk, 0); 480 if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
502 up_read(&con->sock_sem); 481 queue_work(recv_workqueue, &addcon->rwork);
482 mutex_unlock(&con->sock_mutex);
503 483
504 return 0; 484 return 0;
505 485
506accept_err: 486accept_err:
507 up_read(&con->sock_sem); 487 mutex_unlock(&con->sock_mutex);
508 sock_release(newsock); 488 sock_release(newsock);
509 489
510 if (result != -EAGAIN) 490 if (result != -EAGAIN)
@@ -525,7 +505,7 @@ static void connect_to_sock(struct connection *con)
525 return; 505 return;
526 } 506 }
527 507
528 down_write(&con->sock_sem); 508 mutex_lock(&con->sock_mutex);
529 if (con->retries++ > MAX_CONNECT_RETRIES) 509 if (con->retries++ > MAX_CONNECT_RETRIES)
530 goto out; 510 goto out;
531 511
@@ -548,7 +528,7 @@ static void connect_to_sock(struct connection *con)
548 sock->sk->sk_user_data = con; 528 sock->sk->sk_user_data = con;
549 con->rx_action = receive_from_sock; 529 con->rx_action = receive_from_sock;
550 530
551 make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); 531 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
552 532
553 add_sock(sock, con); 533 add_sock(sock, con);
554 534
@@ -577,7 +557,7 @@ out_err:
577 result = 0; 557 result = 0;
578 } 558 }
579out: 559out:
580 up_write(&con->sock_sem); 560 mutex_unlock(&con->sock_mutex);
581 return; 561 return;
582} 562}
583 563
@@ -616,10 +596,10 @@ static struct socket *create_listen_sock(struct connection *con,
616 con->sock = sock; 596 con->sock = sock;
617 597
618 /* Bind to our port */ 598 /* Bind to our port */
619 make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); 599 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
620 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); 600 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
621 if (result < 0) { 601 if (result < 0) {
622 printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); 602 printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
623 sock_release(sock); 603 sock_release(sock);
624 sock = NULL; 604 sock = NULL;
625 con->sock = NULL; 605 con->sock = NULL;
@@ -638,7 +618,7 @@ static struct socket *create_listen_sock(struct connection *con,
638 618
639 result = sock->ops->listen(sock, 5); 619 result = sock->ops->listen(sock, 5);
640 if (result < 0) { 620 if (result < 0) {
641 printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); 621 printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
642 sock_release(sock); 622 sock_release(sock);
643 sock = NULL; 623 sock = NULL;
644 goto create_out; 624 goto create_out;
@@ -709,6 +689,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len,
709 if (!con) 689 if (!con)
710 return NULL; 690 return NULL;
711 691
692 spin_lock(&con->writequeue_lock);
712 e = list_entry(con->writequeue.prev, struct writequeue_entry, list); 693 e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
713 if ((&e->list == &con->writequeue) || 694 if ((&e->list == &con->writequeue) ||
714 (PAGE_CACHE_SIZE - e->end < len)) { 695 (PAGE_CACHE_SIZE - e->end < len)) {
@@ -747,6 +728,7 @@ void dlm_lowcomms_commit_buffer(void *mh)
747 struct connection *con = e->con; 728 struct connection *con = e->con;
748 int users; 729 int users;
749 730
731 spin_lock(&con->writequeue_lock);
750 users = --e->users; 732 users = --e->users;
751 if (users) 733 if (users)
752 goto out; 734 goto out;
@@ -754,12 +736,8 @@ void dlm_lowcomms_commit_buffer(void *mh)
754 kunmap(e->page); 736 kunmap(e->page);
755 spin_unlock(&con->writequeue_lock); 737 spin_unlock(&con->writequeue_lock);
756 738
757 if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { 739 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
758 spin_lock_bh(&write_sockets_lock); 740 queue_work(send_workqueue, &con->swork);
759 list_add_tail(&con->write_list, &write_sockets);
760 spin_unlock_bh(&write_sockets_lock);
761
762 wake_up_interruptible(&lowcomms_send_waitq);
763 } 741 }
764 return; 742 return;
765 743
@@ -783,7 +761,7 @@ static void send_to_sock(struct connection *con)
783 struct writequeue_entry *e; 761 struct writequeue_entry *e;
784 int len, offset; 762 int len, offset;
785 763
786 down_read(&con->sock_sem); 764 mutex_lock(&con->sock_mutex);
787 if (con->sock == NULL) 765 if (con->sock == NULL)
788 goto out_connect; 766 goto out_connect;
789 767
@@ -800,6 +778,7 @@ static void send_to_sock(struct connection *con)
800 offset = e->offset; 778 offset = e->offset;
801 BUG_ON(len == 0 && e->users == 0); 779 BUG_ON(len == 0 && e->users == 0);
802 spin_unlock(&con->writequeue_lock); 780 spin_unlock(&con->writequeue_lock);
781 kmap(e->page);
803 782
804 ret = 0; 783 ret = 0;
805 if (len) { 784 if (len) {
@@ -828,18 +807,18 @@ static void send_to_sock(struct connection *con)
828 } 807 }
829 spin_unlock(&con->writequeue_lock); 808 spin_unlock(&con->writequeue_lock);
830out: 809out:
831 up_read(&con->sock_sem); 810 mutex_unlock(&con->sock_mutex);
832 return; 811 return;
833 812
834send_error: 813send_error:
835 up_read(&con->sock_sem); 814 mutex_unlock(&con->sock_mutex);
836 close_connection(con, false); 815 close_connection(con, false);
837 lowcomms_connect_sock(con); 816 lowcomms_connect_sock(con);
838 return; 817 return;
839 818
840out_connect: 819out_connect:
841 up_read(&con->sock_sem); 820 mutex_unlock(&con->sock_mutex);
842 lowcomms_connect_sock(con); 821 connect_to_sock(con);
843 return; 822 return;
844} 823}
845 824
@@ -872,7 +851,6 @@ int dlm_lowcomms_close(int nodeid)
872 if (con) { 851 if (con) {
873 clean_one_writequeue(con); 852 clean_one_writequeue(con);
874 close_connection(con, true); 853 close_connection(con, true);
875 atomic_set(&con->waiting_requests, 0);
876 } 854 }
877 return 0; 855 return 0;
878 856
@@ -880,102 +858,29 @@ out:
880 return -1; 858 return -1;
881} 859}
882 860
883/* API send message call, may queue the request */
884/* N.B. This is the old interface - use the new one for new calls */
885int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation)
886{
887 struct writequeue_entry *e;
888 char *b;
889
890 e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b);
891 if (e) {
892 memcpy(b, buf, len);
893 dlm_lowcomms_commit_buffer(e);
894 return 0;
895 }
896 return -ENOBUFS;
897}
898
899/* Look for activity on active sockets */ 861/* Look for activity on active sockets */
900static void process_sockets(void) 862static void process_recv_sockets(struct work_struct *work)
901{ 863{
902 struct list_head *list; 864 struct connection *con = container_of(work, struct connection, rwork);
903 struct list_head *temp; 865 int err;
904 int count = 0;
905
906 spin_lock_bh(&read_sockets_lock);
907 list_for_each_safe(list, temp, &read_sockets) {
908 866
909 struct connection *con = 867 clear_bit(CF_READ_PENDING, &con->flags);
910 list_entry(list, struct connection, read_list); 868 do {
911 list_del(&con->read_list); 869 err = con->rx_action(con);
912 clear_bit(CF_READ_PENDING, &con->flags); 870 } while (!err);
913
914 spin_unlock_bh(&read_sockets_lock);
915
916 /* This can reach zero if we are processing requests
917 * as they come in.
918 */
919 if (atomic_read(&con->waiting_requests) == 0) {
920 spin_lock_bh(&read_sockets_lock);
921 continue;
922 }
923
924 do {
925 con->rx_action(con);
926
927 /* Don't starve out everyone else */
928 if (++count >= MAX_RX_MSG_COUNT) {
929 cond_resched();
930 count = 0;
931 }
932
933 } while (!atomic_dec_and_test(&con->waiting_requests) &&
934 !kthread_should_stop());
935
936 spin_lock_bh(&read_sockets_lock);
937 }
938 spin_unlock_bh(&read_sockets_lock);
939} 871}
940 872
941/* Try to send any messages that are pending
942 */
943static void process_output_queue(void)
944{
945 struct list_head *list;
946 struct list_head *temp;
947
948 spin_lock_bh(&write_sockets_lock);
949 list_for_each_safe(list, temp, &write_sockets) {
950 struct connection *con =
951 list_entry(list, struct connection, write_list);
952 clear_bit(CF_WRITE_PENDING, &con->flags);
953 list_del(&con->write_list);
954
955 spin_unlock_bh(&write_sockets_lock);
956 send_to_sock(con);
957 spin_lock_bh(&write_sockets_lock);
958 }
959 spin_unlock_bh(&write_sockets_lock);
960}
961 873
962static void process_state_queue(void) 874static void process_send_sockets(struct work_struct *work)
963{ 875{
964 struct list_head *list; 876 struct connection *con = container_of(work, struct connection, swork);
965 struct list_head *temp;
966
967 spin_lock_bh(&state_sockets_lock);
968 list_for_each_safe(list, temp, &state_sockets) {
969 struct connection *con =
970 list_entry(list, struct connection, state_list);
971 list_del(&con->state_list);
972 clear_bit(CF_CONNECT_PENDING, &con->flags);
973 spin_unlock_bh(&state_sockets_lock);
974 877
878 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
975 connect_to_sock(con); 879 connect_to_sock(con);
976 spin_lock_bh(&state_sockets_lock);
977 } 880 }
978 spin_unlock_bh(&state_sockets_lock); 881
882 clear_bit(CF_WRITE_PENDING, &con->flags);
883 send_to_sock(con);
979} 884}
980 885
981 886
@@ -992,109 +897,33 @@ static void clean_writequeues(void)
992 } 897 }
993} 898}
994 899
995static int read_list_empty(void) 900static void work_stop(void)
996{ 901{
997 int status; 902 destroy_workqueue(recv_workqueue);
998 903 destroy_workqueue(send_workqueue);
999 spin_lock_bh(&read_sockets_lock);
1000 status = list_empty(&read_sockets);
1001 spin_unlock_bh(&read_sockets_lock);
1002
1003 return status;
1004}
1005
1006/* DLM Transport comms receive daemon */
1007static int dlm_recvd(void *data)
1008{
1009 init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
1010 add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
1011
1012 while (!kthread_should_stop()) {
1013 set_current_state(TASK_INTERRUPTIBLE);
1014 if (read_list_empty())
1015 cond_resched();
1016 set_current_state(TASK_RUNNING);
1017
1018 process_sockets();
1019 }
1020
1021 return 0;
1022} 904}
1023 905
1024static int write_and_state_lists_empty(void) 906static int work_start(void)
1025{ 907{
1026 int status;
1027
1028 spin_lock_bh(&write_sockets_lock);
1029 status = list_empty(&write_sockets);
1030 spin_unlock_bh(&write_sockets_lock);
1031
1032 spin_lock_bh(&state_sockets_lock);
1033 if (list_empty(&state_sockets) == 0)
1034 status = 0;
1035 spin_unlock_bh(&state_sockets_lock);
1036
1037 return status;
1038}
1039
1040/* DLM Transport send daemon */
1041static int dlm_sendd(void *data)
1042{
1043 init_waitqueue_entry(&lowcomms_send_waitq_head, current);
1044 add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
1045
1046 while (!kthread_should_stop()) {
1047 set_current_state(TASK_INTERRUPTIBLE);
1048 if (write_and_state_lists_empty())
1049 cond_resched();
1050 set_current_state(TASK_RUNNING);
1051
1052 process_state_queue();
1053 process_output_queue();
1054 }
1055
1056 return 0;
1057}
1058
1059static void daemons_stop(void)
1060{
1061 kthread_stop(recv_task);
1062 kthread_stop(send_task);
1063}
1064
1065static int daemons_start(void)
1066{
1067 struct task_struct *p;
1068 int error; 908 int error;
1069 909 recv_workqueue = create_workqueue("dlm_recv");
1070 p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); 910 error = IS_ERR(recv_workqueue);
1071 error = IS_ERR(p);
1072 if (error) { 911 if (error) {
1073 log_print("can't start dlm_recvd %d", error); 912 log_print("can't start dlm_recv %d", error);
1074 return error; 913 return error;
1075 } 914 }
1076 recv_task = p;
1077 915
1078 p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); 916 send_workqueue = create_singlethread_workqueue("dlm_send");
1079 error = IS_ERR(p); 917 error = IS_ERR(send_workqueue);
1080 if (error) { 918 if (error) {
1081 log_print("can't start dlm_sendd %d", error); 919 log_print("can't start dlm_send %d", error);
1082 kthread_stop(recv_task); 920 destroy_workqueue(recv_workqueue);
1083 return error; 921 return error;
1084 } 922 }
1085 send_task = p;
1086 923
1087 return 0; 924 return 0;
1088} 925}
1089 926
1090/*
1091 * Return the largest buffer size we can cope with.
1092 */
1093int lowcomms_max_buffer_size(void)
1094{
1095 return PAGE_CACHE_SIZE;
1096}
1097
1098void dlm_lowcomms_stop(void) 927void dlm_lowcomms_stop(void)
1099{ 928{
1100 int i; 929 int i;
@@ -1107,7 +936,7 @@ void dlm_lowcomms_stop(void)
1107 connections[i]->flags |= 0xFF; 936 connections[i]->flags |= 0xFF;
1108 } 937 }
1109 938
1110 daemons_stop(); 939 work_stop();
1111 clean_writequeues(); 940 clean_writequeues();
1112 941
1113 for (i = 0; i < conn_array_size; i++) { 942 for (i = 0; i < conn_array_size; i++) {
@@ -1159,7 +988,7 @@ int dlm_lowcomms_start(void)
1159 if (error) 988 if (error)
1160 goto fail_unlisten; 989 goto fail_unlisten;
1161 990
1162 error = daemons_start(); 991 error = work_start();
1163 if (error) 992 if (error)
1164 goto fail_unlisten; 993 goto fail_unlisten;
1165 994
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index c9b1c3d535f4..a5126e0c68a6 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -82,7 +82,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
82 if (msglen < sizeof(struct dlm_header)) 82 if (msglen < sizeof(struct dlm_header))
83 break; 83 break;
84 err = -E2BIG; 84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) { 85 if (msglen > dlm_config.ci_buffer_size) {
86 log_print("message size %d from %d too big, buf len %d", 86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len); 87 msglen, nodeid, len);
88 break; 88 break;
@@ -103,7 +103,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
103 103
104 if (msglen > sizeof(__tmp) && 104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) { 105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); 106 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
107 if (msg == NULL) 107 if (msg == NULL)
108 return ret; 108 return ret;
109 } 109 }
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4cc31be9cd9d..6bfbd6153809 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
56 56
57 rc->rc_type = type; 57 rc->rc_type = type;
58 58
59 spin_lock(&ls->ls_recover_lock);
60 rc->rc_seq = ls->ls_recover_seq;
61 spin_unlock(&ls->ls_recover_lock);
62
59 *mh_ret = mh; 63 *mh_ret = mh;
60 *rc_ret = rc; 64 *rc_ret = rc;
61 return 0; 65 return 0;
@@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
78 rf->rf_lsflags = ls->ls_exflags; 82 rf->rf_lsflags = ls->ls_exflags;
79} 83}
80 84
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) 85static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
82{ 86{
87 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
88
89 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
90 log_error(ls, "version mismatch: %x nodeid %d: %x",
91 DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
92 rc->rc_header.h_version);
93 return -EINVAL;
94 }
95
83 if (rf->rf_lvblen != ls->ls_lvblen || 96 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) { 97 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 98 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -125,7 +138,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
125 goto out; 138 goto out;
126 139
127 allow_sync_reply(ls, &rc->rc_id); 140 allow_sync_reply(ls, &rc->rc_id);
128 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); 141 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
129 142
130 send_rcom(ls, mh, rc); 143 send_rcom(ls, mh, rc);
131 144
@@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
141 log_debug(ls, "remote node %d not ready", nodeid); 154 log_debug(ls, "remote node %d not ready", nodeid);
142 rc->rc_result = 0; 155 rc->rc_result = 0;
143 } else 156 } else
144 error = check_config(ls, (struct rcom_config *) rc->rc_buf, 157 error = check_config(ls, rc, nodeid);
145 nodeid);
146 /* the caller looks at rc_result for the remote recovery status */ 158 /* the caller looks at rc_result for the remote recovery status */
147 out: 159 out:
148 return error; 160 return error;
@@ -159,6 +171,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
159 if (error) 171 if (error)
160 return; 172 return;
161 rc->rc_id = rc_in->rc_id; 173 rc->rc_id = rc_in->rc_id;
174 rc->rc_seq_reply = rc_in->rc_seq;
162 rc->rc_result = dlm_recover_status(ls); 175 rc->rc_result = dlm_recover_status(ls);
163 make_config(ls, (struct rcom_config *) rc->rc_buf); 176 make_config(ls, (struct rcom_config *) rc->rc_buf);
164 177
@@ -200,7 +213,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
200 if (nodeid == dlm_our_nodeid()) { 213 if (nodeid == dlm_our_nodeid()) {
201 dlm_copy_master_names(ls, last_name, last_len, 214 dlm_copy_master_names(ls, last_name, last_len,
202 ls->ls_recover_buf + len, 215 ls->ls_recover_buf + len,
203 dlm_config.buffer_size - len, nodeid); 216 dlm_config.ci_buffer_size - len, nodeid);
204 goto out; 217 goto out;
205 } 218 }
206 219
@@ -210,7 +223,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
210 memcpy(rc->rc_buf, last_name, last_len); 223 memcpy(rc->rc_buf, last_name, last_len);
211 224
212 allow_sync_reply(ls, &rc->rc_id); 225 allow_sync_reply(ls, &rc->rc_id);
213 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); 226 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
214 227
215 send_rcom(ls, mh, rc); 228 send_rcom(ls, mh, rc);
216 229
@@ -224,30 +237,17 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
224{ 237{
225 struct dlm_rcom *rc; 238 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh; 239 struct dlm_mhandle *mh;
227 int error, inlen, outlen; 240 int error, inlen, outlen, nodeid;
228 int nodeid = rc_in->rc_header.h_nodeid;
229 uint32_t status = dlm_recover_status(ls);
230
231 /*
232 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
233 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
234 * It could only happen in rare cases where we get a late NAMES
235 * message from a previous instance of recovery.
236 */
237
238 if (!(status & DLM_RS_NODES)) {
239 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
240 return;
241 }
242 241
243 nodeid = rc_in->rc_header.h_nodeid; 242 nodeid = rc_in->rc_header.h_nodeid;
244 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); 243 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
245 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); 244 outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
246 245
247 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); 246 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
248 if (error) 247 if (error)
249 return; 248 return;
250 rc->rc_id = rc_in->rc_id; 249 rc->rc_id = rc_in->rc_id;
250 rc->rc_seq_reply = rc_in->rc_seq;
251 251
252 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, 252 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
253 nodeid); 253 nodeid);
@@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
294 ret_nodeid = error; 294 ret_nodeid = error;
295 rc->rc_result = ret_nodeid; 295 rc->rc_result = ret_nodeid;
296 rc->rc_id = rc_in->rc_id; 296 rc->rc_id = rc_in->rc_id;
297 rc->rc_seq_reply = rc_in->rc_seq;
297 298
298 send_rcom(ls, mh, rc); 299 send_rcom(ls, mh, rc);
299} 300}
@@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
375 376
376 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); 377 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
377 rc->rc_id = rc_in->rc_id; 378 rc->rc_id = rc_in->rc_id;
379 rc->rc_seq_reply = rc_in->rc_seq;
378 380
379 send_rcom(ls, mh, rc); 381 send_rcom(ls, mh, rc);
380} 382}
381 383
382static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) 384static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
383{ 385{
384 uint32_t status = dlm_recover_status(ls);
385
386 if (!(status & DLM_RS_DIR)) {
387 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
388 rc_in->rc_header.h_nodeid);
389 return;
390 }
391
392 dlm_recover_process_copy(ls, rc_in); 386 dlm_recover_process_copy(ls, rc_in);
393} 387}
394 388
@@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
415 409
416 rc->rc_type = DLM_RCOM_STATUS_REPLY; 410 rc->rc_type = DLM_RCOM_STATUS_REPLY;
417 rc->rc_id = rc_in->rc_id; 411 rc->rc_id = rc_in->rc_id;
412 rc->rc_seq_reply = rc_in->rc_seq;
418 rc->rc_result = -ESRCH; 413 rc->rc_result = -ESRCH;
419 414
420 rf = (struct rcom_config *) rc->rc_buf; 415 rf = (struct rcom_config *) rc->rc_buf;
@@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
426 return 0; 421 return 0;
427} 422}
428 423
424static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
425{
426 uint64_t seq;
427 int rv = 0;
428
429 switch (rc->rc_type) {
430 case DLM_RCOM_STATUS_REPLY:
431 case DLM_RCOM_NAMES_REPLY:
432 case DLM_RCOM_LOOKUP_REPLY:
433 case DLM_RCOM_LOCK_REPLY:
434 spin_lock(&ls->ls_recover_lock);
435 seq = ls->ls_recover_seq;
436 spin_unlock(&ls->ls_recover_lock);
437 if (rc->rc_seq_reply != seq) {
438 log_debug(ls, "ignoring old reply %x from %d "
439 "seq_reply %llx expect %llx",
440 rc->rc_type, rc->rc_header.h_nodeid,
441 (unsigned long long)rc->rc_seq_reply,
442 (unsigned long long)seq);
443 rv = 1;
444 }
445 }
446 return rv;
447}
448
429/* Called by dlm_recvd; corresponds to dlm_receive_message() but special 449/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
430 recovery-only comms are sent through here. */ 450 recovery-only comms are sent through here. */
431 451
@@ -449,11 +469,14 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
449 } 469 }
450 470
451 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { 471 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
452 log_error(ls, "ignoring recovery message %x from %d", 472 log_debug(ls, "ignoring recovery message %x from %d",
453 rc->rc_type, nodeid); 473 rc->rc_type, nodeid);
454 goto out; 474 goto out;
455 } 475 }
456 476
477 if (is_old_reply(ls, rc))
478 goto out;
479
457 if (nodeid != rc->rc_header.h_nodeid) { 480 if (nodeid != rc->rc_header.h_nodeid) {
458 log_error(ls, "bad rcom nodeid %d from %d", 481 log_error(ls, "bad rcom nodeid %d from %d",
459 rc->rc_header.h_nodeid, nodeid); 482 rc->rc_header.h_nodeid, nodeid);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index cf9f6831bab5..c2cc7694cd16 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -44,7 +44,7 @@
44static void dlm_wait_timer_fn(unsigned long data) 44static void dlm_wait_timer_fn(unsigned long data)
45{ 45{
46 struct dlm_ls *ls = (struct dlm_ls *) data; 46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ)); 47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ));
48 wake_up(&ls->ls_wait_general); 48 wake_up(&ls->ls_wait_general);
49} 49}
50 50
@@ -55,7 +55,7 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
55 init_timer(&ls->ls_timer); 55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn; 56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls; 57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ); 58 ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ);
59 add_timer(&ls->ls_timer); 59 add_timer(&ls->ls_timer);
60 60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls)); 61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
@@ -397,7 +397,9 @@ int dlm_recover_masters(struct dlm_ls *ls)
397 397
398 if (dlm_no_directory(ls)) 398 if (dlm_no_directory(ls))
399 count += recover_master_static(r); 399 count += recover_master_static(r);
400 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) { 400 else if (!is_master(r) &&
401 (dlm_is_removed(ls, r->res_nodeid) ||
402 rsb_flag(r, RSB_NEW_MASTER))) {
401 recover_master(r); 403 recover_master(r);
402 count++; 404 count++;
403 } 405 }
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 650536aa5139..3cb636d60249 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -77,7 +77,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
77 77
78 error = dlm_recover_members(ls, rv, &neg); 78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) { 79 if (error) {
80 log_error(ls, "recover_members failed %d", error); 80 log_debug(ls, "recover_members failed %d", error);
81 goto fail; 81 goto fail;
82 } 82 }
83 start = jiffies; 83 start = jiffies;
@@ -89,7 +89,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
89 89
90 error = dlm_recover_directory(ls); 90 error = dlm_recover_directory(ls);
91 if (error) { 91 if (error) {
92 log_error(ls, "recover_directory failed %d", error); 92 log_debug(ls, "recover_directory failed %d", error);
93 goto fail; 93 goto fail;
94 } 94 }
95 95
@@ -99,7 +99,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
99 99
100 error = dlm_recover_directory_wait(ls); 100 error = dlm_recover_directory_wait(ls);
101 if (error) { 101 if (error) {
102 log_error(ls, "recover_directory_wait failed %d", error); 102 log_debug(ls, "recover_directory_wait failed %d", error);
103 goto fail; 103 goto fail;
104 } 104 }
105 105
@@ -129,7 +129,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
129 129
130 error = dlm_recover_masters(ls); 130 error = dlm_recover_masters(ls);
131 if (error) { 131 if (error) {
132 log_error(ls, "recover_masters failed %d", error); 132 log_debug(ls, "recover_masters failed %d", error);
133 goto fail; 133 goto fail;
134 } 134 }
135 135
@@ -139,13 +139,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
139 139
140 error = dlm_recover_locks(ls); 140 error = dlm_recover_locks(ls);
141 if (error) { 141 if (error) {
142 log_error(ls, "recover_locks failed %d", error); 142 log_debug(ls, "recover_locks failed %d", error);
143 goto fail; 143 goto fail;
144 } 144 }
145 145
146 error = dlm_recover_locks_wait(ls); 146 error = dlm_recover_locks_wait(ls);
147 if (error) { 147 if (error) {
148 log_error(ls, "recover_locks_wait failed %d", error); 148 log_debug(ls, "recover_locks_wait failed %d", error);
149 goto fail; 149 goto fail;
150 } 150 }
151 151
@@ -166,7 +166,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
166 166
167 error = dlm_recover_locks_wait(ls); 167 error = dlm_recover_locks_wait(ls);
168 if (error) { 168 if (error) {
169 log_error(ls, "recover_locks_wait failed %d", error); 169 log_debug(ls, "recover_locks_wait failed %d", error);
170 goto fail; 170 goto fail;
171 } 171 }
172 } 172 }
@@ -184,7 +184,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
184 dlm_set_recover_status(ls, DLM_RS_DONE); 184 dlm_set_recover_status(ls, DLM_RS_DONE);
185 error = dlm_recover_done_wait(ls); 185 error = dlm_recover_done_wait(ls);
186 if (error) { 186 if (error) {
187 log_error(ls, "recover_done_wait failed %d", error); 187 log_debug(ls, "recover_done_wait failed %d", error);
188 goto fail; 188 goto fail;
189 } 189 }
190 190
@@ -192,19 +192,19 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
192 192
193 error = enable_locking(ls, rv->seq); 193 error = enable_locking(ls, rv->seq);
194 if (error) { 194 if (error) {
195 log_error(ls, "enable_locking failed %d", error); 195 log_debug(ls, "enable_locking failed %d", error);
196 goto fail; 196 goto fail;
197 } 197 }
198 198
199 error = dlm_process_requestqueue(ls); 199 error = dlm_process_requestqueue(ls);
200 if (error) { 200 if (error) {
201 log_error(ls, "process_requestqueue failed %d", error); 201 log_debug(ls, "process_requestqueue failed %d", error);
202 goto fail; 202 goto fail;
203 } 203 }
204 204
205 error = dlm_recover_waiters_post(ls); 205 error = dlm_recover_waiters_post(ls);
206 if (error) { 206 if (error) {
207 log_error(ls, "recover_waiters_post failed %d", error); 207 log_debug(ls, "recover_waiters_post failed %d", error);
208 goto fail; 208 goto fail;
209 } 209 }
210 210
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index c37e93e4f2df..d378b7fe2a1e 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -180,6 +180,14 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue)) 180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1; 181 remove_ownqueue = 1;
182 182
183 /* unlocks or cancels of waiting requests need to be removed from the
184 proc's unlocking list, again there must be a better way... */
185
186 if (ua->lksb.sb_status == -DLM_EUNLOCK ||
187 (ua->lksb.sb_status == -DLM_ECANCEL &&
188 lkb->lkb_grmode == DLM_LOCK_IV))
189 remove_ownqueue = 1;
190
183 /* We want to copy the lvb to userspace when the completion 191 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and 192 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock() 193 lvb_ops says we should. We could probably have set_lvb_lock()
@@ -523,6 +531,7 @@ static int device_open(struct inode *inode, struct file *file)
523 proc->lockspace = ls->ls_local_handle; 531 proc->lockspace = ls->ls_local_handle;
524 INIT_LIST_HEAD(&proc->asts); 532 INIT_LIST_HEAD(&proc->asts);
525 INIT_LIST_HEAD(&proc->locks); 533 INIT_LIST_HEAD(&proc->locks);
534 INIT_LIST_HEAD(&proc->unlocking);
526 spin_lock_init(&proc->asts_spin); 535 spin_lock_init(&proc->asts_spin);
527 spin_lock_init(&proc->locks_spin); 536 spin_lock_init(&proc->locks_spin);
528 init_waitqueue_head(&proc->wait); 537 init_waitqueue_head(&proc->wait);
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 767197db9944..963889cf6740 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc)
134 rc->rc_type = cpu_to_le32(rc->rc_type); 134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result); 135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id); 136 rc->rc_id = cpu_to_le64(rc->rc_id);
137 rc->rc_seq = cpu_to_le64(rc->rc_seq);
138 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
137 139
138 if (type == DLM_RCOM_LOCK) 140 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf); 141 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
@@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc)
151 rc->rc_type = le32_to_cpu(rc->rc_type); 153 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result); 154 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id); 155 rc->rc_id = le64_to_cpu(rc->rc_id);
156 rc->rc_seq = le64_to_cpu(rc->rc_seq);
157 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
154 158
155 if (rc->rc_type == DLM_RCOM_LOCK) 159 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf); 160 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 6a2ffa2db14f..de8e64c03f73 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,44 +4,43 @@ config GFS2_FS
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
7 A cluster filesystem. 7 A cluster filesystem.
8 8
9 Allows a cluster of computers to simultaneously use a block device 9 Allows a cluster of computers to simultaneously use a block device
10 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads 10 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
11 and writes to the block device like a local filesystem, but also uses 11 and writes to the block device like a local filesystem, but also uses
12 a lock module to allow the computers coordinate their I/O so 12 a lock module to allow the computers coordinate their I/O so
13 filesystem consistency is maintained. One of the nifty features of 13 filesystem consistency is maintained. One of the nifty features of
14 GFS is perfect consistency -- changes made to the filesystem on one 14 GFS is perfect consistency -- changes made to the filesystem on one
15 machine show up immediately on all other machines in the cluster. 15 machine show up immediately on all other machines in the cluster.
16 16
17 To use the GFS2 filesystem, you will need to enable one or more of 17 To use the GFS2 filesystem, you will need to enable one or more of
18 the below locking modules. Documentation and utilities for GFS2 can 18 the below locking modules. Documentation and utilities for GFS2 can
19 be found here: http://sources.redhat.com/cluster 19 be found here: http://sources.redhat.com/cluster
20 20
21config GFS2_FS_LOCKING_NOLOCK 21config GFS2_FS_LOCKING_NOLOCK
22 tristate "GFS2 \"nolock\" locking module" 22 tristate "GFS2 \"nolock\" locking module"
23 depends on GFS2_FS 23 depends on GFS2_FS
24 help 24 help
25 Single node locking module for GFS2. 25 Single node locking module for GFS2.
26 26
27 Use this module if you want to use GFS2 on a single node without 27 Use this module if you want to use GFS2 on a single node without
28 its clustering features. You can still take advantage of the 28 its clustering features. You can still take advantage of the
29 large file support, and upgrade to running a full cluster later on 29 large file support, and upgrade to running a full cluster later on
30 if required. 30 if required.
31 31
32 If you will only be using GFS2 in cluster mode, you do not need this 32 If you will only be using GFS2 in cluster mode, you do not need this
33 module. 33 module.
34 34
35config GFS2_FS_LOCKING_DLM 35config GFS2_FS_LOCKING_DLM
36 tristate "GFS2 DLM locking module" 36 tristate "GFS2 DLM locking module"
37 depends on GFS2_FS && NET && INET && (IPV6 || IPV6=n) 37 depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n)
38 select IP_SCTP if DLM_SCTP 38 select IP_SCTP if DLM_SCTP
39 select CONFIGFS_FS 39 select CONFIGFS_FS
40 select DLM 40 select DLM
41 help 41 help
42 Multiple node locking module for GFS2 42 Multiple node locking module for GFS2
43
44 Most users of GFS2 will require this module. It provides the locking
45 interface between GFS2 and the DLM, which is required to use GFS2
46 in a cluster environment.
47 43
44 Most users of GFS2 will require this module. It provides the locking
45 interface between GFS2 and the DLM, which is required to use GFS2
46 in a cluster environment.
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8240c1ff94f4..113f6c9110c7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -773,7 +773,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
773 gfs2_free_data(ip, bstart, blen); 773 gfs2_free_data(ip, bstart, blen);
774 } 774 }
775 775
776 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 776 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
777 777
778 gfs2_dinode_out(ip, dibh->b_data); 778 gfs2_dinode_out(ip, dibh->b_data);
779 779
@@ -848,7 +848,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
848 } 848 }
849 849
850 ip->i_di.di_size = size; 850 ip->i_di.di_size = size;
851 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 851 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
852 852
853 error = gfs2_meta_inode_buffer(ip, &dibh); 853 error = gfs2_meta_inode_buffer(ip, &dibh);
854 if (error) 854 if (error)
@@ -963,7 +963,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
963 963
964 if (gfs2_is_stuffed(ip)) { 964 if (gfs2_is_stuffed(ip)) {
965 ip->i_di.di_size = size; 965 ip->i_di.di_size = size;
966 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 966 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
967 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 967 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
968 gfs2_dinode_out(ip, dibh->b_data); 968 gfs2_dinode_out(ip, dibh->b_data);
969 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 969 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
@@ -975,7 +975,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
975 975
976 if (!error) { 976 if (!error) {
977 ip->i_di.di_size = size; 977 ip->i_di.di_size = size;
978 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 978 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
979 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 979 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
980 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 980 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
981 gfs2_dinode_out(ip, dibh->b_data); 981 gfs2_dinode_out(ip, dibh->b_data);
@@ -1048,7 +1048,7 @@ static int trunc_end(struct gfs2_inode *ip)
1048 ip->i_num.no_addr; 1048 ip->i_num.no_addr;
1049 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1049 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1050 } 1050 }
1051 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 1051 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
1052 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1052 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1053 1053
1054 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1054 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 0fdcb7713cd9..c93ca8f361b5 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -131,7 +131,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size) 132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size; 133 ip->i_di.di_size = offset + size;
134 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 134 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
135 gfs2_dinode_out(ip, dibh->b_data); 135 gfs2_dinode_out(ip, dibh->b_data);
136 136
137 brelse(dibh); 137 brelse(dibh);
@@ -229,7 +229,7 @@ out:
229 229
230 if (ip->i_di.di_size < offset + copied) 230 if (ip->i_di.di_size < offset + copied)
231 ip->i_di.di_size = offset + copied; 231 ip->i_di.di_size = offset + copied;
232 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 232 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
233 233
234 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 234 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
235 gfs2_dinode_out(ip, dibh->b_data); 235 gfs2_dinode_out(ip, dibh->b_data);
@@ -1198,12 +1198,11 @@ static int compare_dents(const void *a, const void *b)
1198 */ 1198 */
1199 1199
1200static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, 1200static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1201 void *opaque, gfs2_filldir_t filldir, 1201 void *opaque, filldir_t filldir,
1202 const struct gfs2_dirent **darr, u32 entries, 1202 const struct gfs2_dirent **darr, u32 entries,
1203 int *copied) 1203 int *copied)
1204{ 1204{
1205 const struct gfs2_dirent *dent, *dent_next; 1205 const struct gfs2_dirent *dent, *dent_next;
1206 struct gfs2_inum_host inum;
1207 u64 off, off_next; 1206 u64 off, off_next;
1208 unsigned int x, y; 1207 unsigned int x, y;
1209 int run = 0; 1208 int run = 0;
@@ -1240,11 +1239,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1240 *offset = off; 1239 *offset = off;
1241 } 1240 }
1242 1241
1243 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1244
1245 error = filldir(opaque, (const char *)(dent + 1), 1242 error = filldir(opaque, (const char *)(dent + 1),
1246 be16_to_cpu(dent->de_name_len), 1243 be16_to_cpu(dent->de_name_len),
1247 off, &inum, 1244 off, be64_to_cpu(dent->de_inum.no_addr),
1248 be16_to_cpu(dent->de_type)); 1245 be16_to_cpu(dent->de_type));
1249 if (error) 1246 if (error)
1250 return 1; 1247 return 1;
@@ -1262,8 +1259,8 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1262} 1259}
1263 1260
1264static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1261static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1265 gfs2_filldir_t filldir, int *copied, 1262 filldir_t filldir, int *copied, unsigned *depth,
1266 unsigned *depth, u64 leaf_no) 1263 u64 leaf_no)
1267{ 1264{
1268 struct gfs2_inode *ip = GFS2_I(inode); 1265 struct gfs2_inode *ip = GFS2_I(inode);
1269 struct buffer_head *bh; 1266 struct buffer_head *bh;
@@ -1343,7 +1340,7 @@ out:
1343 */ 1340 */
1344 1341
1345static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, 1342static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1346 gfs2_filldir_t filldir) 1343 filldir_t filldir)
1347{ 1344{
1348 struct gfs2_inode *dip = GFS2_I(inode); 1345 struct gfs2_inode *dip = GFS2_I(inode);
1349 struct gfs2_sbd *sdp = GFS2_SB(inode); 1346 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1402,7 +1399,7 @@ out:
1402} 1399}
1403 1400
1404int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 1401int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1405 gfs2_filldir_t filldir) 1402 filldir_t filldir)
1406{ 1403{
1407 struct gfs2_inode *dip = GFS2_I(inode); 1404 struct gfs2_inode *dip = GFS2_I(inode);
1408 struct dirent_gather g; 1405 struct dirent_gather g;
@@ -1568,7 +1565,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1568 break; 1565 break;
1569 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1566 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1570 ip->i_di.di_entries++; 1567 ip->i_di.di_entries++;
1571 ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); 1568 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC;
1572 gfs2_dinode_out(ip, bh->b_data); 1569 gfs2_dinode_out(ip, bh->b_data);
1573 brelse(bh); 1570 brelse(bh);
1574 error = 0; 1571 error = 0;
@@ -1654,7 +1651,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1654 gfs2_consist_inode(dip); 1651 gfs2_consist_inode(dip);
1655 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1652 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1656 dip->i_di.di_entries--; 1653 dip->i_di.di_entries--;
1657 dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); 1654 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC;
1658 gfs2_dinode_out(dip, bh->b_data); 1655 gfs2_dinode_out(dip, bh->b_data);
1659 brelse(bh); 1656 brelse(bh);
1660 mark_inode_dirty(&dip->i_inode); 1657 mark_inode_dirty(&dip->i_inode);
@@ -1702,7 +1699,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1702 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1699 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1703 } 1700 }
1704 1701
1705 dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); 1702 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC;
1706 gfs2_dinode_out(dip, bh->b_data); 1703 gfs2_dinode_out(dip, bh->b_data);
1707 brelse(bh); 1704 brelse(bh);
1708 return 0; 1705 return 0;
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index b21b33668a5b..48fe89046bba 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,30 +16,13 @@ struct inode;
16struct gfs2_inode; 16struct gfs2_inode;
17struct gfs2_inum; 17struct gfs2_inum;
18 18
19/**
20 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
21 * @opaque: opaque data used by the function
22 * @name: the name of the directory entry
23 * @length: the length of the name
24 * @offset: the entry's offset in the directory
25 * @inum: the inode number the entry points to
26 * @type: the type of inode the entry points to
27 *
28 * Returns: 0 on success, 1 if buffer full
29 */
30
31typedef int (*gfs2_filldir_t) (void *opaque,
32 const char *name, unsigned int length,
33 u64 offset,
34 struct gfs2_inum_host *inum, unsigned int type);
35
36int gfs2_dir_search(struct inode *dir, const struct qstr *filename, 19int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
37 struct gfs2_inum_host *inum, unsigned int *type); 20 struct gfs2_inum_host *inum, unsigned int *type);
38int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 21int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
39 const struct gfs2_inum_host *inum, unsigned int type); 22 const struct gfs2_inum_host *inum, unsigned int type);
40int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 23int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
41int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque, 24int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
42 gfs2_filldir_t filldir); 25 filldir_t filldir);
43int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 26int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
44 struct gfs2_inum_host *new_inum, unsigned int new_type); 27 struct gfs2_inum_host *new_inum, unsigned int new_type);
45 28
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index ebebbdcd7057..0c83c7f4dda8 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -301,7 +301,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
301 301
302 error = gfs2_meta_inode_buffer(ip, &dibh); 302 error = gfs2_meta_inode_buffer(ip, &dibh);
303 if (!error) { 303 if (!error) {
304 ip->i_inode.i_ctime.tv_sec = get_seconds(); 304 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
305 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 305 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
306 gfs2_dinode_out(ip, dibh->b_data); 306 gfs2_dinode_out(ip, dibh->b_data);
307 brelse(dibh); 307 brelse(dibh);
@@ -718,7 +718,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
718 (er->er_mode & S_IFMT)); 718 (er->er_mode & S_IFMT));
719 ip->i_inode.i_mode = er->er_mode; 719 ip->i_inode.i_mode = er->er_mode;
720 } 720 }
721 ip->i_inode.i_ctime.tv_sec = get_seconds(); 721 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
722 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 722 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
723 gfs2_dinode_out(ip, dibh->b_data); 723 gfs2_dinode_out(ip, dibh->b_data);
724 brelse(dibh); 724 brelse(dibh);
@@ -853,7 +853,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
853 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); 853 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
854 ip->i_inode.i_mode = er->er_mode; 854 ip->i_inode.i_mode = er->er_mode;
855 } 855 }
856 ip->i_inode.i_ctime.tv_sec = get_seconds(); 856 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
857 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 857 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
858 gfs2_dinode_out(ip, dibh->b_data); 858 gfs2_dinode_out(ip, dibh->b_data);
859 brelse(dibh); 859 brelse(dibh);
@@ -1134,7 +1134,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1134 1134
1135 error = gfs2_meta_inode_buffer(ip, &dibh); 1135 error = gfs2_meta_inode_buffer(ip, &dibh);
1136 if (!error) { 1136 if (!error) {
1137 ip->i_inode.i_ctime.tv_sec = get_seconds(); 1137 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
1138 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1138 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1139 gfs2_dinode_out(ip, dibh->b_data); 1139 gfs2_dinode_out(ip, dibh->b_data);
1140 brelse(dibh); 1140 brelse(dibh);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 438146904b58..6618c1190252 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,6 +19,8 @@
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
22#include <linux/wait.h>
23#include <linux/rwsem.h>
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23 25
24#include "gfs2.h" 26#include "gfs2.h"
@@ -33,11 +35,6 @@
33#include "super.h" 35#include "super.h"
34#include "util.h" 36#include "util.h"
35 37
36struct greedy {
37 struct gfs2_holder gr_gh;
38 struct delayed_work gr_work;
39};
40
41struct gfs2_gl_hash_bucket { 38struct gfs2_gl_hash_bucket {
42 struct hlist_head hb_list; 39 struct hlist_head hb_list;
43}; 40};
@@ -47,6 +44,9 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl);
47static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); 44static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
48static int dump_glock(struct gfs2_glock *gl); 45static int dump_glock(struct gfs2_glock *gl);
49static int dump_inode(struct gfs2_inode *ip); 46static int dump_inode(struct gfs2_inode *ip);
47static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
48static void gfs2_glock_drop_th(struct gfs2_glock *gl);
49static DECLARE_RWSEM(gfs2_umount_flush_sem);
50 50
51#define GFS2_GL_HASH_SHIFT 15 51#define GFS2_GL_HASH_SHIFT 15
52#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 52#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -213,30 +213,6 @@ out:
213} 213}
214 214
215/** 215/**
216 * queue_empty - check to see if a glock's queue is empty
217 * @gl: the glock
218 * @head: the head of the queue to check
219 *
220 * This function protects the list in the event that a process already
221 * has a holder on the list and is adding a second holder for itself.
222 * The glmutex lock is what generally prevents processes from working
223 * on the same glock at once, but the special case of adding a second
224 * holder for yourself ("recursive" locking) doesn't involve locking
225 * glmutex, making the spin lock necessary.
226 *
227 * Returns: 1 if the queue is empty
228 */
229
230static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
231{
232 int empty;
233 spin_lock(&gl->gl_spin);
234 empty = list_empty(head);
235 spin_unlock(&gl->gl_spin);
236 return empty;
237}
238
239/**
240 * search_bucket() - Find struct gfs2_glock by lock number 216 * search_bucket() - Find struct gfs2_glock by lock number
241 * @bucket: the bucket to search 217 * @bucket: the bucket to search
242 * @name: The lock name 218 * @name: The lock name
@@ -395,11 +371,6 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
395 gh->gh_flags = flags; 371 gh->gh_flags = flags;
396 gh->gh_error = 0; 372 gh->gh_error = 0;
397 gh->gh_iflags = 0; 373 gh->gh_iflags = 0;
398 init_completion(&gh->gh_wait);
399
400 if (gh->gh_state == LM_ST_EXCLUSIVE)
401 gh->gh_flags |= GL_LOCAL_EXCL;
402
403 gfs2_glock_hold(gl); 374 gfs2_glock_hold(gl);
404} 375}
405 376
@@ -417,9 +388,6 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
417{ 388{
418 gh->gh_state = state; 389 gh->gh_state = state;
419 gh->gh_flags = flags; 390 gh->gh_flags = flags;
420 if (gh->gh_state == LM_ST_EXCLUSIVE)
421 gh->gh_flags |= GL_LOCAL_EXCL;
422
423 gh->gh_iflags &= 1 << HIF_ALLOCED; 391 gh->gh_iflags &= 1 << HIF_ALLOCED;
424 gh->gh_ip = (unsigned long)__builtin_return_address(0); 392 gh->gh_ip = (unsigned long)__builtin_return_address(0);
425} 393}
@@ -479,6 +447,29 @@ static void gfs2_holder_put(struct gfs2_holder *gh)
479 kfree(gh); 447 kfree(gh);
480} 448}
481 449
450static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
451{
452 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
453 gfs2_holder_put(gh);
454 return;
455 }
456 clear_bit(HIF_WAIT, &gh->gh_iflags);
457 smp_mb();
458 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
459}
460
461static int holder_wait(void *word)
462{
463 schedule();
464 return 0;
465}
466
467static void wait_on_holder(struct gfs2_holder *gh)
468{
469 might_sleep();
470 wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE);
471}
472
482/** 473/**
483 * rq_mutex - process a mutex request in the queue 474 * rq_mutex - process a mutex request in the queue
484 * @gh: the glock holder 475 * @gh: the glock holder
@@ -493,7 +484,9 @@ static int rq_mutex(struct gfs2_holder *gh)
493 list_del_init(&gh->gh_list); 484 list_del_init(&gh->gh_list);
494 /* gh->gh_error never examined. */ 485 /* gh->gh_error never examined. */
495 set_bit(GLF_LOCK, &gl->gl_flags); 486 set_bit(GLF_LOCK, &gl->gl_flags);
496 complete(&gh->gh_wait); 487 clear_bit(HIF_WAIT, &gh->gh_iflags);
488 smp_mb();
489 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
497 490
498 return 1; 491 return 1;
499} 492}
@@ -511,7 +504,6 @@ static int rq_promote(struct gfs2_holder *gh)
511{ 504{
512 struct gfs2_glock *gl = gh->gh_gl; 505 struct gfs2_glock *gl = gh->gh_gl;
513 struct gfs2_sbd *sdp = gl->gl_sbd; 506 struct gfs2_sbd *sdp = gl->gl_sbd;
514 const struct gfs2_glock_operations *glops = gl->gl_ops;
515 507
516 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { 508 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
517 if (list_empty(&gl->gl_holders)) { 509 if (list_empty(&gl->gl_holders)) {
@@ -526,7 +518,7 @@ static int rq_promote(struct gfs2_holder *gh)
526 gfs2_reclaim_glock(sdp); 518 gfs2_reclaim_glock(sdp);
527 } 519 }
528 520
529 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); 521 gfs2_glock_xmote_th(gh);
530 spin_lock(&gl->gl_spin); 522 spin_lock(&gl->gl_spin);
531 } 523 }
532 return 1; 524 return 1;
@@ -537,11 +529,11 @@ static int rq_promote(struct gfs2_holder *gh)
537 set_bit(GLF_LOCK, &gl->gl_flags); 529 set_bit(GLF_LOCK, &gl->gl_flags);
538 } else { 530 } else {
539 struct gfs2_holder *next_gh; 531 struct gfs2_holder *next_gh;
540 if (gh->gh_flags & GL_LOCAL_EXCL) 532 if (gh->gh_state == LM_ST_EXCLUSIVE)
541 return 1; 533 return 1;
542 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, 534 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
543 gh_list); 535 gh_list);
544 if (next_gh->gh_flags & GL_LOCAL_EXCL) 536 if (next_gh->gh_state == LM_ST_EXCLUSIVE)
545 return 1; 537 return 1;
546 } 538 }
547 539
@@ -549,7 +541,7 @@ static int rq_promote(struct gfs2_holder *gh)
549 gh->gh_error = 0; 541 gh->gh_error = 0;
550 set_bit(HIF_HOLDER, &gh->gh_iflags); 542 set_bit(HIF_HOLDER, &gh->gh_iflags);
551 543
552 complete(&gh->gh_wait); 544 gfs2_holder_dispose_or_wake(gh);
553 545
554 return 0; 546 return 0;
555} 547}
@@ -564,7 +556,6 @@ static int rq_promote(struct gfs2_holder *gh)
564static int rq_demote(struct gfs2_holder *gh) 556static int rq_demote(struct gfs2_holder *gh)
565{ 557{
566 struct gfs2_glock *gl = gh->gh_gl; 558 struct gfs2_glock *gl = gh->gh_gl;
567 const struct gfs2_glock_operations *glops = gl->gl_ops;
568 559
569 if (!list_empty(&gl->gl_holders)) 560 if (!list_empty(&gl->gl_holders))
570 return 1; 561 return 1;
@@ -573,10 +564,7 @@ static int rq_demote(struct gfs2_holder *gh)
573 list_del_init(&gh->gh_list); 564 list_del_init(&gh->gh_list);
574 gh->gh_error = 0; 565 gh->gh_error = 0;
575 spin_unlock(&gl->gl_spin); 566 spin_unlock(&gl->gl_spin);
576 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) 567 gfs2_holder_dispose_or_wake(gh);
577 gfs2_holder_put(gh);
578 else
579 complete(&gh->gh_wait);
580 spin_lock(&gl->gl_spin); 568 spin_lock(&gl->gl_spin);
581 } else { 569 } else {
582 gl->gl_req_gh = gh; 570 gl->gl_req_gh = gh;
@@ -585,9 +573,9 @@ static int rq_demote(struct gfs2_holder *gh)
585 573
586 if (gh->gh_state == LM_ST_UNLOCKED || 574 if (gh->gh_state == LM_ST_UNLOCKED ||
587 gl->gl_state != LM_ST_EXCLUSIVE) 575 gl->gl_state != LM_ST_EXCLUSIVE)
588 glops->go_drop_th(gl); 576 gfs2_glock_drop_th(gl);
589 else 577 else
590 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); 578 gfs2_glock_xmote_th(gh);
591 579
592 spin_lock(&gl->gl_spin); 580 spin_lock(&gl->gl_spin);
593 } 581 }
@@ -596,30 +584,6 @@ static int rq_demote(struct gfs2_holder *gh)
596} 584}
597 585
598/** 586/**
599 * rq_greedy - process a queued request to drop greedy status
600 * @gh: the glock holder
601 *
602 * Returns: 1 if the queue is blocked
603 */
604
605static int rq_greedy(struct gfs2_holder *gh)
606{
607 struct gfs2_glock *gl = gh->gh_gl;
608
609 list_del_init(&gh->gh_list);
610 /* gh->gh_error never examined. */
611 clear_bit(GLF_GREEDY, &gl->gl_flags);
612 spin_unlock(&gl->gl_spin);
613
614 gfs2_holder_uninit(gh);
615 kfree(container_of(gh, struct greedy, gr_gh));
616
617 spin_lock(&gl->gl_spin);
618
619 return 0;
620}
621
622/**
623 * run_queue - process holder structures on a glock 587 * run_queue - process holder structures on a glock
624 * @gl: the glock 588 * @gl: the glock
625 * 589 *
@@ -649,8 +613,6 @@ static void run_queue(struct gfs2_glock *gl)
649 613
650 if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) 614 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
651 blocked = rq_demote(gh); 615 blocked = rq_demote(gh);
652 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
653 blocked = rq_greedy(gh);
654 else 616 else
655 gfs2_assert_warn(gl->gl_sbd, 0); 617 gfs2_assert_warn(gl->gl_sbd, 0);
656 618
@@ -684,6 +646,8 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
684 646
685 gfs2_holder_init(gl, 0, 0, &gh); 647 gfs2_holder_init(gl, 0, 0, &gh);
686 set_bit(HIF_MUTEX, &gh.gh_iflags); 648 set_bit(HIF_MUTEX, &gh.gh_iflags);
649 if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
650 BUG();
687 651
688 spin_lock(&gl->gl_spin); 652 spin_lock(&gl->gl_spin);
689 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 653 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
@@ -691,11 +655,13 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
691 } else { 655 } else {
692 gl->gl_owner = current; 656 gl->gl_owner = current;
693 gl->gl_ip = (unsigned long)__builtin_return_address(0); 657 gl->gl_ip = (unsigned long)__builtin_return_address(0);
694 complete(&gh.gh_wait); 658 clear_bit(HIF_WAIT, &gh.gh_iflags);
659 smp_mb();
660 wake_up_bit(&gh.gh_iflags, HIF_WAIT);
695 } 661 }
696 spin_unlock(&gl->gl_spin); 662 spin_unlock(&gl->gl_spin);
697 663
698 wait_for_completion(&gh.gh_wait); 664 wait_on_holder(&gh);
699 gfs2_holder_uninit(&gh); 665 gfs2_holder_uninit(&gh);
700} 666}
701 667
@@ -774,6 +740,7 @@ restart:
774 return; 740 return;
775 set_bit(HIF_DEMOTE, &new_gh->gh_iflags); 741 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
776 set_bit(HIF_DEALLOC, &new_gh->gh_iflags); 742 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
743 set_bit(HIF_WAIT, &new_gh->gh_iflags);
777 744
778 goto restart; 745 goto restart;
779 } 746 }
@@ -825,7 +792,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
825 int op_done = 1; 792 int op_done = 1;
826 793
827 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 794 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
828 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); 795 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
829 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); 796 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
830 797
831 state_change(gl, ret & LM_OUT_ST_MASK); 798 state_change(gl, ret & LM_OUT_ST_MASK);
@@ -908,12 +875,8 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
908 875
909 gfs2_glock_put(gl); 876 gfs2_glock_put(gl);
910 877
911 if (gh) { 878 if (gh)
912 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) 879 gfs2_holder_dispose_or_wake(gh);
913 gfs2_holder_put(gh);
914 else
915 complete(&gh->gh_wait);
916 }
917} 880}
918 881
919/** 882/**
@@ -924,23 +887,26 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
924 * 887 *
925 */ 888 */
926 889
927void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags) 890void gfs2_glock_xmote_th(struct gfs2_holder *gh)
928{ 891{
892 struct gfs2_glock *gl = gh->gh_gl;
929 struct gfs2_sbd *sdp = gl->gl_sbd; 893 struct gfs2_sbd *sdp = gl->gl_sbd;
894 int flags = gh->gh_flags;
895 unsigned state = gh->gh_state;
930 const struct gfs2_glock_operations *glops = gl->gl_ops; 896 const struct gfs2_glock_operations *glops = gl->gl_ops;
931 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | 897 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
932 LM_FLAG_NOEXP | LM_FLAG_ANY | 898 LM_FLAG_NOEXP | LM_FLAG_ANY |
933 LM_FLAG_PRIORITY); 899 LM_FLAG_PRIORITY);
934 unsigned int lck_ret; 900 unsigned int lck_ret;
935 901
902 if (glops->go_xmote_th)
903 glops->go_xmote_th(gl);
904
936 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 905 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
937 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); 906 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
938 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); 907 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
939 gfs2_assert_warn(sdp, state != gl->gl_state); 908 gfs2_assert_warn(sdp, state != gl->gl_state);
940 909
941 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
942 glops->go_sync(gl);
943
944 gfs2_glock_hold(gl); 910 gfs2_glock_hold(gl);
945 gl->gl_req_bh = xmote_bh; 911 gl->gl_req_bh = xmote_bh;
946 912
@@ -971,10 +937,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
971 const struct gfs2_glock_operations *glops = gl->gl_ops; 937 const struct gfs2_glock_operations *glops = gl->gl_ops;
972 struct gfs2_holder *gh = gl->gl_req_gh; 938 struct gfs2_holder *gh = gl->gl_req_gh;
973 939
974 clear_bit(GLF_PREFETCH, &gl->gl_flags);
975
976 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 940 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
977 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); 941 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
978 gfs2_assert_warn(sdp, !ret); 942 gfs2_assert_warn(sdp, !ret);
979 943
980 state_change(gl, LM_ST_UNLOCKED); 944 state_change(gl, LM_ST_UNLOCKED);
@@ -1001,12 +965,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
1001 965
1002 gfs2_glock_put(gl); 966 gfs2_glock_put(gl);
1003 967
1004 if (gh) { 968 if (gh)
1005 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) 969 gfs2_holder_dispose_or_wake(gh);
1006 gfs2_holder_put(gh);
1007 else
1008 complete(&gh->gh_wait);
1009 }
1010} 970}
1011 971
1012/** 972/**
@@ -1015,19 +975,19 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
1015 * 975 *
1016 */ 976 */
1017 977
1018void gfs2_glock_drop_th(struct gfs2_glock *gl) 978static void gfs2_glock_drop_th(struct gfs2_glock *gl)
1019{ 979{
1020 struct gfs2_sbd *sdp = gl->gl_sbd; 980 struct gfs2_sbd *sdp = gl->gl_sbd;
1021 const struct gfs2_glock_operations *glops = gl->gl_ops; 981 const struct gfs2_glock_operations *glops = gl->gl_ops;
1022 unsigned int ret; 982 unsigned int ret;
1023 983
984 if (glops->go_drop_th)
985 glops->go_drop_th(gl);
986
1024 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 987 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1025 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); 988 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
1026 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); 989 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1027 990
1028 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
1029 glops->go_sync(gl);
1030
1031 gfs2_glock_hold(gl); 991 gfs2_glock_hold(gl);
1032 gl->gl_req_bh = drop_bh; 992 gl->gl_req_bh = drop_bh;
1033 993
@@ -1107,8 +1067,7 @@ static int glock_wait_internal(struct gfs2_holder *gh)
1107 if (gh->gh_flags & LM_FLAG_PRIORITY) 1067 if (gh->gh_flags & LM_FLAG_PRIORITY)
1108 do_cancels(gh); 1068 do_cancels(gh);
1109 1069
1110 wait_for_completion(&gh->gh_wait); 1070 wait_on_holder(gh);
1111
1112 if (gh->gh_error) 1071 if (gh->gh_error)
1113 return gh->gh_error; 1072 return gh->gh_error;
1114 1073
@@ -1164,6 +1123,8 @@ static void add_to_queue(struct gfs2_holder *gh)
1164 struct gfs2_holder *existing; 1123 struct gfs2_holder *existing;
1165 1124
1166 BUG_ON(!gh->gh_owner); 1125 BUG_ON(!gh->gh_owner);
1126 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1127 BUG();
1167 1128
1168 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); 1129 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1169 if (existing) { 1130 if (existing) {
@@ -1227,8 +1188,6 @@ restart:
1227 } 1188 }
1228 } 1189 }
1229 1190
1230 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1231
1232 return error; 1191 return error;
1233} 1192}
1234 1193
@@ -1321,98 +1280,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1321} 1280}
1322 1281
1323/** 1282/**
1324 * gfs2_glock_prefetch - Try to prefetch a glock
1325 * @gl: the glock
1326 * @state: the state to prefetch in
1327 * @flags: flags passed to go_xmote_th()
1328 *
1329 */
1330
1331static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1332 int flags)
1333{
1334 const struct gfs2_glock_operations *glops = gl->gl_ops;
1335
1336 spin_lock(&gl->gl_spin);
1337
1338 if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
1339 !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
1340 !list_empty(&gl->gl_waiters3) ||
1341 relaxed_state_ok(gl->gl_state, state, flags)) {
1342 spin_unlock(&gl->gl_spin);
1343 return;
1344 }
1345
1346 set_bit(GLF_PREFETCH, &gl->gl_flags);
1347 set_bit(GLF_LOCK, &gl->gl_flags);
1348 spin_unlock(&gl->gl_spin);
1349
1350 glops->go_xmote_th(gl, state, flags);
1351}
1352
1353static void greedy_work(struct work_struct *work)
1354{
1355 struct greedy *gr = container_of(work, struct greedy, gr_work.work);
1356 struct gfs2_holder *gh = &gr->gr_gh;
1357 struct gfs2_glock *gl = gh->gh_gl;
1358 const struct gfs2_glock_operations *glops = gl->gl_ops;
1359
1360 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1361
1362 if (glops->go_greedy)
1363 glops->go_greedy(gl);
1364
1365 spin_lock(&gl->gl_spin);
1366
1367 if (list_empty(&gl->gl_waiters2)) {
1368 clear_bit(GLF_GREEDY, &gl->gl_flags);
1369 spin_unlock(&gl->gl_spin);
1370 gfs2_holder_uninit(gh);
1371 kfree(gr);
1372 } else {
1373 gfs2_glock_hold(gl);
1374 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1375 run_queue(gl);
1376 spin_unlock(&gl->gl_spin);
1377 gfs2_glock_put(gl);
1378 }
1379}
1380
1381/**
1382 * gfs2_glock_be_greedy -
1383 * @gl:
1384 * @time:
1385 *
1386 * Returns: 0 if go_greedy will be called, 1 otherwise
1387 */
1388
1389int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1390{
1391 struct greedy *gr;
1392 struct gfs2_holder *gh;
1393
1394 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1395 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1396 return 1;
1397
1398 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1399 if (!gr) {
1400 clear_bit(GLF_GREEDY, &gl->gl_flags);
1401 return 1;
1402 }
1403 gh = &gr->gr_gh;
1404
1405 gfs2_holder_init(gl, 0, 0, gh);
1406 set_bit(HIF_GREEDY, &gh->gh_iflags);
1407 INIT_DELAYED_WORK(&gr->gr_work, greedy_work);
1408
1409 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1410 schedule_delayed_work(&gr->gr_work, time);
1411
1412 return 0;
1413}
1414
1415/**
1416 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it 1283 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1417 * @gh: the holder structure 1284 * @gh: the holder structure
1418 * 1285 *
@@ -1470,10 +1337,7 @@ static int glock_compare(const void *arg_a, const void *arg_b)
1470 return 1; 1337 return 1;
1471 if (a->ln_number < b->ln_number) 1338 if (a->ln_number < b->ln_number)
1472 return -1; 1339 return -1;
1473 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE) 1340 BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type);
1474 return 1;
1475 if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
1476 return 1;
1477 return 0; 1341 return 0;
1478} 1342}
1479 1343
@@ -1618,34 +1482,6 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1618} 1482}
1619 1483
1620/** 1484/**
1621 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1622 * @sdp: the filesystem
1623 * @number: the lock number
1624 * @glops: the glock operations for the type of glock
1625 * @state: the state to acquire the glock in
1626 * @flags: modifier flags for the aquisition
1627 *
1628 * Returns: errno
1629 */
1630
1631void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
1632 const struct gfs2_glock_operations *glops,
1633 unsigned int state, int flags)
1634{
1635 struct gfs2_glock *gl;
1636 int error;
1637
1638 if (atomic_read(&sdp->sd_reclaim_count) <
1639 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1640 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1641 if (!error) {
1642 gfs2_glock_prefetch(gl, state, flags);
1643 gfs2_glock_put(gl);
1644 }
1645 }
1646}
1647
1648/**
1649 * gfs2_lvb_hold - attach a LVB from a glock 1485 * gfs2_lvb_hold - attach a LVB from a glock
1650 * @gl: The glock in question 1486 * @gl: The glock in question
1651 * 1487 *
@@ -1703,8 +1539,6 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1703 if (!gl) 1539 if (!gl)
1704 return; 1540 return;
1705 1541
1706 if (gl->gl_ops->go_callback)
1707 gl->gl_ops->go_callback(gl, state);
1708 handle_callback(gl, state); 1542 handle_callback(gl, state);
1709 1543
1710 spin_lock(&gl->gl_spin); 1544 spin_lock(&gl->gl_spin);
@@ -1746,12 +1580,14 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1746 struct lm_async_cb *async = data; 1580 struct lm_async_cb *async = data;
1747 struct gfs2_glock *gl; 1581 struct gfs2_glock *gl;
1748 1582
1583 down_read(&gfs2_umount_flush_sem);
1749 gl = gfs2_glock_find(sdp, &async->lc_name); 1584 gl = gfs2_glock_find(sdp, &async->lc_name);
1750 if (gfs2_assert_warn(sdp, gl)) 1585 if (gfs2_assert_warn(sdp, gl))
1751 return; 1586 return;
1752 if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) 1587 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1753 gl->gl_req_bh(gl, async->lc_ret); 1588 gl->gl_req_bh(gl, async->lc_ret);
1754 gfs2_glock_put(gl); 1589 gfs2_glock_put(gl);
1590 up_read(&gfs2_umount_flush_sem);
1755 return; 1591 return;
1756 } 1592 }
1757 1593
@@ -1781,15 +1617,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1781 1617
1782static int demote_ok(struct gfs2_glock *gl) 1618static int demote_ok(struct gfs2_glock *gl)
1783{ 1619{
1784 struct gfs2_sbd *sdp = gl->gl_sbd;
1785 const struct gfs2_glock_operations *glops = gl->gl_ops; 1620 const struct gfs2_glock_operations *glops = gl->gl_ops;
1786 int demote = 1; 1621 int demote = 1;
1787 1622
1788 if (test_bit(GLF_STICKY, &gl->gl_flags)) 1623 if (test_bit(GLF_STICKY, &gl->gl_flags))
1789 demote = 0; 1624 demote = 0;
1790 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1791 demote = time_after_eq(jiffies, gl->gl_stamp +
1792 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1793 else if (glops->go_demote_ok) 1625 else if (glops->go_demote_ok)
1794 demote = glops->go_demote_ok(gl); 1626 demote = glops->go_demote_ok(gl);
1795 1627
@@ -1845,7 +1677,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1845 atomic_inc(&sdp->sd_reclaimed); 1677 atomic_inc(&sdp->sd_reclaimed);
1846 1678
1847 if (gfs2_glmutex_trylock(gl)) { 1679 if (gfs2_glmutex_trylock(gl)) {
1848 if (queue_empty(gl, &gl->gl_holders) && 1680 if (list_empty(&gl->gl_holders) &&
1849 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1681 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1850 handle_callback(gl, LM_ST_UNLOCKED); 1682 handle_callback(gl, LM_ST_UNLOCKED);
1851 gfs2_glmutex_unlock(gl); 1683 gfs2_glmutex_unlock(gl);
@@ -1909,7 +1741,7 @@ static void scan_glock(struct gfs2_glock *gl)
1909 return; 1741 return;
1910 1742
1911 if (gfs2_glmutex_trylock(gl)) { 1743 if (gfs2_glmutex_trylock(gl)) {
1912 if (queue_empty(gl, &gl->gl_holders) && 1744 if (list_empty(&gl->gl_holders) &&
1913 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1745 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1914 goto out_schedule; 1746 goto out_schedule;
1915 gfs2_glmutex_unlock(gl); 1747 gfs2_glmutex_unlock(gl);
@@ -1958,7 +1790,7 @@ static void clear_glock(struct gfs2_glock *gl)
1958 } 1790 }
1959 1791
1960 if (gfs2_glmutex_trylock(gl)) { 1792 if (gfs2_glmutex_trylock(gl)) {
1961 if (queue_empty(gl, &gl->gl_holders) && 1793 if (list_empty(&gl->gl_holders) &&
1962 gl->gl_state != LM_ST_UNLOCKED) 1794 gl->gl_state != LM_ST_UNLOCKED)
1963 handle_callback(gl, LM_ST_UNLOCKED); 1795 handle_callback(gl, LM_ST_UNLOCKED);
1964 gfs2_glmutex_unlock(gl); 1796 gfs2_glmutex_unlock(gl);
@@ -2000,7 +1832,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2000 t = jiffies; 1832 t = jiffies;
2001 } 1833 }
2002 1834
1835 down_write(&gfs2_umount_flush_sem);
2003 invalidate_inodes(sdp->sd_vfs); 1836 invalidate_inodes(sdp->sd_vfs);
1837 up_write(&gfs2_umount_flush_sem);
2004 msleep(10); 1838 msleep(10);
2005 } 1839 }
2006} 1840}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index fb39108fc05c..f50e40ceca43 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -20,7 +20,6 @@
20#define LM_FLAG_ANY 0x00000008 20#define LM_FLAG_ANY 0x00000008
21#define LM_FLAG_PRIORITY 0x00000010 */ 21#define LM_FLAG_PRIORITY 0x00000010 */
22 22
23#define GL_LOCAL_EXCL 0x00000020
24#define GL_ASYNC 0x00000040 23#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 24#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100 25#define GL_SKIP 0x00000100
@@ -83,17 +82,11 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
83void gfs2_holder_reinit(unsigned int state, unsigned flags, 82void gfs2_holder_reinit(unsigned int state, unsigned flags,
84 struct gfs2_holder *gh); 83 struct gfs2_holder *gh);
85void gfs2_holder_uninit(struct gfs2_holder *gh); 84void gfs2_holder_uninit(struct gfs2_holder *gh);
86
87void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
88void gfs2_glock_drop_th(struct gfs2_glock *gl);
89
90int gfs2_glock_nq(struct gfs2_holder *gh); 85int gfs2_glock_nq(struct gfs2_holder *gh);
91int gfs2_glock_poll(struct gfs2_holder *gh); 86int gfs2_glock_poll(struct gfs2_holder *gh);
92int gfs2_glock_wait(struct gfs2_holder *gh); 87int gfs2_glock_wait(struct gfs2_holder *gh);
93void gfs2_glock_dq(struct gfs2_holder *gh); 88void gfs2_glock_dq(struct gfs2_holder *gh);
94 89
95int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
96
97void gfs2_glock_dq_uninit(struct gfs2_holder *gh); 90void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
98int gfs2_glock_nq_num(struct gfs2_sbd *sdp, 91int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
99 u64 number, const struct gfs2_glock_operations *glops, 92 u64 number, const struct gfs2_glock_operations *glops,
@@ -103,10 +96,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
103void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 96void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
104void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 97void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
105 98
106void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
107 const struct gfs2_glock_operations *glops,
108 unsigned int state, int flags);
109
110/** 99/**
111 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 100 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
112 * @gl: the glock 101 * @gl: the glock
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b068d10bcb6e..c4b0391b7aa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -117,12 +117,14 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
117 117
118static void meta_go_sync(struct gfs2_glock *gl) 118static void meta_go_sync(struct gfs2_glock *gl)
119{ 119{
120 if (gl->gl_state != LM_ST_EXCLUSIVE)
121 return;
122
120 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { 123 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
121 gfs2_log_flush(gl->gl_sbd, gl); 124 gfs2_log_flush(gl->gl_sbd, gl);
122 gfs2_meta_sync(gl); 125 gfs2_meta_sync(gl);
123 gfs2_ail_empty_gl(gl); 126 gfs2_ail_empty_gl(gl);
124 } 127 }
125
126} 128}
127 129
128/** 130/**
@@ -142,6 +144,37 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
142} 144}
143 145
144/** 146/**
147 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
148 * @gl: the glock protecting the inode
149 *
150 */
151
152static void inode_go_sync(struct gfs2_glock *gl)
153{
154 struct gfs2_inode *ip = gl->gl_object;
155
156 if (ip && !S_ISREG(ip->i_inode.i_mode))
157 ip = NULL;
158
159 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
160 gfs2_log_flush(gl->gl_sbd, gl);
161 if (ip)
162 filemap_fdatawrite(ip->i_inode.i_mapping);
163 gfs2_meta_sync(gl);
164 if (ip) {
165 struct address_space *mapping = ip->i_inode.i_mapping;
166 int error = filemap_fdatawait(mapping);
167 if (error == -ENOSPC)
168 set_bit(AS_ENOSPC, &mapping->flags);
169 else if (error)
170 set_bit(AS_EIO, &mapping->flags);
171 }
172 clear_bit(GLF_DIRTY, &gl->gl_flags);
173 gfs2_ail_empty_gl(gl);
174 }
175}
176
177/**
145 * inode_go_xmote_th - promote/demote a glock 178 * inode_go_xmote_th - promote/demote a glock
146 * @gl: the glock 179 * @gl: the glock
147 * @state: the requested state 180 * @state: the requested state
@@ -149,12 +182,12 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
149 * 182 *
150 */ 183 */
151 184
152static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state, 185static void inode_go_xmote_th(struct gfs2_glock *gl)
153 int flags)
154{ 186{
155 if (gl->gl_state != LM_ST_UNLOCKED) 187 if (gl->gl_state != LM_ST_UNLOCKED)
156 gfs2_pte_inval(gl); 188 gfs2_pte_inval(gl);
157 gfs2_glock_xmote_th(gl, state, flags); 189 if (gl->gl_state == LM_ST_EXCLUSIVE)
190 inode_go_sync(gl);
158} 191}
159 192
160/** 193/**
@@ -189,38 +222,8 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
189static void inode_go_drop_th(struct gfs2_glock *gl) 222static void inode_go_drop_th(struct gfs2_glock *gl)
190{ 223{
191 gfs2_pte_inval(gl); 224 gfs2_pte_inval(gl);
192 gfs2_glock_drop_th(gl); 225 if (gl->gl_state == LM_ST_EXCLUSIVE)
193} 226 inode_go_sync(gl);
194
195/**
196 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
197 * @gl: the glock protecting the inode
198 *
199 */
200
201static void inode_go_sync(struct gfs2_glock *gl)
202{
203 struct gfs2_inode *ip = gl->gl_object;
204
205 if (ip && !S_ISREG(ip->i_inode.i_mode))
206 ip = NULL;
207
208 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
209 gfs2_log_flush(gl->gl_sbd, gl);
210 if (ip)
211 filemap_fdatawrite(ip->i_inode.i_mapping);
212 gfs2_meta_sync(gl);
213 if (ip) {
214 struct address_space *mapping = ip->i_inode.i_mapping;
215 int error = filemap_fdatawait(mapping);
216 if (error == -ENOSPC)
217 set_bit(AS_ENOSPC, &mapping->flags);
218 else if (error)
219 set_bit(AS_EIO, &mapping->flags);
220 }
221 clear_bit(GLF_DIRTY, &gl->gl_flags);
222 gfs2_ail_empty_gl(gl);
223 }
224} 227}
225 228
226/** 229/**
@@ -295,7 +298,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
295 298
296 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && 299 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
297 (gl->gl_state == LM_ST_EXCLUSIVE) && 300 (gl->gl_state == LM_ST_EXCLUSIVE) &&
298 (gh->gh_flags & GL_LOCAL_EXCL)) 301 (gh->gh_state == LM_ST_EXCLUSIVE))
299 error = gfs2_truncatei_resume(ip); 302 error = gfs2_truncatei_resume(ip);
300 303
301 return error; 304 return error;
@@ -319,39 +322,6 @@ static void inode_go_unlock(struct gfs2_holder *gh)
319} 322}
320 323
321/** 324/**
322 * inode_greedy -
323 * @gl: the glock
324 *
325 */
326
327static void inode_greedy(struct gfs2_glock *gl)
328{
329 struct gfs2_sbd *sdp = gl->gl_sbd;
330 struct gfs2_inode *ip = gl->gl_object;
331 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
332 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
333 unsigned int new_time;
334
335 spin_lock(&ip->i_spin);
336
337 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
338 new_time = ip->i_greedy + quantum;
339 if (new_time > max)
340 new_time = max;
341 } else {
342 new_time = ip->i_greedy - quantum;
343 if (!new_time || new_time > max)
344 new_time = 1;
345 }
346
347 ip->i_greedy = new_time;
348
349 spin_unlock(&ip->i_spin);
350
351 iput(&ip->i_inode);
352}
353
354/**
355 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock 325 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
356 * @gl: the glock 326 * @gl: the glock
357 * 327 *
@@ -398,8 +368,7 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
398 * 368 *
399 */ 369 */
400 370
401static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, 371static void trans_go_xmote_th(struct gfs2_glock *gl)
402 int flags)
403{ 372{
404 struct gfs2_sbd *sdp = gl->gl_sbd; 373 struct gfs2_sbd *sdp = gl->gl_sbd;
405 374
@@ -408,8 +377,6 @@ static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
408 gfs2_meta_syncfs(sdp); 377 gfs2_meta_syncfs(sdp);
409 gfs2_log_shutdown(sdp); 378 gfs2_log_shutdown(sdp);
410 } 379 }
411
412 gfs2_glock_xmote_th(gl, state, flags);
413} 380}
414 381
415/** 382/**
@@ -461,8 +428,6 @@ static void trans_go_drop_th(struct gfs2_glock *gl)
461 gfs2_meta_syncfs(sdp); 428 gfs2_meta_syncfs(sdp);
462 gfs2_log_shutdown(sdp); 429 gfs2_log_shutdown(sdp);
463 } 430 }
464
465 gfs2_glock_drop_th(gl);
466} 431}
467 432
468/** 433/**
@@ -478,8 +443,8 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
478} 443}
479 444
480const struct gfs2_glock_operations gfs2_meta_glops = { 445const struct gfs2_glock_operations gfs2_meta_glops = {
481 .go_xmote_th = gfs2_glock_xmote_th, 446 .go_xmote_th = meta_go_sync,
482 .go_drop_th = gfs2_glock_drop_th, 447 .go_drop_th = meta_go_sync,
483 .go_type = LM_TYPE_META, 448 .go_type = LM_TYPE_META,
484}; 449};
485 450
@@ -487,19 +452,14 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
487 .go_xmote_th = inode_go_xmote_th, 452 .go_xmote_th = inode_go_xmote_th,
488 .go_xmote_bh = inode_go_xmote_bh, 453 .go_xmote_bh = inode_go_xmote_bh,
489 .go_drop_th = inode_go_drop_th, 454 .go_drop_th = inode_go_drop_th,
490 .go_sync = inode_go_sync,
491 .go_inval = inode_go_inval, 455 .go_inval = inode_go_inval,
492 .go_demote_ok = inode_go_demote_ok, 456 .go_demote_ok = inode_go_demote_ok,
493 .go_lock = inode_go_lock, 457 .go_lock = inode_go_lock,
494 .go_unlock = inode_go_unlock, 458 .go_unlock = inode_go_unlock,
495 .go_greedy = inode_greedy,
496 .go_type = LM_TYPE_INODE, 459 .go_type = LM_TYPE_INODE,
497}; 460};
498 461
499const struct gfs2_glock_operations gfs2_rgrp_glops = { 462const struct gfs2_glock_operations gfs2_rgrp_glops = {
500 .go_xmote_th = gfs2_glock_xmote_th,
501 .go_drop_th = gfs2_glock_drop_th,
502 .go_sync = meta_go_sync,
503 .go_inval = meta_go_inval, 463 .go_inval = meta_go_inval,
504 .go_demote_ok = rgrp_go_demote_ok, 464 .go_demote_ok = rgrp_go_demote_ok,
505 .go_lock = rgrp_go_lock, 465 .go_lock = rgrp_go_lock,
@@ -515,33 +475,23 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
515}; 475};
516 476
517const struct gfs2_glock_operations gfs2_iopen_glops = { 477const struct gfs2_glock_operations gfs2_iopen_glops = {
518 .go_xmote_th = gfs2_glock_xmote_th,
519 .go_drop_th = gfs2_glock_drop_th,
520 .go_type = LM_TYPE_IOPEN, 478 .go_type = LM_TYPE_IOPEN,
521}; 479};
522 480
523const struct gfs2_glock_operations gfs2_flock_glops = { 481const struct gfs2_glock_operations gfs2_flock_glops = {
524 .go_xmote_th = gfs2_glock_xmote_th,
525 .go_drop_th = gfs2_glock_drop_th,
526 .go_type = LM_TYPE_FLOCK, 482 .go_type = LM_TYPE_FLOCK,
527}; 483};
528 484
529const struct gfs2_glock_operations gfs2_nondisk_glops = { 485const struct gfs2_glock_operations gfs2_nondisk_glops = {
530 .go_xmote_th = gfs2_glock_xmote_th,
531 .go_drop_th = gfs2_glock_drop_th,
532 .go_type = LM_TYPE_NONDISK, 486 .go_type = LM_TYPE_NONDISK,
533}; 487};
534 488
535const struct gfs2_glock_operations gfs2_quota_glops = { 489const struct gfs2_glock_operations gfs2_quota_glops = {
536 .go_xmote_th = gfs2_glock_xmote_th,
537 .go_drop_th = gfs2_glock_drop_th,
538 .go_demote_ok = quota_go_demote_ok, 490 .go_demote_ok = quota_go_demote_ok,
539 .go_type = LM_TYPE_QUOTA, 491 .go_type = LM_TYPE_QUOTA,
540}; 492};
541 493
542const struct gfs2_glock_operations gfs2_journal_glops = { 494const struct gfs2_glock_operations gfs2_journal_glops = {
543 .go_xmote_th = gfs2_glock_xmote_th,
544 .go_drop_th = gfs2_glock_drop_th,
545 .go_type = LM_TYPE_JOURNAL, 495 .go_type = LM_TYPE_JOURNAL,
546}; 496};
547 497
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 734421edae85..12c80fd28db5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -101,17 +101,14 @@ struct gfs2_bufdata {
101}; 101};
102 102
103struct gfs2_glock_operations { 103struct gfs2_glock_operations {
104 void (*go_xmote_th) (struct gfs2_glock *gl, unsigned int state, int flags); 104 void (*go_xmote_th) (struct gfs2_glock *gl);
105 void (*go_xmote_bh) (struct gfs2_glock *gl); 105 void (*go_xmote_bh) (struct gfs2_glock *gl);
106 void (*go_drop_th) (struct gfs2_glock *gl); 106 void (*go_drop_th) (struct gfs2_glock *gl);
107 void (*go_drop_bh) (struct gfs2_glock *gl); 107 void (*go_drop_bh) (struct gfs2_glock *gl);
108 void (*go_sync) (struct gfs2_glock *gl);
109 void (*go_inval) (struct gfs2_glock *gl, int flags); 108 void (*go_inval) (struct gfs2_glock *gl, int flags);
110 int (*go_demote_ok) (struct gfs2_glock *gl); 109 int (*go_demote_ok) (struct gfs2_glock *gl);
111 int (*go_lock) (struct gfs2_holder *gh); 110 int (*go_lock) (struct gfs2_holder *gh);
112 void (*go_unlock) (struct gfs2_holder *gh); 111 void (*go_unlock) (struct gfs2_holder *gh);
113 void (*go_callback) (struct gfs2_glock *gl, unsigned int state);
114 void (*go_greedy) (struct gfs2_glock *gl);
115 const int go_type; 112 const int go_type;
116}; 113};
117 114
@@ -120,7 +117,6 @@ enum {
120 HIF_MUTEX = 0, 117 HIF_MUTEX = 0,
121 HIF_PROMOTE = 1, 118 HIF_PROMOTE = 1,
122 HIF_DEMOTE = 2, 119 HIF_DEMOTE = 2,
123 HIF_GREEDY = 3,
124 120
125 /* States */ 121 /* States */
126 HIF_ALLOCED = 4, 122 HIF_ALLOCED = 4,
@@ -128,6 +124,7 @@ enum {
128 HIF_HOLDER = 6, 124 HIF_HOLDER = 6,
129 HIF_FIRST = 7, 125 HIF_FIRST = 7,
130 HIF_ABORTED = 9, 126 HIF_ABORTED = 9,
127 HIF_WAIT = 10,
131}; 128};
132 129
133struct gfs2_holder { 130struct gfs2_holder {
@@ -140,17 +137,14 @@ struct gfs2_holder {
140 137
141 int gh_error; 138 int gh_error;
142 unsigned long gh_iflags; 139 unsigned long gh_iflags;
143 struct completion gh_wait;
144 unsigned long gh_ip; 140 unsigned long gh_ip;
145}; 141};
146 142
147enum { 143enum {
148 GLF_LOCK = 1, 144 GLF_LOCK = 1,
149 GLF_STICKY = 2, 145 GLF_STICKY = 2,
150 GLF_PREFETCH = 3,
151 GLF_DIRTY = 5, 146 GLF_DIRTY = 5,
152 GLF_SKIP_WAITERS2 = 6, 147 GLF_SKIP_WAITERS2 = 6,
153 GLF_GREEDY = 7,
154}; 148};
155 149
156struct gfs2_glock { 150struct gfs2_glock {
@@ -167,7 +161,7 @@ struct gfs2_glock {
167 unsigned long gl_ip; 161 unsigned long gl_ip;
168 struct list_head gl_holders; 162 struct list_head gl_holders;
169 struct list_head gl_waiters1; /* HIF_MUTEX */ 163 struct list_head gl_waiters1; /* HIF_MUTEX */
170 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ 164 struct list_head gl_waiters2; /* HIF_DEMOTE */
171 struct list_head gl_waiters3; /* HIF_PROMOTE */ 165 struct list_head gl_waiters3; /* HIF_PROMOTE */
172 166
173 const struct gfs2_glock_operations *gl_ops; 167 const struct gfs2_glock_operations *gl_ops;
@@ -236,7 +230,6 @@ struct gfs2_inode {
236 230
237 spinlock_t i_spin; 231 spinlock_t i_spin;
238 struct rw_semaphore i_rw_mutex; 232 struct rw_semaphore i_rw_mutex;
239 unsigned int i_greedy;
240 unsigned long i_last_pfault; 233 unsigned long i_last_pfault;
241 234
242 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; 235 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
@@ -418,17 +411,12 @@ struct gfs2_tune {
418 unsigned int gt_atime_quantum; /* Min secs between atime updates */ 411 unsigned int gt_atime_quantum; /* Min secs between atime updates */
419 unsigned int gt_new_files_jdata; 412 unsigned int gt_new_files_jdata;
420 unsigned int gt_new_files_directio; 413 unsigned int gt_new_files_directio;
421 unsigned int gt_max_atomic_write; /* Split big writes into this size */
422 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 414 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
423 unsigned int gt_lockdump_size; 415 unsigned int gt_lockdump_size;
424 unsigned int gt_stall_secs; /* Detects trouble! */ 416 unsigned int gt_stall_secs; /* Detects trouble! */
425 unsigned int gt_complain_secs; 417 unsigned int gt_complain_secs;
426 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ 418 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
427 unsigned int gt_entries_per_readdir; 419 unsigned int gt_entries_per_readdir;
428 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
429 unsigned int gt_greedy_default;
430 unsigned int gt_greedy_quantum;
431 unsigned int gt_greedy_max;
432 unsigned int gt_statfs_quantum; 420 unsigned int gt_statfs_quantum;
433 unsigned int gt_statfs_slow; 421 unsigned int gt_statfs_slow;
434}; 422};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d122074c45e1..0d6831a40565 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -287,10 +287,8 @@ out:
287 * 287 *
288 * Returns: errno 288 * Returns: errno
289 */ 289 */
290
291int gfs2_change_nlink(struct gfs2_inode *ip, int diff) 290int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
292{ 291{
293 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
294 struct buffer_head *dibh; 292 struct buffer_head *dibh;
295 u32 nlink; 293 u32 nlink;
296 int error; 294 int error;
@@ -315,42 +313,34 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
315 else 313 else
316 drop_nlink(&ip->i_inode); 314 drop_nlink(&ip->i_inode);
317 315
318 ip->i_inode.i_ctime.tv_sec = get_seconds(); 316 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
319 317
320 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 318 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
321 gfs2_dinode_out(ip, dibh->b_data); 319 gfs2_dinode_out(ip, dibh->b_data);
322 brelse(dibh); 320 brelse(dibh);
323 mark_inode_dirty(&ip->i_inode); 321 mark_inode_dirty(&ip->i_inode);
324 322
325 if (ip->i_inode.i_nlink == 0) { 323 if (ip->i_inode.i_nlink == 0)
326 struct gfs2_rgrpd *rgd;
327 struct gfs2_holder ri_gh, rg_gh;
328
329 error = gfs2_rindex_hold(sdp, &ri_gh);
330 if (error)
331 goto out;
332 error = -EIO;
333 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
334 if (!rgd)
335 goto out_norgrp;
336 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
337 if (error)
338 goto out_norgrp;
339
340 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ 324 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
341 gfs2_glock_dq_uninit(&rg_gh); 325
342out_norgrp:
343 gfs2_glock_dq_uninit(&ri_gh);
344 }
345out:
346 return error; 326 return error;
347} 327}
348 328
349struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) 329struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
350{ 330{
351 struct qstr qstr; 331 struct qstr qstr;
332 struct inode *inode;
352 gfs2_str2qstr(&qstr, name); 333 gfs2_str2qstr(&qstr, name);
353 return gfs2_lookupi(dip, &qstr, 1, NULL); 334 inode = gfs2_lookupi(dip, &qstr, 1, NULL);
335 /* gfs2_lookupi has inconsistent callers: vfs
336 * related routines expect NULL for no entry found,
337 * gfs2_lookup_simple callers expect ENOENT
338 * and do not check for NULL.
339 */
340 if (inode == NULL)
341 return ERR_PTR(-ENOENT);
342 else
343 return inode;
354} 344}
355 345
356 346
@@ -361,8 +351,10 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
361 * @is_root: If 1, ignore the caller's permissions 351 * @is_root: If 1, ignore the caller's permissions
362 * @i_gh: An uninitialized holder for the new inode glock 352 * @i_gh: An uninitialized holder for the new inode glock
363 * 353 *
364 * There will always be a vnode (Linux VFS inode) for the d_gh inode unless 354 * This can be called via the VFS filldir function when NFS is doing
365 * @is_root is true. 355 * a readdirplus and the inode which its intending to stat isn't
356 * already in cache. In this case we must not take the directory glock
357 * again, since the readdir call will have already taken that lock.
366 * 358 *
367 * Returns: errno 359 * Returns: errno
368 */ 360 */
@@ -375,8 +367,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
375 struct gfs2_holder d_gh; 367 struct gfs2_holder d_gh;
376 struct gfs2_inum_host inum; 368 struct gfs2_inum_host inum;
377 unsigned int type; 369 unsigned int type;
378 int error = 0; 370 int error;
379 struct inode *inode = NULL; 371 struct inode *inode = NULL;
372 int unlock = 0;
380 373
381 if (!name->len || name->len > GFS2_FNAMESIZE) 374 if (!name->len || name->len > GFS2_FNAMESIZE)
382 return ERR_PTR(-ENAMETOOLONG); 375 return ERR_PTR(-ENAMETOOLONG);
@@ -388,9 +381,12 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
388 return dir; 381 return dir;
389 } 382 }
390 383
391 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 384 if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) {
392 if (error) 385 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
393 return ERR_PTR(error); 386 if (error)
387 return ERR_PTR(error);
388 unlock = 1;
389 }
394 390
395 if (!is_root) { 391 if (!is_root) {
396 error = permission(dir, MAY_EXEC, NULL); 392 error = permission(dir, MAY_EXEC, NULL);
@@ -405,10 +401,11 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
405 inode = gfs2_inode_lookup(sb, &inum, type); 401 inode = gfs2_inode_lookup(sb, &inum, type);
406 402
407out: 403out:
408 gfs2_glock_dq_uninit(&d_gh); 404 if (unlock)
405 gfs2_glock_dq_uninit(&d_gh);
409 if (error == -ENOENT) 406 if (error == -ENOENT)
410 return NULL; 407 return NULL;
411 return inode; 408 return inode ? inode : ERR_PTR(error);
412} 409}
413 410
414static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) 411static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
index effe4a337c1d..e30673dd37e0 100644
--- a/fs/gfs2/lm.c
+++ b/fs/gfs2/lm.c
@@ -104,15 +104,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
104 vprintk(fmt, args); 104 vprintk(fmt, args);
105 va_end(args); 105 va_end(args);
106 106
107 fs_err(sdp, "about to withdraw from the cluster\n"); 107 fs_err(sdp, "about to withdraw this file system\n");
108 BUG_ON(sdp->sd_args.ar_debug); 108 BUG_ON(sdp->sd_args.ar_debug);
109 109
110
111 fs_err(sdp, "waiting for outstanding I/O\n");
112
113 /* FIXME: suspend dm device so oustanding bio's complete
114 and all further io requests fail */
115
116 fs_err(sdp, "telling LM to withdraw\n"); 110 fs_err(sdp, "telling LM to withdraw\n");
117 gfs2_withdraw_lockproto(&sdp->sd_lockstruct); 111 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
118 fs_err(sdp, "withdrawn\n"); 112 fs_err(sdp, "withdrawn\n");
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 33af707a4d3f..a87c7bf3c568 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
36 36
37#define GDLM_STRNAME_BYTES 24 37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32 38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000 39#define GDLM_DROP_COUNT 200000
40#define GDLM_DROP_PERIOD 60 40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128 41#define GDLM_NAME_LEN 128
42 42
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index 2194b1d5b5ec..a0e7eda643ed 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,9 +11,6 @@
11 11
12#include "lock_dlm.h" 12#include "lock_dlm.h"
13 13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops; 14extern struct lm_lockops gdlm_ops;
18 15
19static int __init init_lock_dlm(void) 16static int __init init_lock_dlm(void)
@@ -40,9 +37,6 @@ static int __init init_lock_dlm(void)
40 return error; 37 return error;
41 } 38 }
42 39
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO 40 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); 41 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0; 42 return 0;
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index cdd1694e889b..1d8faa3da8af 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -9,8 +9,6 @@
9 9
10#include "lock_dlm.h" 10#include "lock_dlm.h"
11 11
12int gdlm_drop_count;
13int gdlm_drop_period;
14const struct lm_lockops gdlm_ops; 12const struct lm_lockops gdlm_ops;
15 13
16 14
@@ -24,8 +22,8 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
24 if (!ls) 22 if (!ls)
25 return NULL; 23 return NULL;
26 24
27 ls->drop_locks_count = gdlm_drop_count; 25 ls->drop_locks_count = GDLM_DROP_COUNT;
28 ls->drop_locks_period = gdlm_drop_period; 26 ls->drop_locks_period = GDLM_DROP_PERIOD;
29 ls->fscb = cb; 27 ls->fscb = cb;
30 ls->sdp = sdp; 28 ls->sdp = sdp;
31 ls->fsflags = flags; 29 ls->fsflags = flags;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 29ae06f94944..4746b884662d 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -116,6 +116,17 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
116 return sprintf(buf, "%d\n", ls->recover_jid_status); 116 return sprintf(buf, "%d\n", ls->recover_jid_status);
117} 117}
118 118
119static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
120{
121 return sprintf(buf, "%d\n", ls->drop_locks_count);
122}
123
124static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
125{
126 ls->drop_locks_count = simple_strtol(buf, NULL, 0);
127 return len;
128}
129
119struct gdlm_attr { 130struct gdlm_attr {
120 struct attribute attr; 131 struct attribute attr;
121 ssize_t (*show)(struct gdlm_ls *, char *); 132 ssize_t (*show)(struct gdlm_ls *, char *);
@@ -135,6 +146,7 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
135GDLM_ATTR(recover, 0644, recover_show, recover_store); 146GDLM_ATTR(recover, 0644, recover_show, recover_store);
136GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 147GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
137GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); 148GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
149GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
138 150
139static struct attribute *gdlm_attrs[] = { 151static struct attribute *gdlm_attrs[] = {
140 &gdlm_attr_proto_name.attr, 152 &gdlm_attr_proto_name.attr,
@@ -147,6 +159,7 @@ static struct attribute *gdlm_attrs[] = {
147 &gdlm_attr_recover.attr, 159 &gdlm_attr_recover.attr,
148 &gdlm_attr_recover_done.attr, 160 &gdlm_attr_recover_done.attr,
149 &gdlm_attr_recover_status.attr, 161 &gdlm_attr_recover_status.attr,
162 &gdlm_attr_drop_count.attr,
150 NULL, 163 NULL,
151}; 164};
152 165
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4d7f94d8c7bd..16bb4b4561ae 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -69,13 +69,16 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr; 70 struct gfs2_trans *tr;
71 71
72 if (!list_empty(&bd->bd_list_tr)) 72 gfs2_log_lock(sdp);
73 if (!list_empty(&bd->bd_list_tr)) {
74 gfs2_log_unlock(sdp);
73 return; 75 return;
74 76 }
75 tr = current->journal_info; 77 tr = current->journal_info;
76 tr->tr_touched = 1; 78 tr->tr_touched = 1;
77 tr->tr_num_buf++; 79 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 80 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
81 gfs2_log_unlock(sdp);
79 82
80 if (!list_empty(&le->le_list)) 83 if (!list_empty(&le->le_list))
81 return; 84 return;
@@ -84,7 +87,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
84 87
85 gfs2_meta_check(sdp, bd->bd_bh); 88 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh); 89 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp); 90 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++; 91 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf); 92 list_add(&le->le_list, &sdp->sd_log_le_buf);
@@ -98,11 +100,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
98 struct list_head *head = &tr->tr_list_buf; 100 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd; 101 struct gfs2_bufdata *bd;
100 102
103 gfs2_log_lock(sdp);
101 while (!list_empty(head)) { 104 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); 105 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr); 106 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--; 107 tr->tr_num_buf--;
105 } 108 }
109 gfs2_log_unlock(sdp);
106 gfs2_assert_warn(sdp, !tr->tr_num_buf); 110 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107} 111}
108 112
@@ -462,13 +466,17 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
462 struct address_space *mapping = bd->bd_bh->b_page->mapping; 466 struct address_space *mapping = bd->bd_bh->b_page->mapping;
463 struct gfs2_inode *ip = GFS2_I(mapping->host); 467 struct gfs2_inode *ip = GFS2_I(mapping->host);
464 468
469 gfs2_log_lock(sdp);
465 tr->tr_touched = 1; 470 tr->tr_touched = 1;
466 if (list_empty(&bd->bd_list_tr) && 471 if (list_empty(&bd->bd_list_tr) &&
467 (ip->i_di.di_flags & GFS2_DIF_JDATA)) { 472 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
468 tr->tr_num_buf++; 473 tr->tr_num_buf++;
469 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 474 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
475 gfs2_log_unlock(sdp);
470 gfs2_pin(sdp, bd->bd_bh); 476 gfs2_pin(sdp, bd->bd_bh);
471 tr->tr_num_buf_new++; 477 tr->tr_num_buf_new++;
478 } else {
479 gfs2_log_unlock(sdp);
472 } 480 }
473 gfs2_trans_add_gl(bd->bd_gl); 481 gfs2_trans_add_gl(bd->bd_gl);
474 gfs2_log_lock(sdp); 482 gfs2_log_lock(sdp);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index d8d69a72a10d..56e33590b656 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -16,6 +16,7 @@
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/mpage.h> 17#include <linux/mpage.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/writeback.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
21 22
@@ -157,6 +158,32 @@ out_ignore:
157} 158}
158 159
159/** 160/**
161 * gfs2_writepages - Write a bunch of dirty pages back to disk
162 * @mapping: The mapping to write
163 * @wbc: Write-back control
164 *
165 * For journaled files and/or ordered writes this just falls back to the
166 * kernel's default writepages path for now. We will probably want to change
167 * that eventually (i.e. when we look at allocate on flush).
168 *
169 * For the data=writeback case though we can already ignore buffer heads
170 * and write whole extents at once. This is a big reduction in the
171 * number of I/O requests we send and the bmap calls we make in this case.
172 */
173static int gfs2_writepages(struct address_space *mapping,
174 struct writeback_control *wbc)
175{
176 struct inode *inode = mapping->host;
177 struct gfs2_inode *ip = GFS2_I(inode);
178 struct gfs2_sbd *sdp = GFS2_SB(inode);
179
180 if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
181 return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
182
183 return generic_writepages(mapping, wbc);
184}
185
186/**
160 * stuffed_readpage - Fill in a Linux page with stuffed file data 187 * stuffed_readpage - Fill in a Linux page with stuffed file data
161 * @ip: the inode 188 * @ip: the inode
162 * @page: the page 189 * @page: the page
@@ -256,7 +283,7 @@ out_unlock:
256 * the page lock and the glock) and return having done no I/O. Its 283 * the page lock and the glock) and return having done no I/O. Its
257 * obviously not something we'd want to do on too regular a basis. 284 * obviously not something we'd want to do on too regular a basis.
258 * Any I/O we ignore at this time will be done via readpage later. 285 * Any I/O we ignore at this time will be done via readpage later.
259 * 2. We have to handle stuffed files here too. 286 * 2. We don't handle stuffed files here we let readpage do the honours.
260 * 3. mpage_readpages() does most of the heavy lifting in the common case. 287 * 3. mpage_readpages() does most of the heavy lifting in the common case.
261 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. 288 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
262 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as 289 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
@@ -269,8 +296,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
269 struct gfs2_inode *ip = GFS2_I(inode); 296 struct gfs2_inode *ip = GFS2_I(inode);
270 struct gfs2_sbd *sdp = GFS2_SB(inode); 297 struct gfs2_sbd *sdp = GFS2_SB(inode);
271 struct gfs2_holder gh; 298 struct gfs2_holder gh;
272 unsigned page_idx; 299 int ret = 0;
273 int ret;
274 int do_unlock = 0; 300 int do_unlock = 0;
275 301
276 if (likely(file != &gfs2_internal_file_sentinel)) { 302 if (likely(file != &gfs2_internal_file_sentinel)) {
@@ -289,29 +315,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
289 goto out_unlock; 315 goto out_unlock;
290 } 316 }
291skip_lock: 317skip_lock:
292 if (gfs2_is_stuffed(ip)) { 318 if (!gfs2_is_stuffed(ip))
293 struct pagevec lru_pvec;
294 pagevec_init(&lru_pvec, 0);
295 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
296 struct page *page = list_entry(pages->prev, struct page, lru);
297 prefetchw(&page->flags);
298 list_del(&page->lru);
299 if (!add_to_page_cache(page, mapping,
300 page->index, GFP_KERNEL)) {
301 ret = stuffed_readpage(ip, page);
302 unlock_page(page);
303 if (!pagevec_add(&lru_pvec, page))
304 __pagevec_lru_add(&lru_pvec);
305 } else {
306 page_cache_release(page);
307 }
308 }
309 pagevec_lru_add(&lru_pvec);
310 ret = 0;
311 } else {
312 /* What we really want to do .... */
313 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); 319 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
314 }
315 320
316 if (do_unlock) { 321 if (do_unlock) {
317 gfs2_glock_dq_m(1, &gh); 322 gfs2_glock_dq_m(1, &gh);
@@ -356,8 +361,10 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
356 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh); 361 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh);
357 error = gfs2_glock_nq_atime(&ip->i_gh); 362 error = gfs2_glock_nq_atime(&ip->i_gh);
358 if (unlikely(error)) { 363 if (unlikely(error)) {
359 if (error == GLR_TRYFAILED) 364 if (error == GLR_TRYFAILED) {
365 unlock_page(page);
360 error = AOP_TRUNCATED_PAGE; 366 error = AOP_TRUNCATED_PAGE;
367 }
361 goto out_uninit; 368 goto out_uninit;
362 } 369 }
363 370
@@ -594,6 +601,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset)
594 return; 601 return;
595} 602}
596 603
604/**
605 * gfs2_ok_for_dio - check that dio is valid on this file
606 * @ip: The inode
607 * @rw: READ or WRITE
608 * @offset: The offset at which we are reading or writing
609 *
610 * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
611 * 1 (to accept the i/o request)
612 */
613static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
614{
615 /*
616 * Should we return an error here? I can't see that O_DIRECT for
617 * a journaled file makes any sense. For now we'll silently fall
618 * back to buffered I/O, likewise we do the same for stuffed
619 * files since they are (a) small and (b) unaligned.
620 */
621 if (gfs2_is_jdata(ip))
622 return 0;
623
624 if (gfs2_is_stuffed(ip))
625 return 0;
626
627 if (offset > i_size_read(&ip->i_inode))
628 return 0;
629 return 1;
630}
631
632
633
597static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, 634static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
598 const struct iovec *iov, loff_t offset, 635 const struct iovec *iov, loff_t offset,
599 unsigned long nr_segs) 636 unsigned long nr_segs)
@@ -604,42 +641,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
604 struct gfs2_holder gh; 641 struct gfs2_holder gh;
605 int rv; 642 int rv;
606 643
607 if (rw == READ)
608 mutex_lock(&inode->i_mutex);
609 /* 644 /*
610 * Shared lock, even if its a write, since we do no allocation 645 * Deferred lock, even if its a write, since we do no allocation
611 * on this path. All we need change is atime. 646 * on this path. All we need change is atime, and this lock mode
647 * ensures that other nodes have flushed their buffered read caches
648 * (i.e. their page cache entries for this inode). We do not,
649 * unfortunately have the option of only flushing a range like
650 * the VFS does.
612 */ 651 */
613 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 652 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
614 rv = gfs2_glock_nq_atime(&gh); 653 rv = gfs2_glock_nq_atime(&gh);
615 if (rv) 654 if (rv)
616 goto out; 655 return rv;
617 656 rv = gfs2_ok_for_dio(ip, rw, offset);
618 if (offset > i_size_read(inode)) 657 if (rv != 1)
619 goto out; 658 goto out; /* dio not valid, fall back to buffered i/o */
620 659
621 /* 660 rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
622 * Should we return an error here? I can't see that O_DIRECT for 661 iov, offset, nr_segs,
623 * a journaled file makes any sense. For now we'll silently fall 662 gfs2_get_block_direct, NULL);
624 * back to buffered I/O, likewise we do the same for stuffed
625 * files since they are (a) small and (b) unaligned.
626 */
627 if (gfs2_is_jdata(ip))
628 goto out;
629
630 if (gfs2_is_stuffed(ip))
631 goto out;
632
633 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
634 inode->i_sb->s_bdev,
635 iov, offset, nr_segs,
636 gfs2_get_block_direct, NULL);
637out: 663out:
638 gfs2_glock_dq_m(1, &gh); 664 gfs2_glock_dq_m(1, &gh);
639 gfs2_holder_uninit(&gh); 665 gfs2_holder_uninit(&gh);
640 if (rw == READ)
641 mutex_unlock(&inode->i_mutex);
642
643 return rv; 666 return rv;
644} 667}
645 668
@@ -763,6 +786,7 @@ out:
763 786
764const struct address_space_operations gfs2_file_aops = { 787const struct address_space_operations gfs2_file_aops = {
765 .writepage = gfs2_writepage, 788 .writepage = gfs2_writepage,
789 .writepages = gfs2_writepages,
766 .readpage = gfs2_readpage, 790 .readpage = gfs2_readpage,
767 .readpages = gfs2_readpages, 791 .readpages = gfs2_readpages,
768 .sync_page = block_sync_page, 792 .sync_page = block_sync_page,
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index d355899585d8..9187eb174b43 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -46,6 +46,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
46 struct gfs2_inum_host inum; 46 struct gfs2_inum_host inum;
47 unsigned int type; 47 unsigned int type;
48 int error; 48 int error;
49 int had_lock=0;
49 50
50 if (inode && is_bad_inode(inode)) 51 if (inode && is_bad_inode(inode))
51 goto invalid; 52 goto invalid;
@@ -53,9 +54,12 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
53 if (sdp->sd_args.ar_localcaching) 54 if (sdp->sd_args.ar_localcaching)
54 goto valid; 55 goto valid;
55 56
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 57 had_lock = gfs2_glock_is_locked_by_me(dip->i_gl);
57 if (error) 58 if (!had_lock) {
58 goto fail; 59 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
60 if (error)
61 goto fail;
62 }
59 63
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); 64 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) { 65 switch (error) {
@@ -82,13 +86,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
82 } 86 }
83 87
84valid_gunlock: 88valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh); 89 if (!had_lock)
90 gfs2_glock_dq_uninit(&d_gh);
86valid: 91valid:
87 dput(parent); 92 dput(parent);
88 return 1; 93 return 1;
89 94
90invalid_gunlock: 95invalid_gunlock:
91 gfs2_glock_dq_uninit(&d_gh); 96 if (!had_lock)
97 gfs2_glock_dq_uninit(&d_gh);
92invalid: 98invalid:
93 if (inode && S_ISDIR(inode->i_mode)) { 99 if (inode && S_ISDIR(inode->i_mode)) {
94 if (have_submounts(dentry)) 100 if (have_submounts(dentry))
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index b4e7b8775315..4855e8cca622 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,6 +22,7 @@
22#include "glock.h" 22#include "glock.h"
23#include "glops.h" 23#include "glops.h"
24#include "inode.h" 24#include "inode.h"
25#include "ops_dentry.h"
25#include "ops_export.h" 26#include "ops_export.h"
26#include "rgrp.h" 27#include "rgrp.h"
27#include "util.h" 28#include "util.h"
@@ -112,13 +113,12 @@ struct get_name_filldir {
112 char *name; 113 char *name;
113}; 114};
114 115
115static int get_name_filldir(void *opaque, const char *name, unsigned int length, 116static int get_name_filldir(void *opaque, const char *name, int length,
116 u64 offset, struct gfs2_inum_host *inum, 117 loff_t offset, u64 inum, unsigned int type)
117 unsigned int type)
118{ 118{
119 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque; 119 struct get_name_filldir *gnfd = opaque;
120 120
121 if (!gfs2_inum_equal(inum, &gnfd->inum)) 121 if (inum != gnfd->inum.no_addr)
122 return 0; 122 return 0;
123 123
124 memcpy(gnfd->name, name, length); 124 memcpy(gnfd->name, name, length);
@@ -189,6 +189,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
189 return ERR_PTR(-ENOMEM); 189 return ERR_PTR(-ENOMEM);
190 } 190 }
191 191
192 dentry->d_op = &gfs2_dops;
192 return dentry; 193 return dentry;
193} 194}
194 195
@@ -215,8 +216,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
215 } 216 }
216 217
217 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 218 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
218 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL, 219 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
219 &i_gh);
220 if (error) 220 if (error)
221 return ERR_PTR(error); 221 return ERR_PTR(error);
222 222
@@ -269,6 +269,7 @@ out_inode:
269 return ERR_PTR(-ENOMEM); 269 return ERR_PTR(-ENOMEM);
270 } 270 }
271 271
272 dentry->d_op = &gfs2_dops;
272 return dentry; 273 return dentry;
273 274
274fail_rgd: 275fail_rgd:
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index faa07e4b97d0..c996aa739a05 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -43,15 +43,6 @@
43#include "util.h" 43#include "util.h"
44#include "eaops.h" 44#include "eaops.h"
45 45
46/* For regular, non-NFS */
47struct filldir_reg {
48 struct gfs2_sbd *fdr_sbd;
49 int fdr_prefetch;
50
51 filldir_t fdr_filldir;
52 void *fdr_opaque;
53};
54
55/* 46/*
56 * Most fields left uninitialised to catch anybody who tries to 47 * Most fields left uninitialised to catch anybody who tries to
57 * use them. f_flags set to prevent file_accessed() from touching 48 * use them. f_flags set to prevent file_accessed() from touching
@@ -128,41 +119,6 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
128} 119}
129 120
130/** 121/**
131 * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
132 * @opaque: opaque data used by the function
133 * @name: the name of the directory entry
134 * @length: the length of the name
135 * @offset: the entry's offset in the directory
136 * @inum: the inode number the entry points to
137 * @type: the type of inode the entry points to
138 *
139 * Returns: 0 on success, 1 if buffer full
140 */
141
142static int filldir_func(void *opaque, const char *name, unsigned int length,
143 u64 offset, struct gfs2_inum_host *inum,
144 unsigned int type)
145{
146 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
147 struct gfs2_sbd *sdp = fdr->fdr_sbd;
148 int error;
149
150 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
151 inum->no_addr, type);
152 if (error)
153 return 1;
154
155 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
156 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
157 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
158 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
159 LM_ST_SHARED, LM_FLAG_TRY);
160 }
161
162 return 0;
163}
164
165/**
166 * gfs2_readdir - Read directory entries from a directory 122 * gfs2_readdir - Read directory entries from a directory
167 * @file: The directory to read from 123 * @file: The directory to read from
168 * @dirent: Buffer for dirents 124 * @dirent: Buffer for dirents
@@ -175,16 +131,10 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
175{ 131{
176 struct inode *dir = file->f_mapping->host; 132 struct inode *dir = file->f_mapping->host;
177 struct gfs2_inode *dip = GFS2_I(dir); 133 struct gfs2_inode *dip = GFS2_I(dir);
178 struct filldir_reg fdr;
179 struct gfs2_holder d_gh; 134 struct gfs2_holder d_gh;
180 u64 offset = file->f_pos; 135 u64 offset = file->f_pos;
181 int error; 136 int error;
182 137
183 fdr.fdr_sbd = GFS2_SB(dir);
184 fdr.fdr_prefetch = 1;
185 fdr.fdr_filldir = filldir;
186 fdr.fdr_opaque = dirent;
187
188 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); 138 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
189 error = gfs2_glock_nq_atime(&d_gh); 139 error = gfs2_glock_nq_atime(&d_gh);
190 if (error) { 140 if (error) {
@@ -192,7 +142,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
192 return error; 142 return error;
193 } 143 }
194 144
195 error = gfs2_dir_read(dir, &offset, &fdr, filldir_func); 145 error = gfs2_dir_read(dir, &offset, dirent, filldir);
196 146
197 gfs2_glock_dq_uninit(&d_gh); 147 gfs2_glock_dq_uninit(&d_gh);
198 148
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 636dda4c7d38..f40a84807d75 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -264,13 +264,23 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
264 struct gfs2_inode *dip = GFS2_I(dir); 264 struct gfs2_inode *dip = GFS2_I(dir);
265 struct gfs2_sbd *sdp = GFS2_SB(dir); 265 struct gfs2_sbd *sdp = GFS2_SB(dir);
266 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 266 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
267 struct gfs2_holder ghs[2]; 267 struct gfs2_holder ghs[3];
268 struct gfs2_rgrpd *rgd;
269 struct gfs2_holder ri_gh;
268 int error; 270 int error;
269 271
272 error = gfs2_rindex_hold(sdp, &ri_gh);
273 if (error)
274 return error;
275
270 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 276 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
271 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 277 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
272 278
273 error = gfs2_glock_nq_m(2, ghs); 279 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
280 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
281
282
283 error = gfs2_glock_nq_m(3, ghs);
274 if (error) 284 if (error)
275 goto out; 285 goto out;
276 286
@@ -291,10 +301,12 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
291out_end_trans: 301out_end_trans:
292 gfs2_trans_end(sdp); 302 gfs2_trans_end(sdp);
293out_gunlock: 303out_gunlock:
294 gfs2_glock_dq_m(2, ghs); 304 gfs2_glock_dq_m(3, ghs);
295out: 305out:
296 gfs2_holder_uninit(ghs); 306 gfs2_holder_uninit(ghs);
297 gfs2_holder_uninit(ghs + 1); 307 gfs2_holder_uninit(ghs + 1);
308 gfs2_holder_uninit(ghs + 2);
309 gfs2_glock_dq_uninit(&ri_gh);
298 return error; 310 return error;
299} 311}
300 312
@@ -449,13 +461,22 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
449 struct gfs2_inode *dip = GFS2_I(dir); 461 struct gfs2_inode *dip = GFS2_I(dir);
450 struct gfs2_sbd *sdp = GFS2_SB(dir); 462 struct gfs2_sbd *sdp = GFS2_SB(dir);
451 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 463 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
452 struct gfs2_holder ghs[2]; 464 struct gfs2_holder ghs[3];
465 struct gfs2_rgrpd *rgd;
466 struct gfs2_holder ri_gh;
453 int error; 467 int error;
454 468
469
470 error = gfs2_rindex_hold(sdp, &ri_gh);
471 if (error)
472 return error;
455 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 473 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
456 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 474 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
457 475
458 error = gfs2_glock_nq_m(2, ghs); 476 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
477 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
478
479 error = gfs2_glock_nq_m(3, ghs);
459 if (error) 480 if (error)
460 goto out; 481 goto out;
461 482
@@ -483,10 +504,12 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
483 gfs2_trans_end(sdp); 504 gfs2_trans_end(sdp);
484 505
485out_gunlock: 506out_gunlock:
486 gfs2_glock_dq_m(2, ghs); 507 gfs2_glock_dq_m(3, ghs);
487out: 508out:
488 gfs2_holder_uninit(ghs); 509 gfs2_holder_uninit(ghs);
489 gfs2_holder_uninit(ghs + 1); 510 gfs2_holder_uninit(ghs + 1);
511 gfs2_holder_uninit(ghs + 2);
512 gfs2_glock_dq_uninit(&ri_gh);
490 return error; 513 return error;
491} 514}
492 515
@@ -547,7 +570,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
547 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 570 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
548 struct gfs2_inode *nip = NULL; 571 struct gfs2_inode *nip = NULL;
549 struct gfs2_sbd *sdp = GFS2_SB(odir); 572 struct gfs2_sbd *sdp = GFS2_SB(odir);
550 struct gfs2_holder ghs[4], r_gh; 573 struct gfs2_holder ghs[5], r_gh;
574 struct gfs2_rgrpd *nrgd;
551 unsigned int num_gh; 575 unsigned int num_gh;
552 int dir_rename = 0; 576 int dir_rename = 0;
553 int alloc_required; 577 int alloc_required;
@@ -587,6 +611,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
587 if (nip) { 611 if (nip) {
588 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 612 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
589 num_gh++; 613 num_gh++;
614 /* grab the resource lock for unlink flag twiddling
615 * this is the case of the target file already existing
616 * so we unlink before doing the rename
617 */
618 nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr);
619 if (nrgd)
620 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
590 } 621 }
591 622
592 error = gfs2_glock_nq_m(num_gh, ghs); 623 error = gfs2_glock_nq_m(num_gh, ghs);
@@ -684,12 +715,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
684 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 715 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
685 al->al_rgd->rd_ri.ri_length + 716 al->al_rgd->rd_ri.ri_length +
686 4 * RES_DINODE + 4 * RES_LEAF + 717 4 * RES_DINODE + 4 * RES_LEAF +
687 RES_STATFS + RES_QUOTA, 0); 718 RES_STATFS + RES_QUOTA + 4, 0);
688 if (error) 719 if (error)
689 goto out_ipreserv; 720 goto out_ipreserv;
690 } else { 721 } else {
691 error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 722 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
692 5 * RES_LEAF, 0); 723 5 * RES_LEAF + 4, 0);
693 if (error) 724 if (error)
694 goto out_gunlock; 725 goto out_gunlock;
695 } 726 }
@@ -728,7 +759,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
728 error = gfs2_meta_inode_buffer(ip, &dibh); 759 error = gfs2_meta_inode_buffer(ip, &dibh);
729 if (error) 760 if (error)
730 goto out_end_trans; 761 goto out_end_trans;
731 ip->i_inode.i_ctime.tv_sec = get_seconds(); 762 ip->i_inode.i_ctime = CURRENT_TIME_SEC;
732 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 763 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
733 gfs2_dinode_out(ip, dibh->b_data); 764 gfs2_dinode_out(ip, dibh->b_data);
734 brelse(dibh); 765 brelse(dibh);
@@ -1018,7 +1049,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1018 } 1049 }
1019 1050
1020 generic_fillattr(inode, stat); 1051 generic_fillattr(inode, stat);
1021 if (unlock); 1052 if (unlock)
1022 gfs2_glock_dq_uninit(&gh); 1053 gfs2_glock_dq_uninit(&gh);
1023 1054
1024 return 0; 1055 return 0;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 7685b46f934b..47369d011214 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
173 struct gfs2_sbd *sdp = sb->s_fs_info; 173 struct gfs2_sbd *sdp = sb->s_fs_info;
174 int error; 174 int error;
175 175
176 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
177 return;
178
176 for (;;) { 179 for (;;) {
177 error = gfs2_freeze_fs(sdp); 180 error = gfs2_freeze_fs(sdp);
178 if (!error) 181 if (!error)
@@ -426,6 +429,12 @@ static void gfs2_delete_inode(struct inode *inode)
426 } 429 }
427 430
428 error = gfs2_dinode_dealloc(ip); 431 error = gfs2_dinode_dealloc(ip);
432 /*
433 * Must do this before unlock to avoid trying to write back
434 * potentially dirty data now that inode no longer exists
435 * on disk.
436 */
437 truncate_inode_pages(&inode->i_data, 0);
429 438
430out_unlock: 439out_unlock:
431 gfs2_glock_dq(&ip->i_iopen_gh); 440 gfs2_glock_dq(&ip->i_iopen_gh);
@@ -443,14 +452,12 @@ out:
443 452
444static struct inode *gfs2_alloc_inode(struct super_block *sb) 453static struct inode *gfs2_alloc_inode(struct super_block *sb)
445{ 454{
446 struct gfs2_sbd *sdp = sb->s_fs_info;
447 struct gfs2_inode *ip; 455 struct gfs2_inode *ip;
448 456
449 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); 457 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
450 if (ip) { 458 if (ip) {
451 ip->i_flags = 0; 459 ip->i_flags = 0;
452 ip->i_gl = NULL; 460 ip->i_gl = NULL;
453 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
454 ip->i_last_pfault = jiffies; 461 ip->i_last_pfault = jiffies;
455 } 462 }
456 return &ip->i_inode; 463 return &ip->i_inode;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index 45a5f11fc39a..14b380fb0602 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -28,34 +28,13 @@
28#include "trans.h" 28#include "trans.h"
29#include "util.h" 29#include "util.h"
30 30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area, 31static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type) 32 unsigned long address, int *type)
47{ 33{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host); 34 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct page *result;
50 35
51 set_bit(GIF_PAGED, &ip->i_flags); 36 set_bit(GIF_PAGED, &ip->i_flags);
52 37 return filemap_nopage(area, address, type);
53 result = filemap_nopage(area, address, type);
54
55 if (result && result != NOPAGE_OOM)
56 pfault_be_greedy(ip);
57
58 return result;
59} 38}
60 39
61static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) 40static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -167,7 +146,6 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
167 set_page_dirty(result); 146 set_page_dirty(result);
168 } 147 }
169 148
170 pfault_be_greedy(ip);
171out: 149out:
172 gfs2_glock_dq_uninit(&i_gh); 150 gfs2_glock_dq_uninit(&i_gh);
173 151
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 43a24f2e5905..70f424fcf1cd 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -71,17 +71,12 @@ void gfs2_tune_init(struct gfs2_tune *gt)
71 gt->gt_atime_quantum = 3600; 71 gt->gt_atime_quantum = 3600;
72 gt->gt_new_files_jdata = 0; 72 gt->gt_new_files_jdata = 0;
73 gt->gt_new_files_directio = 0; 73 gt->gt_new_files_directio = 0;
74 gt->gt_max_atomic_write = 4 << 20;
75 gt->gt_max_readahead = 1 << 18; 74 gt->gt_max_readahead = 1 << 18;
76 gt->gt_lockdump_size = 131072; 75 gt->gt_lockdump_size = 131072;
77 gt->gt_stall_secs = 600; 76 gt->gt_stall_secs = 600;
78 gt->gt_complain_secs = 10; 77 gt->gt_complain_secs = 10;
79 gt->gt_reclaim_limit = 5000; 78 gt->gt_reclaim_limit = 5000;
80 gt->gt_entries_per_readdir = 32; 79 gt->gt_entries_per_readdir = 32;
81 gt->gt_prefetch_secs = 10;
82 gt->gt_greedy_default = HZ / 10;
83 gt->gt_greedy_quantum = HZ / 40;
84 gt->gt_greedy_max = HZ / 4;
85 gt->gt_statfs_quantum = 30; 80 gt->gt_statfs_quantum = 30;
86 gt->gt_statfs_slow = 0; 81 gt->gt_statfs_slow = 0;
87} 82}
@@ -359,8 +354,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
359 mutex_lock(&sdp->sd_jindex_mutex); 354 mutex_lock(&sdp->sd_jindex_mutex);
360 355
361 for (;;) { 356 for (;;) {
362 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 357 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
363 GL_LOCAL_EXCL, ji_gh);
364 if (error) 358 if (error)
365 break; 359 break;
366 360
@@ -529,8 +523,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
529 struct gfs2_log_header_host head; 523 struct gfs2_log_header_host head;
530 int error; 524 int error;
531 525
532 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 526 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
533 GL_LOCAL_EXCL, &t_gh);
534 if (error) 527 if (error)
535 return error; 528 return error;
536 529
@@ -583,9 +576,8 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
583 gfs2_quota_sync(sdp); 576 gfs2_quota_sync(sdp);
584 gfs2_statfs_sync(sdp); 577 gfs2_statfs_sync(sdp);
585 578
586 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 579 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
587 GL_LOCAL_EXCL | GL_NOCACHE, 580 &t_gh);
588 &t_gh);
589 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 581 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
590 return error; 582 return error;
591 583
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 983eaf1e06be..d01f9f0fda26 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -436,17 +436,12 @@ TUNE_ATTR(atime_quantum, 0);
436TUNE_ATTR(max_readahead, 0); 436TUNE_ATTR(max_readahead, 0);
437TUNE_ATTR(complain_secs, 0); 437TUNE_ATTR(complain_secs, 0);
438TUNE_ATTR(reclaim_limit, 0); 438TUNE_ATTR(reclaim_limit, 0);
439TUNE_ATTR(prefetch_secs, 0);
440TUNE_ATTR(statfs_slow, 0); 439TUNE_ATTR(statfs_slow, 0);
441TUNE_ATTR(new_files_jdata, 0); 440TUNE_ATTR(new_files_jdata, 0);
442TUNE_ATTR(new_files_directio, 0); 441TUNE_ATTR(new_files_directio, 0);
443TUNE_ATTR(quota_simul_sync, 1); 442TUNE_ATTR(quota_simul_sync, 1);
444TUNE_ATTR(quota_cache_secs, 1); 443TUNE_ATTR(quota_cache_secs, 1);
445TUNE_ATTR(max_atomic_write, 1);
446TUNE_ATTR(stall_secs, 1); 444TUNE_ATTR(stall_secs, 1);
447TUNE_ATTR(greedy_default, 1);
448TUNE_ATTR(greedy_quantum, 1);
449TUNE_ATTR(greedy_max, 1);
450TUNE_ATTR(statfs_quantum, 1); 445TUNE_ATTR(statfs_quantum, 1);
451TUNE_ATTR_DAEMON(scand_secs, scand_process); 446TUNE_ATTR_DAEMON(scand_secs, scand_process);
452TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 447TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -465,15 +460,10 @@ static struct attribute *tune_attrs[] = {
465 &tune_attr_max_readahead.attr, 460 &tune_attr_max_readahead.attr,
466 &tune_attr_complain_secs.attr, 461 &tune_attr_complain_secs.attr,
467 &tune_attr_reclaim_limit.attr, 462 &tune_attr_reclaim_limit.attr,
468 &tune_attr_prefetch_secs.attr,
469 &tune_attr_statfs_slow.attr, 463 &tune_attr_statfs_slow.attr,
470 &tune_attr_quota_simul_sync.attr, 464 &tune_attr_quota_simul_sync.attr,
471 &tune_attr_quota_cache_secs.attr, 465 &tune_attr_quota_cache_secs.attr,
472 &tune_attr_max_atomic_write.attr,
473 &tune_attr_stall_secs.attr, 466 &tune_attr_stall_secs.attr,
474 &tune_attr_greedy_default.attr,
475 &tune_attr_greedy_quantum.attr,
476 &tune_attr_greedy_max.attr,
477 &tune_attr_statfs_quantum.attr, 467 &tune_attr_statfs_quantum.attr,
478 &tune_attr_scand_secs.attr, 468 &tune_attr_scand_secs.attr,
479 &tune_attr_recoverd_secs.attr, 469 &tune_attr_recoverd_secs.attr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f5719117edfe..e285022f006c 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
182 * Take appropriate lock on inode 182 * Take appropriate lock on inode
183 */ 183 */
184 if (create) 184 if (create)
185 IWRITE_LOCK(ip); 185 IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
186 else 186 else
187 IREAD_LOCK(ip); 187 IREAD_LOCK(ip, RDWRLOCK_NORMAL);
188 188
189 if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && 189 if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
190 (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && 190 (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
@@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip)
359 359
360 nobh_truncate_page(ip->i_mapping, ip->i_size); 360 nobh_truncate_page(ip->i_mapping, ip->i_size);
361 361
362 IWRITE_LOCK(ip); 362 IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
363 jfs_truncate_nolock(ip, ip->i_size); 363 jfs_truncate_nolock(ip, ip->i_size);
364 IWRITE_UNLOCK(ip); 364 IWRITE_UNLOCK(ip);
365} 365}
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index ddffbbd4d955..7378798f0b21 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -39,10 +39,6 @@ extern void jfs_proc_clean(void);
39/* 39/*
40 * assert with traditional printf/panic 40 * assert with traditional printf/panic
41 */ 41 */
42#ifdef CONFIG_KERNEL_ASSERTS
43/* kgdb stuff */
44#define assert(p) KERNEL_ASSERT(#p, p)
45#else
46#define assert(p) do { \ 42#define assert(p) do { \
47 if (!(p)) { \ 43 if (!(p)) { \
48 printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ 44 printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \
@@ -50,7 +46,6 @@ extern void jfs_proc_clean(void);
50 BUG(); \ 46 BUG(); \
51 } \ 47 } \
52} while (0) 48} while (0)
53#endif
54 49
55/* 50/*
56 * debug ON 51 * debug ON
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 23546c8fd48b..82b0544bd76d 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
337 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 337 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
338 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 338 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
339 339
340 IREAD_LOCK(ipbmap); 340 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
341 341
342 /* block to be freed better be within the mapsize. */ 342 /* block to be freed better be within the mapsize. */
343 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { 343 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
@@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
733 * allocation group size, try to allocate anywhere. 733 * allocation group size, try to allocate anywhere.
734 */ 734 */
735 if (l2nb > bmp->db_agl2size) { 735 if (l2nb > bmp->db_agl2size) {
736 IWRITE_LOCK(ipbmap); 736 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
737 737
738 rc = dbAllocAny(bmp, nblocks, l2nb, results); 738 rc = dbAllocAny(bmp, nblocks, l2nb, results);
739 739
@@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
774 * the hint using a tiered strategy. 774 * the hint using a tiered strategy.
775 */ 775 */
776 if (nblocks <= BPERDMAP) { 776 if (nblocks <= BPERDMAP) {
777 IREAD_LOCK(ipbmap); 777 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
778 778
779 /* get the buffer for the dmap containing the hint. 779 /* get the buffer for the dmap containing the hint.
780 */ 780 */
@@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
844 /* try to satisfy the allocation request with blocks within 844 /* try to satisfy the allocation request with blocks within
845 * the same allocation group as the hint. 845 * the same allocation group as the hint.
846 */ 846 */
847 IWRITE_LOCK(ipbmap); 847 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
848 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) 848 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
849 goto write_unlock; 849 goto write_unlock;
850 850
@@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
856 * Let dbNextAG recommend a preferred allocation group 856 * Let dbNextAG recommend a preferred allocation group
857 */ 857 */
858 agno = dbNextAG(ipbmap); 858 agno = dbNextAG(ipbmap);
859 IWRITE_LOCK(ipbmap); 859 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
860 860
861 /* Try to allocate within this allocation group. if that fails, try to 861 /* Try to allocate within this allocation group. if that fails, try to
862 * allocate anywhere in the map. 862 * allocate anywhere in the map.
@@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
900 s64 lblkno; 900 s64 lblkno;
901 struct metapage *mp; 901 struct metapage *mp;
902 902
903 IREAD_LOCK(ipbmap); 903 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
904 904
905 /* 905 /*
906 * validate extent request: 906 * validate extent request:
@@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1050 */ 1050 */
1051 extblkno = lastblkno + 1; 1051 extblkno = lastblkno + 1;
1052 1052
1053 IREAD_LOCK(ipbmap); 1053 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
1054 1054
1055 /* better be within the file system */ 1055 /* better be within the file system */
1056 bmp = sbi->bmap; 1056 bmp = sbi->bmap;
@@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3116 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 3116 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
3117 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 3117 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
3118 3118
3119 IREAD_LOCK(ipbmap); 3119 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
3120 3120
3121 /* block to be allocated better be within the mapsize. */ 3121 /* block to be allocated better be within the mapsize. */
3122 ASSERT(nblocks <= bmp->db_mapsize - blkno); 3122 ASSERT(nblocks <= bmp->db_mapsize - blkno);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 53f63b47a6d3..aa5124b643b1 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -331,7 +331,7 @@ int diRead(struct inode *ip)
331 331
332 /* read the iag */ 332 /* read the iag */
333 imap = JFS_IP(ipimap)->i_imap; 333 imap = JFS_IP(ipimap)->i_imap;
334 IREAD_LOCK(ipimap); 334 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
335 rc = diIAGRead(imap, iagno, &mp); 335 rc = diIAGRead(imap, iagno, &mp);
336 IREAD_UNLOCK(ipimap); 336 IREAD_UNLOCK(ipimap);
337 if (rc) { 337 if (rc) {
@@ -920,7 +920,7 @@ int diFree(struct inode *ip)
920 /* Obtain read lock in imap inode. Don't release it until we have 920 /* Obtain read lock in imap inode. Don't release it until we have
921 * read all of the IAG's that we are going to. 921 * read all of the IAG's that we are going to.
922 */ 922 */
923 IREAD_LOCK(ipimap); 923 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
924 924
925 /* read the iag. 925 /* read the iag.
926 */ 926 */
@@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1415 AG_LOCK(imap, agno); 1415 AG_LOCK(imap, agno);
1416 1416
1417 /* Get read lock on imap inode */ 1417 /* Get read lock on imap inode */
1418 IREAD_LOCK(ipimap); 1418 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
1419 1419
1420 /* get the iag number and read the iag */ 1420 /* get the iag number and read the iag */
1421 iagno = INOTOIAG(inum); 1421 iagno = INOTOIAG(inum);
@@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1808 return -ENOSPC; 1808 return -ENOSPC;
1809 1809
1810 /* obtain read lock on imap inode */ 1810 /* obtain read lock on imap inode */
1811 IREAD_LOCK(imap->im_ipimap); 1811 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1812 1812
1813 /* read the iag at the head of the list. 1813 /* read the iag at the head of the list.
1814 */ 1814 */
@@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1946 } else { 1946 } else {
1947 /* read the iag. 1947 /* read the iag.
1948 */ 1948 */
1949 IREAD_LOCK(imap->im_ipimap); 1949 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1950 if ((rc = diIAGRead(imap, iagno, &mp))) { 1950 if ((rc = diIAGRead(imap, iagno, &mp))) {
1951 IREAD_UNLOCK(imap->im_ipimap); 1951 IREAD_UNLOCK(imap->im_ipimap);
1952 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1952 jfs_error(ip->i_sb, "diAllocExt: error reading iag");
@@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2509 */ 2509 */
2510 2510
2511 /* acquire inode map lock */ 2511 /* acquire inode map lock */
2512 IWRITE_LOCK(ipimap); 2512 IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
2513 2513
2514 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { 2514 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
2515 IWRITE_UNLOCK(ipimap); 2515 IWRITE_UNLOCK(ipimap);
@@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2648 } 2648 }
2649 2649
2650 /* obtain read lock on map */ 2650 /* obtain read lock on map */
2651 IREAD_LOCK(ipimap); 2651 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2652 2652
2653 /* read the iag */ 2653 /* read the iag */
2654 if ((rc = diIAGRead(imap, iagno, &mp))) { 2654 if ((rc = diIAGRead(imap, iagno, &mp))) {
@@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap,
2779 return -EIO; 2779 return -EIO;
2780 } 2780 }
2781 /* read the iag */ 2781 /* read the iag */
2782 IREAD_LOCK(ipimap); 2782 IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
2783 rc = diIAGRead(imap, iagno, &mp); 2783 rc = diIAGRead(imap, iagno, &mp);
2784 IREAD_UNLOCK(ipimap); 2784 IREAD_UNLOCK(ipimap);
2785 if (rc) 2785 if (rc)
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 94005584445a..8f453eff3c83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -109,9 +109,11 @@ struct jfs_inode_info {
109 109
110#define JFS_ACL_NOT_CACHED ((void *)-1) 110#define JFS_ACL_NOT_CACHED ((void *)-1)
111 111
112#define IREAD_LOCK(ip) down_read(&JFS_IP(ip)->rdwrlock) 112#define IREAD_LOCK(ip, subclass) \
113 down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
113#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) 114#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock)
114#define IWRITE_LOCK(ip) down_write(&JFS_IP(ip)->rdwrlock) 115#define IWRITE_LOCK(ip, subclass) \
116 down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
115#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) 117#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock)
116 118
117/* 119/*
@@ -127,6 +129,29 @@ enum cflags {
127 COMMIT_Synclist, /* metadata pages on group commit synclist */ 129 COMMIT_Synclist, /* metadata pages on group commit synclist */
128}; 130};
129 131
132/*
133 * commit_mutex nesting subclasses:
134 */
135enum commit_mutex_class
136{
137 COMMIT_MUTEX_PARENT,
138 COMMIT_MUTEX_CHILD,
139 COMMIT_MUTEX_SECOND_PARENT, /* Renaming */
140 COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */
141};
142
143/*
144 * rdwrlock subclasses:
145 * The dmap inode may be locked while a normal inode or the imap inode are
146 * locked.
147 */
148enum rdwrlock_class
149{
150 RDWRLOCK_NORMAL,
151 RDWRLOCK_IMAP,
152 RDWRLOCK_DMAP
153};
154
130#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) 155#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag))
131#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) 156#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag))
132#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) 157#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag))
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index 7d78e83d7c40..df48ece4b7a3 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -42,7 +42,7 @@ do { \
42 if (cond) \ 42 if (cond) \
43 break; \ 43 break; \
44 unlock_cmd; \ 44 unlock_cmd; \
45 schedule(); \ 45 io_schedule(); \
46 lock_cmd; \ 46 lock_cmd; \
47 } \ 47 } \
48 current->state = TASK_RUNNING; \ 48 current->state = TASK_RUNNING; \
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index ceaf03b94935..58deae007507 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp)
56 set_current_state(TASK_UNINTERRUPTIBLE); 56 set_current_state(TASK_UNINTERRUPTIBLE);
57 if (metapage_locked(mp)) { 57 if (metapage_locked(mp)) {
58 unlock_page(mp->page); 58 unlock_page(mp->page);
59 schedule(); 59 io_schedule();
60 lock_page(mp->page); 60 lock_page(mp->page);
61 } 61 }
62 } while (trylock_metapage(mp)); 62 } while (trylock_metapage(mp));
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d558e51b0df8..6988a1082f58 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
135 add_wait_queue(event, &wait); 135 add_wait_queue(event, &wait);
136 set_current_state(TASK_UNINTERRUPTIBLE); 136 set_current_state(TASK_UNINTERRUPTIBLE);
137 TXN_UNLOCK(); 137 TXN_UNLOCK();
138 schedule(); 138 io_schedule();
139 current->state = TASK_RUNNING; 139 current->state = TASK_RUNNING;
140 remove_wait_queue(event, &wait); 140 remove_wait_queue(event, &wait);
141} 141}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index e98eb03e5310..acc97c46d8a4 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
757 nsplit = 0; 757 nsplit = 0;
758 758
759 /* push (bn, index) of the parent page/entry */ 759 /* push (bn, index) of the parent page/entry */
760 if (BT_STACK_FULL(btstack)) {
761 jfs_error(ip->i_sb, "stack overrun in xtSearch!");
762 XT_PUTPAGE(mp);
763 return -EIO;
764 }
760 BT_PUSH(btstack, bn, index); 765 BT_PUSH(btstack, bn, index);
761 766
762 /* get the child page block number */ 767 /* get the child page block number */
@@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3915 */ 3920 */
3916 getChild: 3921 getChild:
3917 /* save current parent entry for the child page */ 3922 /* save current parent entry for the child page */
3923 if (BT_STACK_FULL(&btstack)) {
3924 jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
3925 XT_PUTPAGE(mp);
3926 return -EIO;
3927 }
3918 BT_PUSH(&btstack, bn, index); 3928 BT_PUSH(&btstack, bn, index);
3919 3929
3920 /* get child page */ 3930 /* get child page */
@@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4112 */ 4122 */
4113 getChild: 4123 getChild:
4114 /* save current parent entry for the child page */ 4124 /* save current parent entry for the child page */
4125 if (BT_STACK_FULL(&btstack)) {
4126 jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
4127 XT_PUTPAGE(mp);
4128 return -EIO;
4129 }
4115 BT_PUSH(&btstack, bn, index); 4130 BT_PUSH(&btstack, bn, index);
4116 4131
4117 /* get child page */ 4132 /* get child page */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a6a8c16c872c..7ab47561b68d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
104 104
105 tid = txBegin(dip->i_sb, 0); 105 tid = txBegin(dip->i_sb, 0);
106 106
107 mutex_lock(&JFS_IP(dip)->commit_mutex); 107 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
108 mutex_lock(&JFS_IP(ip)->commit_mutex); 108 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
109 109
110 rc = jfs_init_acl(tid, ip, dip); 110 rc = jfs_init_acl(tid, ip, dip);
111 if (rc) 111 if (rc)
@@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
238 238
239 tid = txBegin(dip->i_sb, 0); 239 tid = txBegin(dip->i_sb, 0);
240 240
241 mutex_lock(&JFS_IP(dip)->commit_mutex); 241 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
242 mutex_lock(&JFS_IP(ip)->commit_mutex); 242 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
243 243
244 rc = jfs_init_acl(tid, ip, dip); 244 rc = jfs_init_acl(tid, ip, dip);
245 if (rc) 245 if (rc)
@@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
365 365
366 tid = txBegin(dip->i_sb, 0); 366 tid = txBegin(dip->i_sb, 0);
367 367
368 mutex_lock(&JFS_IP(dip)->commit_mutex); 368 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
369 mutex_lock(&JFS_IP(ip)->commit_mutex); 369 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
370 370
371 iplist[0] = dip; 371 iplist[0] = dip;
372 iplist[1] = ip; 372 iplist[1] = ip;
@@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 if ((rc = get_UCSname(&dname, dentry))) 483 if ((rc = get_UCSname(&dname, dentry)))
484 goto out; 484 goto out;
485 485
486 IWRITE_LOCK(ip); 486 IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
487 487
488 tid = txBegin(dip->i_sb, 0); 488 tid = txBegin(dip->i_sb, 0);
489 489
490 mutex_lock(&JFS_IP(dip)->commit_mutex); 490 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
491 mutex_lock(&JFS_IP(ip)->commit_mutex); 491 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
492 492
493 iplist[0] = dip; 493 iplist[0] = dip;
494 iplist[1] = ip; 494 iplist[1] = ip;
@@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry,
802 802
803 tid = txBegin(ip->i_sb, 0); 803 tid = txBegin(ip->i_sb, 0);
804 804
805 mutex_lock(&JFS_IP(dir)->commit_mutex); 805 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
806 mutex_lock(&JFS_IP(ip)->commit_mutex); 806 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
807 807
808 /* 808 /*
809 * scan parent directory for entry/freespace 809 * scan parent directory for entry/freespace
@@ -913,8 +913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
913 913
914 tid = txBegin(dip->i_sb, 0); 914 tid = txBegin(dip->i_sb, 0);
915 915
916 mutex_lock(&JFS_IP(dip)->commit_mutex); 916 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
917 mutex_lock(&JFS_IP(ip)->commit_mutex); 917 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
918 918
919 rc = jfs_init_security(tid, ip, dip); 919 rc = jfs_init_security(tid, ip, dip);
920 if (rc) 920 if (rc)
@@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1127 goto out3; 1127 goto out3;
1128 } 1128 }
1129 } else if (new_ip) { 1129 } else if (new_ip) {
1130 IWRITE_LOCK(new_ip); 1130 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1131 /* Init inode for quota operations. */ 1131 /* Init inode for quota operations. */
1132 DQUOT_INIT(new_ip); 1132 DQUOT_INIT(new_ip);
1133 } 1133 }
@@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1137 */ 1137 */
1138 tid = txBegin(new_dir->i_sb, 0); 1138 tid = txBegin(new_dir->i_sb, 0);
1139 1139
1140 mutex_lock(&JFS_IP(new_dir)->commit_mutex); 1140 /*
1141 mutex_lock(&JFS_IP(old_ip)->commit_mutex); 1141 * How do we know the locking is safe from deadlocks?
1142 * The vfs does the hard part for us. Any time we are taking nested
1143 * commit_mutexes, the vfs already has i_mutex held on the parent.
1144 * Here, the vfs has already taken i_mutex on both old_dir and new_dir.
1145 */
1146 mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT);
1147 mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD);
1142 if (old_dir != new_dir) 1148 if (old_dir != new_dir)
1143 mutex_lock(&JFS_IP(old_dir)->commit_mutex); 1149 mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex,
1150 COMMIT_MUTEX_SECOND_PARENT);
1144 1151
1145 if (new_ip) { 1152 if (new_ip) {
1146 mutex_lock(&JFS_IP(new_ip)->commit_mutex); 1153 mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex,
1154 COMMIT_MUTEX_VICTIM);
1147 /* 1155 /*
1148 * Change existing directory entry to new inode number 1156 * Change existing directory entry to new inode number
1149 */ 1157 */
@@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1357 1365
1358 tid = txBegin(dir->i_sb, 0); 1366 tid = txBegin(dir->i_sb, 0);
1359 1367
1360 mutex_lock(&JFS_IP(dir)->commit_mutex); 1368 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
1361 mutex_lock(&JFS_IP(ip)->commit_mutex); 1369 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
1362 1370
1363 rc = jfs_init_acl(tid, ip, dir); 1371 rc = jfs_init_acl(tid, ip, dir);
1364 if (rc) 1372 if (rc)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 277ca67a2ad6..5a9779bb9236 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
184 flush_scheduled_work(); 184 flush_scheduled_work();
185} 185}
186 186
187static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, 187static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
188 unsigned int num_ios)
189{ 188{
190 atomic_set(&wc->wc_num_reqs, num_ios); 189 atomic_set(&wc->wc_num_reqs, 1);
191 init_completion(&wc->wc_io_complete); 190 init_completion(&wc->wc_io_complete);
192 wc->wc_error = 0; 191 wc->wc_error = 0;
193} 192}
@@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
212 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; 211 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
213 212
214 blk_run_address_space(mapping); 213 blk_run_address_space(mapping);
214 o2hb_bio_wait_dec(wc, 1);
215 215
216 wait_for_completion(&wc->wc_io_complete); 216 wait_for_completion(&wc->wc_io_complete);
217} 217}
@@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio,
231 return 1; 231 return 1;
232 232
233 o2hb_bio_wait_dec(wc, 1); 233 o2hb_bio_wait_dec(wc, 1);
234 bio_put(bio);
234 return 0; 235 return 0;
235} 236}
236 237
@@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio,
238 * start_slot. */ 239 * start_slot. */
239static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, 240static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
240 struct o2hb_bio_wait_ctxt *wc, 241 struct o2hb_bio_wait_ctxt *wc,
241 unsigned int start_slot, 242 unsigned int *current_slot,
242 unsigned int num_slots) 243 unsigned int max_slots)
243{ 244{
244 int i, nr_vecs, len, first_page, last_page; 245 int len, current_page;
245 unsigned int vec_len, vec_start; 246 unsigned int vec_len, vec_start;
246 unsigned int bits = reg->hr_block_bits; 247 unsigned int bits = reg->hr_block_bits;
247 unsigned int spp = reg->hr_slots_per_page; 248 unsigned int spp = reg->hr_slots_per_page;
249 unsigned int cs = *current_slot;
248 struct bio *bio; 250 struct bio *bio;
249 struct page *page; 251 struct page *page;
250 252
251 nr_vecs = (num_slots + spp - 1) / spp;
252
253 /* Testing has shown this allocation to take long enough under 253 /* Testing has shown this allocation to take long enough under
254 * GFP_KERNEL that the local node can get fenced. It would be 254 * GFP_KERNEL that the local node can get fenced. It would be
255 * nicest if we could pre-allocate these bios and avoid this 255 * nicest if we could pre-allocate these bios and avoid this
256 * all together. */ 256 * all together. */
257 bio = bio_alloc(GFP_ATOMIC, nr_vecs); 257 bio = bio_alloc(GFP_ATOMIC, 16);
258 if (!bio) { 258 if (!bio) {
259 mlog(ML_ERROR, "Could not alloc slots BIO!\n"); 259 mlog(ML_ERROR, "Could not alloc slots BIO!\n");
260 bio = ERR_PTR(-ENOMEM); 260 bio = ERR_PTR(-ENOMEM);
@@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
262 } 262 }
263 263
264 /* Must put everything in 512 byte sectors for the bio... */ 264 /* Must put everything in 512 byte sectors for the bio... */
265 bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); 265 bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
266 bio->bi_bdev = reg->hr_bdev; 266 bio->bi_bdev = reg->hr_bdev;
267 bio->bi_private = wc; 267 bio->bi_private = wc;
268 bio->bi_end_io = o2hb_bio_end_io; 268 bio->bi_end_io = o2hb_bio_end_io;
269 269
270 first_page = start_slot / spp; 270 vec_start = (cs << bits) % PAGE_CACHE_SIZE;
271 last_page = first_page + nr_vecs; 271 while(cs < max_slots) {
272 vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; 272 current_page = cs / spp;
273 for(i = first_page; i < last_page; i++) { 273 page = reg->hr_slot_data[current_page];
274 page = reg->hr_slot_data[i];
275 274
276 vec_len = PAGE_CACHE_SIZE; 275 vec_len = min(PAGE_CACHE_SIZE,
277 /* last page might be short */ 276 (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
278 if (((i + 1) * spp) > (start_slot + num_slots))
279 vec_len = ((num_slots + start_slot) % spp) << bits;
280 vec_len -= vec_start;
281 277
282 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", 278 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
283 i, vec_len, vec_start); 279 current_page, vec_len, vec_start);
284 280
285 len = bio_add_page(bio, page, vec_len, vec_start); 281 len = bio_add_page(bio, page, vec_len, vec_start);
286 if (len != vec_len) { 282 if (len != vec_len) break;
287 bio_put(bio);
288 bio = ERR_PTR(-EIO);
289
290 mlog(ML_ERROR, "Error adding page to bio i = %d, "
291 "vec_len = %u, len = %d\n, start = %u\n",
292 i, vec_len, len, vec_start);
293 goto bail;
294 }
295 283
284 cs += vec_len / (PAGE_CACHE_SIZE/spp);
296 vec_start = 0; 285 vec_start = 0;
297 } 286 }
298 287
299bail: 288bail:
289 *current_slot = cs;
300 return bio; 290 return bio;
301} 291}
302 292
303/*
304 * Compute the maximum number of sectors the bdev can handle in one bio,
305 * as a power of two.
306 *
307 * Stolen from oracleasm, thanks Joel!
308 */
309static int compute_max_sectors(struct block_device *bdev)
310{
311 int max_pages, max_sectors, pow_two_sectors;
312
313 struct request_queue *q;
314
315 q = bdev_get_queue(bdev);
316 max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
317 if (max_pages > BIO_MAX_PAGES)
318 max_pages = BIO_MAX_PAGES;
319 if (max_pages > q->max_phys_segments)
320 max_pages = q->max_phys_segments;
321 if (max_pages > q->max_hw_segments)
322 max_pages = q->max_hw_segments;
323 max_pages--; /* Handle I/Os that straddle a page */
324
325 if (max_pages) {
326 max_sectors = max_pages << (PAGE_SHIFT - 9);
327 } else {
328 /* If BIO contains 1 or less than 1 page. */
329 max_sectors = q->max_sectors;
330 }
331 /* Why is fls() 1-based???? */
332 pow_two_sectors = 1 << (fls(max_sectors) - 1);
333
334 return pow_two_sectors;
335}
336
337static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
338 unsigned int num_slots,
339 unsigned int *num_bios,
340 unsigned int *slots_per_bio)
341{
342 unsigned int max_sectors, io_sectors;
343
344 max_sectors = compute_max_sectors(reg->hr_bdev);
345
346 io_sectors = num_slots << (reg->hr_block_bits - 9);
347
348 *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
349 *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
350
351 mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
352 "device can handle %u sectors of I/O\n", io_sectors, num_slots,
353 max_sectors);
354 mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
355 *num_bios, *slots_per_bio);
356}
357
358static int o2hb_read_slots(struct o2hb_region *reg, 293static int o2hb_read_slots(struct o2hb_region *reg,
359 unsigned int max_slots) 294 unsigned int max_slots)
360{ 295{
361 unsigned int num_bios, slots_per_bio, start_slot, num_slots; 296 unsigned int current_slot=0;
362 int i, status; 297 int status;
363 struct o2hb_bio_wait_ctxt wc; 298 struct o2hb_bio_wait_ctxt wc;
364 struct bio **bios;
365 struct bio *bio; 299 struct bio *bio;
366 300
367 o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); 301 o2hb_bio_wait_init(&wc);
368 302
369 bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); 303 while(current_slot < max_slots) {
370 if (!bios) { 304 bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
371 status = -ENOMEM;
372 mlog_errno(status);
373 return status;
374 }
375
376 o2hb_bio_wait_init(&wc, num_bios);
377
378 num_slots = slots_per_bio;
379 for(i = 0; i < num_bios; i++) {
380 start_slot = i * slots_per_bio;
381
382 /* adjust num_slots at last bio */
383 if (max_slots < (start_slot + num_slots))
384 num_slots = max_slots - start_slot;
385
386 bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
387 if (IS_ERR(bio)) { 305 if (IS_ERR(bio)) {
388 o2hb_bio_wait_dec(&wc, num_bios - i);
389
390 status = PTR_ERR(bio); 306 status = PTR_ERR(bio);
391 mlog_errno(status); 307 mlog_errno(status);
392 goto bail_and_wait; 308 goto bail_and_wait;
393 } 309 }
394 bios[i] = bio;
395 310
311 atomic_inc(&wc.wc_num_reqs);
396 submit_bio(READ, bio); 312 submit_bio(READ, bio);
397 } 313 }
398 314
@@ -403,38 +319,30 @@ bail_and_wait:
403 if (wc.wc_error && !status) 319 if (wc.wc_error && !status)
404 status = wc.wc_error; 320 status = wc.wc_error;
405 321
406 if (bios) {
407 for(i = 0; i < num_bios; i++)
408 if (bios[i])
409 bio_put(bios[i]);
410 kfree(bios);
411 }
412
413 return status; 322 return status;
414} 323}
415 324
416static int o2hb_issue_node_write(struct o2hb_region *reg, 325static int o2hb_issue_node_write(struct o2hb_region *reg,
417 struct bio **write_bio,
418 struct o2hb_bio_wait_ctxt *write_wc) 326 struct o2hb_bio_wait_ctxt *write_wc)
419{ 327{
420 int status; 328 int status;
421 unsigned int slot; 329 unsigned int slot;
422 struct bio *bio; 330 struct bio *bio;
423 331
424 o2hb_bio_wait_init(write_wc, 1); 332 o2hb_bio_wait_init(write_wc);
425 333
426 slot = o2nm_this_node(); 334 slot = o2nm_this_node();
427 335
428 bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); 336 bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
429 if (IS_ERR(bio)) { 337 if (IS_ERR(bio)) {
430 status = PTR_ERR(bio); 338 status = PTR_ERR(bio);
431 mlog_errno(status); 339 mlog_errno(status);
432 goto bail; 340 goto bail;
433 } 341 }
434 342
343 atomic_inc(&write_wc->wc_num_reqs);
435 submit_bio(WRITE, bio); 344 submit_bio(WRITE, bio);
436 345
437 *write_bio = bio;
438 status = 0; 346 status = 0;
439bail: 347bail:
440 return status; 348 return status;
@@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
826{ 734{
827 int i, ret, highest_node, change = 0; 735 int i, ret, highest_node, change = 0;
828 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 736 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
829 struct bio *write_bio;
830 struct o2hb_bio_wait_ctxt write_wc; 737 struct o2hb_bio_wait_ctxt write_wc;
831 738
832 ret = o2nm_configured_node_map(configured_nodes, 739 ret = o2nm_configured_node_map(configured_nodes,
@@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
864 771
865 /* And fire off the write. Note that we don't wait on this I/O 772 /* And fire off the write. Note that we don't wait on this I/O
866 * until later. */ 773 * until later. */
867 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 774 ret = o2hb_issue_node_write(reg, &write_wc);
868 if (ret < 0) { 775 if (ret < 0) {
869 mlog_errno(ret); 776 mlog_errno(ret);
870 return ret; 777 return ret;
@@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
882 * people we find in our steady state have seen us. 789 * people we find in our steady state have seen us.
883 */ 790 */
884 o2hb_wait_on_io(reg, &write_wc); 791 o2hb_wait_on_io(reg, &write_wc);
885 bio_put(write_bio);
886 if (write_wc.wc_error) { 792 if (write_wc.wc_error) {
887 /* Do not re-arm the write timeout on I/O error - we 793 /* Do not re-arm the write timeout on I/O error - we
888 * can't be sure that the new block ever made it to 794 * can't be sure that the new block ever made it to
@@ -943,7 +849,6 @@ static int o2hb_thread(void *data)
943{ 849{
944 int i, ret; 850 int i, ret;
945 struct o2hb_region *reg = data; 851 struct o2hb_region *reg = data;
946 struct bio *write_bio;
947 struct o2hb_bio_wait_ctxt write_wc; 852 struct o2hb_bio_wait_ctxt write_wc;
948 struct timeval before_hb, after_hb; 853 struct timeval before_hb, after_hb;
949 unsigned int elapsed_msec; 854 unsigned int elapsed_msec;
@@ -993,10 +898,9 @@ static int o2hb_thread(void *data)
993 * 898 *
994 * XXX: Should we skip this on unclean_stop? */ 899 * XXX: Should we skip this on unclean_stop? */
995 o2hb_prepare_block(reg, 0); 900 o2hb_prepare_block(reg, 0);
996 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 901 ret = o2hb_issue_node_write(reg, &write_wc);
997 if (ret == 0) { 902 if (ret == 0) {
998 o2hb_wait_on_io(reg, &write_wc); 903 o2hb_wait_on_io(reg, &write_wc);
999 bio_put(write_bio);
1000 } else { 904 } else {
1001 mlog_errno(ret); 905 mlog_errno(ret);
1002 } 906 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ae4ff4a6636b..1718215fc018 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -556,6 +556,8 @@ static void o2net_register_callbacks(struct sock *sk,
556 sk->sk_data_ready = o2net_data_ready; 556 sk->sk_data_ready = o2net_data_ready;
557 sk->sk_state_change = o2net_state_change; 557 sk->sk_state_change = o2net_state_change;
558 558
559 mutex_init(&sc->sc_send_lock);
560
559 write_unlock_bh(&sk->sk_callback_lock); 561 write_unlock_bh(&sk->sk_callback_lock);
560} 562}
561 563
@@ -688,6 +690,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh)
688 * be given to the handler if their payload is longer than the max. */ 690 * be given to the handler if their payload is longer than the max. */
689int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 691int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
690 o2net_msg_handler_func *func, void *data, 692 o2net_msg_handler_func *func, void *data,
693 o2net_post_msg_handler_func *post_func,
691 struct list_head *unreg_list) 694 struct list_head *unreg_list)
692{ 695{
693 struct o2net_msg_handler *nmh = NULL; 696 struct o2net_msg_handler *nmh = NULL;
@@ -722,6 +725,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
722 725
723 nmh->nh_func = func; 726 nmh->nh_func = func;
724 nmh->nh_func_data = data; 727 nmh->nh_func_data = data;
728 nmh->nh_post_func = post_func;
725 nmh->nh_msg_type = msg_type; 729 nmh->nh_msg_type = msg_type;
726 nmh->nh_max_len = max_len; 730 nmh->nh_max_len = max_len;
727 nmh->nh_key = key; 731 nmh->nh_key = key;
@@ -856,10 +860,12 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
856 ssize_t ret; 860 ssize_t ret;
857 861
858 862
863 mutex_lock(&sc->sc_send_lock);
859 ret = sc->sc_sock->ops->sendpage(sc->sc_sock, 864 ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
860 virt_to_page(kmalloced_virt), 865 virt_to_page(kmalloced_virt),
861 (long)kmalloced_virt & ~PAGE_MASK, 866 (long)kmalloced_virt & ~PAGE_MASK,
862 size, MSG_DONTWAIT); 867 size, MSG_DONTWAIT);
868 mutex_unlock(&sc->sc_send_lock);
863 if (ret != size) { 869 if (ret != size) {
864 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 870 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
865 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 871 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
@@ -974,8 +980,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
974 980
975 /* finally, convert the message header to network byte-order 981 /* finally, convert the message header to network byte-order
976 * and send */ 982 * and send */
983 mutex_lock(&sc->sc_send_lock);
977 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, 984 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
978 sizeof(struct o2net_msg) + caller_bytes); 985 sizeof(struct o2net_msg) + caller_bytes);
986 mutex_unlock(&sc->sc_send_lock);
979 msglog(msg, "sending returned %d\n", ret); 987 msglog(msg, "sending returned %d\n", ret);
980 if (ret < 0) { 988 if (ret < 0) {
981 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); 989 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
@@ -1049,6 +1057,7 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1049 int ret = 0, handler_status; 1057 int ret = 0, handler_status;
1050 enum o2net_system_error syserr; 1058 enum o2net_system_error syserr;
1051 struct o2net_msg_handler *nmh = NULL; 1059 struct o2net_msg_handler *nmh = NULL;
1060 void *ret_data = NULL;
1052 1061
1053 msglog(hdr, "processing message\n"); 1062 msglog(hdr, "processing message\n");
1054 1063
@@ -1101,17 +1110,26 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1101 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1110 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1102 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1111 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1103 be16_to_cpu(hdr->data_len), 1112 be16_to_cpu(hdr->data_len),
1104 nmh->nh_func_data); 1113 nmh->nh_func_data, &ret_data);
1105 do_gettimeofday(&sc->sc_tv_func_stop); 1114 do_gettimeofday(&sc->sc_tv_func_stop);
1106 1115
1107out_respond: 1116out_respond:
1108 /* this destroys the hdr, so don't use it after this */ 1117 /* this destroys the hdr, so don't use it after this */
1118 mutex_lock(&sc->sc_send_lock);
1109 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, 1119 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
1110 handler_status); 1120 handler_status);
1121 mutex_unlock(&sc->sc_send_lock);
1111 hdr = NULL; 1122 hdr = NULL;
1112 mlog(0, "sending handler status %d, syserr %d returned %d\n", 1123 mlog(0, "sending handler status %d, syserr %d returned %d\n",
1113 handler_status, syserr, ret); 1124 handler_status, syserr, ret);
1114 1125
1126 if (nmh) {
1127 BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
1128 if (nmh->nh_post_func)
1129 (nmh->nh_post_func)(handler_status, nmh->nh_func_data,
1130 ret_data);
1131 }
1132
1115out: 1133out:
1116 if (nmh) 1134 if (nmh)
1117 o2net_handler_put(nmh); 1135 o2net_handler_put(nmh);
@@ -1795,13 +1813,13 @@ out:
1795 ready(sk, bytes); 1813 ready(sk, bytes);
1796} 1814}
1797 1815
1798static int o2net_open_listening_sock(__be16 port) 1816static int o2net_open_listening_sock(__be32 addr, __be16 port)
1799{ 1817{
1800 struct socket *sock = NULL; 1818 struct socket *sock = NULL;
1801 int ret; 1819 int ret;
1802 struct sockaddr_in sin = { 1820 struct sockaddr_in sin = {
1803 .sin_family = PF_INET, 1821 .sin_family = PF_INET,
1804 .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, 1822 .sin_addr = { .s_addr = (__force u32)addr },
1805 .sin_port = (__force u16)port, 1823 .sin_port = (__force u16)port,
1806 }; 1824 };
1807 1825
@@ -1824,15 +1842,15 @@ static int o2net_open_listening_sock(__be16 port)
1824 sock->sk->sk_reuse = 1; 1842 sock->sk->sk_reuse = 1;
1825 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 1843 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
1826 if (ret < 0) { 1844 if (ret < 0) {
1827 mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", 1845 mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, "
1828 ntohs(port), ret); 1846 "ret=%d\n", NIPQUAD(addr), ntohs(port), ret);
1829 goto out; 1847 goto out;
1830 } 1848 }
1831 1849
1832 ret = sock->ops->listen(sock, 64); 1850 ret = sock->ops->listen(sock, 64);
1833 if (ret < 0) { 1851 if (ret < 0) {
1834 mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", 1852 mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n",
1835 ntohs(port), ret); 1853 NIPQUAD(addr), ntohs(port), ret);
1836 } 1854 }
1837 1855
1838out: 1856out:
@@ -1865,7 +1883,8 @@ int o2net_start_listening(struct o2nm_node *node)
1865 return -ENOMEM; /* ? */ 1883 return -ENOMEM; /* ? */
1866 } 1884 }
1867 1885
1868 ret = o2net_open_listening_sock(node->nd_ipv4_port); 1886 ret = o2net_open_listening_sock(node->nd_ipv4_address,
1887 node->nd_ipv4_port);
1869 if (ret) { 1888 if (ret) {
1870 destroy_workqueue(o2net_wq); 1889 destroy_workqueue(o2net_wq);
1871 o2net_wq = NULL; 1890 o2net_wq = NULL;
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 21a4e43df836..da880fc215f0 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -50,7 +50,10 @@ struct o2net_msg
50 __u8 buf[0]; 50 __u8 buf[0];
51}; 51};
52 52
53typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); 53typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
54 void **ret_data);
55typedef void (o2net_post_msg_handler_func)(int status, void *data,
56 void *ret_data);
54 57
55#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) 58#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
56 59
@@ -99,6 +102,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
99 102
100int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 103int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
101 o2net_msg_handler_func *func, void *data, 104 o2net_msg_handler_func *func, void *data,
105 o2net_post_msg_handler_func *post_func,
102 struct list_head *unreg_list); 106 struct list_head *unreg_list);
103void o2net_unregister_handler_list(struct list_head *list); 107void o2net_unregister_handler_list(struct list_head *list);
104 108
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b700dc9624d1..4dae5df5e467 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 7:
42 * - DLM join domain includes the live nodemap
43 *
44 * New in version 6:
45 * - DLM lockres remote refcount fixes.
46 *
41 * New in version 5: 47 * New in version 5:
42 * - Network timeout checking protocol 48 * - Network timeout checking protocol
43 * 49 *
@@ -51,7 +57,7 @@
51 * - full 64 bit i_size in the metadata lock lvbs 57 * - full 64 bit i_size in the metadata lock lvbs
52 * - introduction of "rw" lock and pushing meta/data locking down 58 * - introduction of "rw" lock and pushing meta/data locking down
53 */ 59 */
54#define O2NET_PROTOCOL_VERSION 5ULL 60#define O2NET_PROTOCOL_VERSION 7ULL
55struct o2net_handshake { 61struct o2net_handshake {
56 __be64 protocol_version; 62 __be64 protocol_version;
57 __be64 connector_id; 63 __be64 connector_id;
@@ -149,6 +155,8 @@ struct o2net_sock_container {
149 struct timeval sc_tv_func_stop; 155 struct timeval sc_tv_func_stop;
150 u32 sc_msg_key; 156 u32 sc_msg_key;
151 u16 sc_msg_type; 157 u16 sc_msg_type;
158
159 struct mutex sc_send_lock;
152}; 160};
153 161
154struct o2net_msg_handler { 162struct o2net_msg_handler {
@@ -158,6 +166,8 @@ struct o2net_msg_handler {
158 u32 nh_key; 166 u32 nh_key;
159 o2net_msg_handler_func *nh_func; 167 o2net_msg_handler_func *nh_func;
160 o2net_msg_handler_func *nh_func_data; 168 o2net_msg_handler_func *nh_func_data;
169 o2net_post_msg_handler_func
170 *nh_post_func;
161 struct kref nh_kref; 171 struct kref nh_kref;
162 struct list_head nh_unregister_item; 172 struct list_head nh_unregister_item;
163}; 173};
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 681046d51393..241cad342a48 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -263,7 +263,8 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
263 263
264 264
265 265
266int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) 266int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
267 void **ret_data)
267{ 268{
268 int ret; 269 int ret;
269 unsigned int locklen; 270 unsigned int locklen;
@@ -311,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
311 past->type != DLM_BAST) { 312 past->type != DLM_BAST) {
312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 313 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
313 "name=%.*s\n", past->type, 314 "name=%.*s\n", past->type,
314 dlm_get_lock_cookie_node(cookie), 315 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
315 dlm_get_lock_cookie_seq(cookie), 316 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
316 locklen, name); 317 locklen, name);
317 ret = DLM_IVLOCKID; 318 ret = DLM_IVLOCKID;
318 goto leave; 319 goto leave;
@@ -323,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
323 mlog(0, "got %sast for unknown lockres! " 324 mlog(0, "got %sast for unknown lockres! "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 325 "cookie=%u:%llu, name=%.*s, namelen=%u\n",
325 past->type == DLM_AST ? "" : "b", 326 past->type == DLM_AST ? "" : "b",
326 dlm_get_lock_cookie_node(cookie), 327 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
327 dlm_get_lock_cookie_seq(cookie), 328 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
328 locklen, name, locklen); 329 locklen, name, locklen);
329 ret = DLM_IVLOCKID; 330 ret = DLM_IVLOCKID;
330 goto leave; 331 goto leave;
@@ -369,7 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
369 370
370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 371 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 372 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
372 dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), 373 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
374 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
373 locklen, name, locklen); 375 locklen, name, locklen);
374 376
375 ret = DLM_NORMAL; 377 ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 6b6ff76538c5..e90b92f9ece1 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -180,6 +180,11 @@ struct dlm_assert_master_priv
180 unsigned ignore_higher:1; 180 unsigned ignore_higher:1;
181}; 181};
182 182
183struct dlm_deref_lockres_priv
184{
185 struct dlm_lock_resource *deref_res;
186 u8 deref_node;
187};
183 188
184struct dlm_work_item 189struct dlm_work_item
185{ 190{
@@ -191,6 +196,7 @@ struct dlm_work_item
191 struct dlm_request_all_locks_priv ral; 196 struct dlm_request_all_locks_priv ral;
192 struct dlm_mig_lockres_priv ml; 197 struct dlm_mig_lockres_priv ml;
193 struct dlm_assert_master_priv am; 198 struct dlm_assert_master_priv am;
199 struct dlm_deref_lockres_priv dl;
194 } u; 200 } u;
195}; 201};
196 202
@@ -222,6 +228,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
222#define DLM_LOCK_RES_DIRTY 0x00000008 228#define DLM_LOCK_RES_DIRTY 0x00000008
223#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 229#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
224#define DLM_LOCK_RES_MIGRATING 0x00000020 230#define DLM_LOCK_RES_MIGRATING 0x00000020
231#define DLM_LOCK_RES_DROPPING_REF 0x00000040
232#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
233#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
225 234
226/* max milliseconds to wait to sync up a network failure with a node death */ 235/* max milliseconds to wait to sync up a network failure with a node death */
227#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) 236#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -265,6 +274,8 @@ struct dlm_lock_resource
265 u8 owner; //node which owns the lock resource, or unknown 274 u8 owner; //node which owns the lock resource, or unknown
266 u16 state; 275 u16 state;
267 char lvb[DLM_LVB_LEN]; 276 char lvb[DLM_LVB_LEN];
277 unsigned int inflight_locks;
278 unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268}; 279};
269 280
270struct dlm_migratable_lock 281struct dlm_migratable_lock
@@ -367,7 +378,7 @@ enum {
367 DLM_CONVERT_LOCK_MSG, /* 504 */ 378 DLM_CONVERT_LOCK_MSG, /* 504 */
368 DLM_PROXY_AST_MSG, /* 505 */ 379 DLM_PROXY_AST_MSG, /* 505 */
369 DLM_UNLOCK_LOCK_MSG, /* 506 */ 380 DLM_UNLOCK_LOCK_MSG, /* 506 */
370 DLM_UNUSED_MSG2, /* 507 */ 381 DLM_DEREF_LOCKRES_MSG, /* 507 */
371 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 382 DLM_MIGRATE_REQUEST_MSG, /* 508 */
372 DLM_MIG_LOCKRES_MSG, /* 509 */ 383 DLM_MIG_LOCKRES_MSG, /* 509 */
373 DLM_QUERY_JOIN_MSG, /* 510 */ 384 DLM_QUERY_JOIN_MSG, /* 510 */
@@ -417,6 +428,9 @@ struct dlm_master_request
417 u8 name[O2NM_MAX_NAME_LEN]; 428 u8 name[O2NM_MAX_NAME_LEN];
418}; 429};
419 430
431#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001
432#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002
433
420#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 434#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
421#define DLM_ASSERT_MASTER_REQUERY 0x00000002 435#define DLM_ASSERT_MASTER_REQUERY 0x00000002
422#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 436#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
@@ -430,6 +444,8 @@ struct dlm_assert_master
430 u8 name[O2NM_MAX_NAME_LEN]; 444 u8 name[O2NM_MAX_NAME_LEN];
431}; 445};
432 446
447#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001
448
433struct dlm_migrate_request 449struct dlm_migrate_request
434{ 450{
435 u8 master; 451 u8 master;
@@ -609,12 +625,16 @@ struct dlm_begin_reco
609}; 625};
610 626
611 627
628#define BITS_PER_BYTE 8
629#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
630
612struct dlm_query_join_request 631struct dlm_query_join_request
613{ 632{
614 u8 node_idx; 633 u8 node_idx;
615 u8 pad1[2]; 634 u8 pad1[2];
616 u8 name_len; 635 u8 name_len;
617 u8 domain[O2NM_MAX_NAME_LEN]; 636 u8 domain[O2NM_MAX_NAME_LEN];
637 u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)];
618}; 638};
619 639
620struct dlm_assert_joined 640struct dlm_assert_joined
@@ -648,6 +668,16 @@ struct dlm_finalize_reco
648 __be32 pad2; 668 __be32 pad2;
649}; 669};
650 670
671struct dlm_deref_lockres
672{
673 u32 pad1;
674 u16 pad2;
675 u8 node_idx;
676 u8 namelen;
677
678 u8 name[O2NM_MAX_NAME_LEN];
679};
680
651static inline enum dlm_status 681static inline enum dlm_status
652__dlm_lockres_state_to_status(struct dlm_lock_resource *res) 682__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
653{ 683{
@@ -688,16 +718,20 @@ void dlm_lock_put(struct dlm_lock *lock);
688void dlm_lock_attach_lockres(struct dlm_lock *lock, 718void dlm_lock_attach_lockres(struct dlm_lock *lock,
689 struct dlm_lock_resource *res); 719 struct dlm_lock_resource *res);
690 720
691int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); 721int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
692int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); 722 void **ret_data);
693int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); 723int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
724 void **ret_data);
725int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
726 void **ret_data);
694 727
695void dlm_revert_pending_convert(struct dlm_lock_resource *res, 728void dlm_revert_pending_convert(struct dlm_lock_resource *res,
696 struct dlm_lock *lock); 729 struct dlm_lock *lock);
697void dlm_revert_pending_lock(struct dlm_lock_resource *res, 730void dlm_revert_pending_lock(struct dlm_lock_resource *res,
698 struct dlm_lock *lock); 731 struct dlm_lock *lock);
699 732
700int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); 733int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
734 void **ret_data);
701void dlm_commit_pending_cancel(struct dlm_lock_resource *res, 735void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
702 struct dlm_lock *lock); 736 struct dlm_lock *lock);
703void dlm_commit_pending_unlock(struct dlm_lock_resource *res, 737void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
@@ -721,8 +755,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
721 struct dlm_lock_resource *res); 755 struct dlm_lock_resource *res);
722void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 756void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
723 struct dlm_lock_resource *res); 757 struct dlm_lock_resource *res);
724void dlm_purge_lockres(struct dlm_ctxt *dlm,
725 struct dlm_lock_resource *lockres);
726static inline void dlm_lockres_get(struct dlm_lock_resource *res) 758static inline void dlm_lockres_get(struct dlm_lock_resource *res)
727{ 759{
728 /* This is called on every lookup, so it might be worth 760 /* This is called on every lookup, so it might be worth
@@ -733,6 +765,10 @@ void dlm_lockres_put(struct dlm_lock_resource *res);
733void __dlm_unhash_lockres(struct dlm_lock_resource *res); 765void __dlm_unhash_lockres(struct dlm_lock_resource *res);
734void __dlm_insert_lockres(struct dlm_ctxt *dlm, 766void __dlm_insert_lockres(struct dlm_ctxt *dlm,
735 struct dlm_lock_resource *res); 767 struct dlm_lock_resource *res);
768struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
769 const char *name,
770 unsigned int len,
771 unsigned int hash);
736struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 772struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
737 const char *name, 773 const char *name,
738 unsigned int len, 774 unsigned int len,
@@ -753,6 +789,47 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
753 const char *name, 789 const char *name,
754 unsigned int namelen); 790 unsigned int namelen);
755 791
792#define dlm_lockres_set_refmap_bit(bit,res) \
793 __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
794#define dlm_lockres_clear_refmap_bit(bit,res) \
795 __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
796
797static inline void __dlm_lockres_set_refmap_bit(int bit,
798 struct dlm_lock_resource *res,
799 const char *file,
800 int line)
801{
802 //printk("%s:%d:%.*s: setting bit %d\n", file, line,
803 // res->lockname.len, res->lockname.name, bit);
804 set_bit(bit, res->refmap);
805}
806
807static inline void __dlm_lockres_clear_refmap_bit(int bit,
808 struct dlm_lock_resource *res,
809 const char *file,
810 int line)
811{
812 //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
813 // res->lockname.len, res->lockname.name, bit);
814 clear_bit(bit, res->refmap);
815}
816
817void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
818 struct dlm_lock_resource *res,
819 const char *file,
820 int line);
821void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
822 struct dlm_lock_resource *res,
823 int new_lockres,
824 const char *file,
825 int line);
826#define dlm_lockres_drop_inflight_ref(d,r) \
827 __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
828#define dlm_lockres_grab_inflight_ref(d,r) \
829 __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
830#define dlm_lockres_grab_inflight_ref_new(d,r) \
831 __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
832
756void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 833void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
757void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 834void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
758void dlm_do_local_ast(struct dlm_ctxt *dlm, 835void dlm_do_local_ast(struct dlm_ctxt *dlm,
@@ -801,10 +878,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm);
801void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); 878void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
802void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); 879void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
803 880
804int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 881int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
805int dlm_migrate_lockres(struct dlm_ctxt *dlm,
806 struct dlm_lock_resource *res,
807 u8 target);
808int dlm_finish_migration(struct dlm_ctxt *dlm, 882int dlm_finish_migration(struct dlm_ctxt *dlm,
809 struct dlm_lock_resource *res, 883 struct dlm_lock_resource *res,
810 u8 old_master); 884 u8 old_master);
@@ -812,15 +886,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
812 struct dlm_lock_resource *res); 886 struct dlm_lock_resource *res);
813void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); 887void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
814 888
815int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); 889int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
816int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); 890 void **ret_data);
817int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); 891int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
818int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); 892 void **ret_data);
819int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); 893void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
820int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); 894int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
821int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); 895 void **ret_data);
822int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); 896int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
823int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); 897 void **ret_data);
898int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
899 void **ret_data);
900int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
901 void **ret_data);
902int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
903 void **ret_data);
904int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
905 void **ret_data);
906int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
907 void **ret_data);
908int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
909 void **ret_data);
824int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 910int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
825 u8 nodenum, u8 *real_master); 911 u8 nodenum, u8 *real_master);
826 912
@@ -856,10 +942,12 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
856int dlm_init_mle_cache(void); 942int dlm_init_mle_cache(void);
857void dlm_destroy_mle_cache(void); 943void dlm_destroy_mle_cache(void);
858void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); 944void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
945int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
946 struct dlm_lock_resource *res);
859void dlm_clean_master_list(struct dlm_ctxt *dlm, 947void dlm_clean_master_list(struct dlm_ctxt *dlm,
860 u8 dead_node); 948 u8 dead_node);
861int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 949int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
862 950int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
863int __dlm_lockres_unused(struct dlm_lock_resource *res); 951int __dlm_lockres_unused(struct dlm_lock_resource *res);
864 952
865static inline const char * dlm_lock_mode_name(int mode) 953static inline const char * dlm_lock_mode_name(int mode)
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index c764dc8e40a2..ecb4d997221e 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -286,8 +286,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
286 __dlm_print_one_lock_resource(res); 286 __dlm_print_one_lock_resource(res);
287 mlog(ML_ERROR, "converting a remote lock that is already " 287 mlog(ML_ERROR, "converting a remote lock that is already "
288 "converting! (cookie=%u:%llu, conv=%d)\n", 288 "converting! (cookie=%u:%llu, conv=%d)\n",
289 dlm_get_lock_cookie_node(lock->ml.cookie), 289 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
290 dlm_get_lock_cookie_seq(lock->ml.cookie), 290 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
291 lock->ml.convert_type); 291 lock->ml.convert_type);
292 status = DLM_DENIED; 292 status = DLM_DENIED;
293 goto bail; 293 goto bail;
@@ -418,7 +418,8 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
418 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, 418 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
419 * status from __dlmconvert_master 419 * status from __dlmconvert_master
420 */ 420 */
421int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) 421int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
422 void **ret_data)
422{ 423{
423 struct dlm_ctxt *dlm = data; 424 struct dlm_ctxt *dlm = data;
424 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; 425 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
@@ -428,7 +429,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
428 struct dlm_lockstatus *lksb; 429 struct dlm_lockstatus *lksb;
429 enum dlm_status status = DLM_NORMAL; 430 enum dlm_status status = DLM_NORMAL;
430 u32 flags; 431 u32 flags;
431 int call_ast = 0, kick_thread = 0, ast_reserved = 0; 432 int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
432 433
433 if (!dlm_grab(dlm)) { 434 if (!dlm_grab(dlm)) {
434 dlm_error(DLM_REJECTED); 435 dlm_error(DLM_REJECTED);
@@ -479,25 +480,14 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
479 } 480 }
480 lock = NULL; 481 lock = NULL;
481 } 482 }
482 if (!lock) {
483 __dlm_print_one_lock_resource(res);
484 list_for_each(iter, &res->granted) {
485 lock = list_entry(iter, struct dlm_lock, list);
486 if (lock->ml.node == cnv->node_idx) {
487 mlog(ML_ERROR, "There is something here "
488 "for node %u, lock->ml.cookie=%llu, "
489 "cnv->cookie=%llu\n", cnv->node_idx,
490 (unsigned long long)lock->ml.cookie,
491 (unsigned long long)cnv->cookie);
492 break;
493 }
494 }
495 lock = NULL;
496 }
497 spin_unlock(&res->spinlock); 483 spin_unlock(&res->spinlock);
498 if (!lock) { 484 if (!lock) {
499 status = DLM_IVLOCKID; 485 status = DLM_IVLOCKID;
500 dlm_error(status); 486 mlog(ML_ERROR, "did not find lock to convert on grant queue! "
487 "cookie=%u:%llu\n",
488 dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
489 dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
490 __dlm_print_one_lock_resource(res);
501 goto leave; 491 goto leave;
502 } 492 }
503 493
@@ -524,8 +514,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
524 cnv->requested_type, 514 cnv->requested_type,
525 &call_ast, &kick_thread); 515 &call_ast, &kick_thread);
526 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 516 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
517 wake = 1;
527 } 518 }
528 spin_unlock(&res->spinlock); 519 spin_unlock(&res->spinlock);
520 if (wake)
521 wake_up(&res->wq);
529 522
530 if (status != DLM_NORMAL) { 523 if (status != DLM_NORMAL) {
531 if (status != DLM_NOTQUEUED) 524 if (status != DLM_NOTQUEUED)
@@ -534,12 +527,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
534 } 527 }
535 528
536leave: 529leave:
537 if (!lock) 530 if (lock)
538 mlog(ML_ERROR, "did not find lock to convert on grant queue! "
539 "cookie=%u:%llu\n",
540 dlm_get_lock_cookie_node(cnv->cookie),
541 dlm_get_lock_cookie_seq(cnv->cookie));
542 else
543 dlm_lock_put(lock); 531 dlm_lock_put(lock);
544 532
545 /* either queue the ast or release it, if reserved */ 533 /* either queue the ast or release it, if reserved */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 3f6c8d88f7af..64239b37e5d4 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
53 spin_unlock(&res->spinlock); 53 spin_unlock(&res->spinlock);
54} 54}
55 55
56static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
57{
58 int bit;
59 assert_spin_locked(&res->spinlock);
60
61 mlog(ML_NOTICE, " refmap nodes: [ ");
62 bit = 0;
63 while (1) {
64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
65 if (bit >= O2NM_MAX_NODES)
66 break;
67 printk("%u ", bit);
68 bit++;
69 }
70 printk("], inflight=%u\n", res->inflight_locks);
71}
72
56void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 73void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
57{ 74{
58 struct list_head *iter2; 75 struct list_head *iter2;
@@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
65 res->owner, res->state); 82 res->owner, res->state);
66 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", 83 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
67 res->last_used, list_empty(&res->purge) ? "no" : "yes"); 84 res->last_used, list_empty(&res->purge) ? "no" : "yes");
85 dlm_print_lockres_refmap(res);
68 mlog(ML_NOTICE, " granted queue: \n"); 86 mlog(ML_NOTICE, " granted queue: \n");
69 list_for_each(iter2, &res->granted) { 87 list_for_each(iter2, &res->granted) {
70 lock = list_entry(iter2, struct dlm_lock, list); 88 lock = list_entry(iter2, struct dlm_lock, list);
@@ -72,8 +90,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
72 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 90 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
73 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 91 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
74 lock->ml.type, lock->ml.convert_type, lock->ml.node, 92 lock->ml.type, lock->ml.convert_type, lock->ml.node,
75 dlm_get_lock_cookie_node(lock->ml.cookie), 93 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
76 dlm_get_lock_cookie_seq(lock->ml.cookie), 94 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
77 list_empty(&lock->ast_list) ? 'y' : 'n', 95 list_empty(&lock->ast_list) ? 'y' : 'n',
78 lock->ast_pending ? 'y' : 'n', 96 lock->ast_pending ? 'y' : 'n',
79 list_empty(&lock->bast_list) ? 'y' : 'n', 97 list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -87,8 +105,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
87 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 105 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
88 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 106 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
89 lock->ml.type, lock->ml.convert_type, lock->ml.node, 107 lock->ml.type, lock->ml.convert_type, lock->ml.node,
90 dlm_get_lock_cookie_node(lock->ml.cookie), 108 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
91 dlm_get_lock_cookie_seq(lock->ml.cookie), 109 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
92 list_empty(&lock->ast_list) ? 'y' : 'n', 110 list_empty(&lock->ast_list) ? 'y' : 'n',
93 lock->ast_pending ? 'y' : 'n', 111 lock->ast_pending ? 'y' : 'n',
94 list_empty(&lock->bast_list) ? 'y' : 'n', 112 list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -102,8 +120,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
102 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 120 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
103 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 121 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
104 lock->ml.type, lock->ml.convert_type, lock->ml.node, 122 lock->ml.type, lock->ml.convert_type, lock->ml.node,
105 dlm_get_lock_cookie_node(lock->ml.cookie), 123 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
106 dlm_get_lock_cookie_seq(lock->ml.cookie), 124 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
107 list_empty(&lock->ast_list) ? 'y' : 'n', 125 list_empty(&lock->ast_list) ? 'y' : 'n',
108 lock->ast_pending ? 'y' : 'n', 126 lock->ast_pending ? 'y' : 'n',
109 list_empty(&lock->bast_list) ? 'y' : 'n', 127 list_empty(&lock->bast_list) ? 'y' : 'n',
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index f0b25f2dd205..6087c4749fee 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -48,6 +48,36 @@
48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
49#include "cluster/masklog.h" 49#include "cluster/masklog.h"
50 50
51/*
52 * ocfs2 node maps are array of long int, which limits to send them freely
53 * across the wire due to endianness issues. To workaround this, we convert
54 * long ints to byte arrays. Following 3 routines are helper functions to
55 * set/test/copy bits within those array of bytes
56 */
57static inline void byte_set_bit(u8 nr, u8 map[])
58{
59 map[nr >> 3] |= (1UL << (nr & 7));
60}
61
62static inline int byte_test_bit(u8 nr, u8 map[])
63{
64 return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0;
65}
66
67static inline void byte_copymap(u8 dmap[], unsigned long smap[],
68 unsigned int sz)
69{
70 unsigned int nn;
71
72 if (!sz)
73 return;
74
75 memset(dmap, 0, ((sz + 7) >> 3));
76 for (nn = 0 ; nn < sz; nn++)
77 if (test_bit(nn, smap))
78 byte_set_bit(nn, dmap);
79}
80
51static void dlm_free_pagevec(void **vec, int pages) 81static void dlm_free_pagevec(void **vec, int pages)
52{ 82{
53 while (pages--) 83 while (pages--)
@@ -95,10 +125,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
95 125
96#define DLM_DOMAIN_BACKOFF_MS 200 126#define DLM_DOMAIN_BACKOFF_MS 200
97 127
98static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); 128static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
99static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); 129 void **ret_data);
100static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); 130static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
101static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); 131 void **ret_data);
132static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
133 void **ret_data);
134static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
135 void **ret_data);
102 136
103static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 137static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
104 138
@@ -125,10 +159,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
125 hlist_add_head(&res->hash_node, bucket); 159 hlist_add_head(&res->hash_node, bucket);
126} 160}
127 161
128struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 162struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
129 const char *name, 163 const char *name,
130 unsigned int len, 164 unsigned int len,
131 unsigned int hash) 165 unsigned int hash)
132{ 166{
133 struct hlist_head *bucket; 167 struct hlist_head *bucket;
134 struct hlist_node *list; 168 struct hlist_node *list;
@@ -154,6 +188,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
154 return NULL; 188 return NULL;
155} 189}
156 190
191/* intended to be called by functions which do not care about lock
192 * resources which are being purged (most net _handler functions).
193 * this will return NULL for any lock resource which is found but
194 * currently in the process of dropping its mastery reference.
195 * use __dlm_lookup_lockres_full when you need the lock resource
196 * regardless (e.g. dlm_get_lock_resource) */
197struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
198 const char *name,
199 unsigned int len,
200 unsigned int hash)
201{
202 struct dlm_lock_resource *res = NULL;
203
204 mlog_entry("%.*s\n", len, name);
205
206 assert_spin_locked(&dlm->spinlock);
207
208 res = __dlm_lookup_lockres_full(dlm, name, len, hash);
209 if (res) {
210 spin_lock(&res->spinlock);
211 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
212 spin_unlock(&res->spinlock);
213 dlm_lockres_put(res);
214 return NULL;
215 }
216 spin_unlock(&res->spinlock);
217 }
218
219 return res;
220}
221
157struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, 222struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
158 const char *name, 223 const char *name,
159 unsigned int len) 224 unsigned int len)
@@ -330,43 +395,60 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
330 wake_up(&dlm_domain_events); 395 wake_up(&dlm_domain_events);
331} 396}
332 397
333static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) 398static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
334{ 399{
335 int i; 400 int i, num, n, ret = 0;
336 struct dlm_lock_resource *res; 401 struct dlm_lock_resource *res;
402 struct hlist_node *iter;
403 struct hlist_head *bucket;
404 int dropped;
337 405
338 mlog(0, "Migrating locks from domain %s\n", dlm->name); 406 mlog(0, "Migrating locks from domain %s\n", dlm->name);
339restart: 407
408 num = 0;
340 spin_lock(&dlm->spinlock); 409 spin_lock(&dlm->spinlock);
341 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 410 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
342 while (!hlist_empty(dlm_lockres_hash(dlm, i))) { 411redo_bucket:
343 res = hlist_entry(dlm_lockres_hash(dlm, i)->first, 412 n = 0;
344 struct dlm_lock_resource, hash_node); 413 bucket = dlm_lockres_hash(dlm, i);
345 /* need reference when manually grabbing lockres */ 414 iter = bucket->first;
415 while (iter) {
416 n++;
417 res = hlist_entry(iter, struct dlm_lock_resource,
418 hash_node);
346 dlm_lockres_get(res); 419 dlm_lockres_get(res);
347 /* this should unhash the lockres 420 /* migrate, if necessary. this will drop the dlm
348 * and exit with dlm->spinlock */ 421 * spinlock and retake it if it does migration. */
349 mlog(0, "purging res=%p\n", res); 422 dropped = dlm_empty_lockres(dlm, res);
350 if (dlm_lockres_is_dirty(dlm, res)) { 423
351 /* HACK! this should absolutely go. 424 spin_lock(&res->spinlock);
352 * need to figure out why some empty 425 __dlm_lockres_calc_usage(dlm, res);
353 * lockreses are still marked dirty */ 426 iter = res->hash_node.next;
354 mlog(ML_ERROR, "lockres %.*s dirty!\n", 427 spin_unlock(&res->spinlock);
355 res->lockname.len, res->lockname.name); 428
356
357 spin_unlock(&dlm->spinlock);
358 dlm_kick_thread(dlm, res);
359 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
360 dlm_lockres_put(res);
361 goto restart;
362 }
363 dlm_purge_lockres(dlm, res);
364 dlm_lockres_put(res); 429 dlm_lockres_put(res);
430
431 cond_resched_lock(&dlm->spinlock);
432
433 if (dropped)
434 goto redo_bucket;
365 } 435 }
436 num += n;
437 mlog(0, "%s: touched %d lockreses in bucket %d "
438 "(tot=%d)\n", dlm->name, n, i, num);
366 } 439 }
367 spin_unlock(&dlm->spinlock); 440 spin_unlock(&dlm->spinlock);
368 441 wake_up(&dlm->dlm_thread_wq);
442
443 /* let the dlm thread take care of purging, keep scanning until
444 * nothing remains in the hash */
445 if (num) {
446 mlog(0, "%s: %d lock resources in hash last pass\n",
447 dlm->name, num);
448 ret = -EAGAIN;
449 }
369 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); 450 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
451 return ret;
370} 452}
371 453
372static int dlm_no_joining_node(struct dlm_ctxt *dlm) 454static int dlm_no_joining_node(struct dlm_ctxt *dlm)
@@ -418,7 +500,8 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
418 printk("\n"); 500 printk("\n");
419} 501}
420 502
421static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) 503static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
504 void **ret_data)
422{ 505{
423 struct dlm_ctxt *dlm = data; 506 struct dlm_ctxt *dlm = data;
424 unsigned int node; 507 unsigned int node;
@@ -571,7 +654,9 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
571 /* We changed dlm state, notify the thread */ 654 /* We changed dlm state, notify the thread */
572 dlm_kick_thread(dlm, NULL); 655 dlm_kick_thread(dlm, NULL);
573 656
574 dlm_migrate_all_locks(dlm); 657 while (dlm_migrate_all_locks(dlm)) {
658 mlog(0, "%s: more migration to do\n", dlm->name);
659 }
575 dlm_mark_domain_leaving(dlm); 660 dlm_mark_domain_leaving(dlm);
576 dlm_leave_domain(dlm); 661 dlm_leave_domain(dlm);
577 dlm_complete_dlm_shutdown(dlm); 662 dlm_complete_dlm_shutdown(dlm);
@@ -580,11 +665,13 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
580} 665}
581EXPORT_SYMBOL_GPL(dlm_unregister_domain); 666EXPORT_SYMBOL_GPL(dlm_unregister_domain);
582 667
583static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) 668static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
669 void **ret_data)
584{ 670{
585 struct dlm_query_join_request *query; 671 struct dlm_query_join_request *query;
586 enum dlm_query_join_response response; 672 enum dlm_query_join_response response;
587 struct dlm_ctxt *dlm = NULL; 673 struct dlm_ctxt *dlm = NULL;
674 u8 nodenum;
588 675
589 query = (struct dlm_query_join_request *) msg->buf; 676 query = (struct dlm_query_join_request *) msg->buf;
590 677
@@ -608,6 +695,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
608 695
609 spin_lock(&dlm_domain_lock); 696 spin_lock(&dlm_domain_lock);
610 dlm = __dlm_lookup_domain_full(query->domain, query->name_len); 697 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
698 if (!dlm)
699 goto unlock_respond;
700
701 /*
702 * There is a small window where the joining node may not see the
703 * node(s) that just left but still part of the cluster. DISALLOW
704 * join request if joining node has different node map.
705 */
706 nodenum=0;
707 while (nodenum < O2NM_MAX_NODES) {
708 if (test_bit(nodenum, dlm->domain_map)) {
709 if (!byte_test_bit(nodenum, query->node_map)) {
710 mlog(0, "disallow join as node %u does not "
711 "have node %u in its nodemap\n",
712 query->node_idx, nodenum);
713 response = JOIN_DISALLOW;
714 goto unlock_respond;
715 }
716 }
717 nodenum++;
718 }
719
611 /* Once the dlm ctxt is marked as leaving then we don't want 720 /* Once the dlm ctxt is marked as leaving then we don't want
612 * to be put in someone's domain map. 721 * to be put in someone's domain map.
613 * Also, explicitly disallow joining at certain troublesome 722 * Also, explicitly disallow joining at certain troublesome
@@ -626,15 +735,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
626 /* Disallow parallel joins. */ 735 /* Disallow parallel joins. */
627 response = JOIN_DISALLOW; 736 response = JOIN_DISALLOW;
628 } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { 737 } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
629 mlog(ML_NOTICE, "node %u trying to join, but recovery " 738 mlog(0, "node %u trying to join, but recovery "
630 "is ongoing.\n", bit); 739 "is ongoing.\n", bit);
631 response = JOIN_DISALLOW; 740 response = JOIN_DISALLOW;
632 } else if (test_bit(bit, dlm->recovery_map)) { 741 } else if (test_bit(bit, dlm->recovery_map)) {
633 mlog(ML_NOTICE, "node %u trying to join, but it " 742 mlog(0, "node %u trying to join, but it "
634 "still needs recovery.\n", bit); 743 "still needs recovery.\n", bit);
635 response = JOIN_DISALLOW; 744 response = JOIN_DISALLOW;
636 } else if (test_bit(bit, dlm->domain_map)) { 745 } else if (test_bit(bit, dlm->domain_map)) {
637 mlog(ML_NOTICE, "node %u trying to join, but it " 746 mlog(0, "node %u trying to join, but it "
638 "is still in the domain! needs recovery?\n", 747 "is still in the domain! needs recovery?\n",
639 bit); 748 bit);
640 response = JOIN_DISALLOW; 749 response = JOIN_DISALLOW;
@@ -649,6 +758,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
649 758
650 spin_unlock(&dlm->spinlock); 759 spin_unlock(&dlm->spinlock);
651 } 760 }
761unlock_respond:
652 spin_unlock(&dlm_domain_lock); 762 spin_unlock(&dlm_domain_lock);
653 763
654respond: 764respond:
@@ -657,7 +767,8 @@ respond:
657 return response; 767 return response;
658} 768}
659 769
660static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) 770static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
771 void **ret_data)
661{ 772{
662 struct dlm_assert_joined *assert; 773 struct dlm_assert_joined *assert;
663 struct dlm_ctxt *dlm = NULL; 774 struct dlm_ctxt *dlm = NULL;
@@ -694,7 +805,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
694 return 0; 805 return 0;
695} 806}
696 807
697static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) 808static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
809 void **ret_data)
698{ 810{
699 struct dlm_cancel_join *cancel; 811 struct dlm_cancel_join *cancel;
700 struct dlm_ctxt *dlm = NULL; 812 struct dlm_ctxt *dlm = NULL;
@@ -796,6 +908,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
796 join_msg.name_len = strlen(dlm->name); 908 join_msg.name_len = strlen(dlm->name);
797 memcpy(join_msg.domain, dlm->name, join_msg.name_len); 909 memcpy(join_msg.domain, dlm->name, join_msg.name_len);
798 910
911 /* copy live node map to join message */
912 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
913
799 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 914 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
800 sizeof(join_msg), node, &retval); 915 sizeof(join_msg), node, &retval);
801 if (status < 0 && status != -ENOPROTOOPT) { 916 if (status < 0 && status != -ENOPROTOOPT) {
@@ -1036,98 +1151,106 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1036 status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, 1151 status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
1037 sizeof(struct dlm_master_request), 1152 sizeof(struct dlm_master_request),
1038 dlm_master_request_handler, 1153 dlm_master_request_handler,
1039 dlm, &dlm->dlm_domain_handlers); 1154 dlm, NULL, &dlm->dlm_domain_handlers);
1040 if (status) 1155 if (status)
1041 goto bail; 1156 goto bail;
1042 1157
1043 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, 1158 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
1044 sizeof(struct dlm_assert_master), 1159 sizeof(struct dlm_assert_master),
1045 dlm_assert_master_handler, 1160 dlm_assert_master_handler,
1046 dlm, &dlm->dlm_domain_handlers); 1161 dlm, dlm_assert_master_post_handler,
1162 &dlm->dlm_domain_handlers);
1047 if (status) 1163 if (status)
1048 goto bail; 1164 goto bail;
1049 1165
1050 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, 1166 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
1051 sizeof(struct dlm_create_lock), 1167 sizeof(struct dlm_create_lock),
1052 dlm_create_lock_handler, 1168 dlm_create_lock_handler,
1053 dlm, &dlm->dlm_domain_handlers); 1169 dlm, NULL, &dlm->dlm_domain_handlers);
1054 if (status) 1170 if (status)
1055 goto bail; 1171 goto bail;
1056 1172
1057 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, 1173 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
1058 DLM_CONVERT_LOCK_MAX_LEN, 1174 DLM_CONVERT_LOCK_MAX_LEN,
1059 dlm_convert_lock_handler, 1175 dlm_convert_lock_handler,
1060 dlm, &dlm->dlm_domain_handlers); 1176 dlm, NULL, &dlm->dlm_domain_handlers);
1061 if (status) 1177 if (status)
1062 goto bail; 1178 goto bail;
1063 1179
1064 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, 1180 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
1065 DLM_UNLOCK_LOCK_MAX_LEN, 1181 DLM_UNLOCK_LOCK_MAX_LEN,
1066 dlm_unlock_lock_handler, 1182 dlm_unlock_lock_handler,
1067 dlm, &dlm->dlm_domain_handlers); 1183 dlm, NULL, &dlm->dlm_domain_handlers);
1068 if (status) 1184 if (status)
1069 goto bail; 1185 goto bail;
1070 1186
1071 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, 1187 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
1072 DLM_PROXY_AST_MAX_LEN, 1188 DLM_PROXY_AST_MAX_LEN,
1073 dlm_proxy_ast_handler, 1189 dlm_proxy_ast_handler,
1074 dlm, &dlm->dlm_domain_handlers); 1190 dlm, NULL, &dlm->dlm_domain_handlers);
1075 if (status) 1191 if (status)
1076 goto bail; 1192 goto bail;
1077 1193
1078 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, 1194 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
1079 sizeof(struct dlm_exit_domain), 1195 sizeof(struct dlm_exit_domain),
1080 dlm_exit_domain_handler, 1196 dlm_exit_domain_handler,
1081 dlm, &dlm->dlm_domain_handlers); 1197 dlm, NULL, &dlm->dlm_domain_handlers);
1198 if (status)
1199 goto bail;
1200
1201 status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key,
1202 sizeof(struct dlm_deref_lockres),
1203 dlm_deref_lockres_handler,
1204 dlm, NULL, &dlm->dlm_domain_handlers);
1082 if (status) 1205 if (status)
1083 goto bail; 1206 goto bail;
1084 1207
1085 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, 1208 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
1086 sizeof(struct dlm_migrate_request), 1209 sizeof(struct dlm_migrate_request),
1087 dlm_migrate_request_handler, 1210 dlm_migrate_request_handler,
1088 dlm, &dlm->dlm_domain_handlers); 1211 dlm, NULL, &dlm->dlm_domain_handlers);
1089 if (status) 1212 if (status)
1090 goto bail; 1213 goto bail;
1091 1214
1092 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, 1215 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
1093 DLM_MIG_LOCKRES_MAX_LEN, 1216 DLM_MIG_LOCKRES_MAX_LEN,
1094 dlm_mig_lockres_handler, 1217 dlm_mig_lockres_handler,
1095 dlm, &dlm->dlm_domain_handlers); 1218 dlm, NULL, &dlm->dlm_domain_handlers);
1096 if (status) 1219 if (status)
1097 goto bail; 1220 goto bail;
1098 1221
1099 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, 1222 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
1100 sizeof(struct dlm_master_requery), 1223 sizeof(struct dlm_master_requery),
1101 dlm_master_requery_handler, 1224 dlm_master_requery_handler,
1102 dlm, &dlm->dlm_domain_handlers); 1225 dlm, NULL, &dlm->dlm_domain_handlers);
1103 if (status) 1226 if (status)
1104 goto bail; 1227 goto bail;
1105 1228
1106 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, 1229 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
1107 sizeof(struct dlm_lock_request), 1230 sizeof(struct dlm_lock_request),
1108 dlm_request_all_locks_handler, 1231 dlm_request_all_locks_handler,
1109 dlm, &dlm->dlm_domain_handlers); 1232 dlm, NULL, &dlm->dlm_domain_handlers);
1110 if (status) 1233 if (status)
1111 goto bail; 1234 goto bail;
1112 1235
1113 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, 1236 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
1114 sizeof(struct dlm_reco_data_done), 1237 sizeof(struct dlm_reco_data_done),
1115 dlm_reco_data_done_handler, 1238 dlm_reco_data_done_handler,
1116 dlm, &dlm->dlm_domain_handlers); 1239 dlm, NULL, &dlm->dlm_domain_handlers);
1117 if (status) 1240 if (status)
1118 goto bail; 1241 goto bail;
1119 1242
1120 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, 1243 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
1121 sizeof(struct dlm_begin_reco), 1244 sizeof(struct dlm_begin_reco),
1122 dlm_begin_reco_handler, 1245 dlm_begin_reco_handler,
1123 dlm, &dlm->dlm_domain_handlers); 1246 dlm, NULL, &dlm->dlm_domain_handlers);
1124 if (status) 1247 if (status)
1125 goto bail; 1248 goto bail;
1126 1249
1127 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, 1250 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
1128 sizeof(struct dlm_finalize_reco), 1251 sizeof(struct dlm_finalize_reco),
1129 dlm_finalize_reco_handler, 1252 dlm_finalize_reco_handler,
1130 dlm, &dlm->dlm_domain_handlers); 1253 dlm, NULL, &dlm->dlm_domain_handlers);
1131 if (status) 1254 if (status)
1132 goto bail; 1255 goto bail;
1133 1256
@@ -1141,6 +1264,8 @@ bail:
1141static int dlm_join_domain(struct dlm_ctxt *dlm) 1264static int dlm_join_domain(struct dlm_ctxt *dlm)
1142{ 1265{
1143 int status; 1266 int status;
1267 unsigned int backoff;
1268 unsigned int total_backoff = 0;
1144 1269
1145 BUG_ON(!dlm); 1270 BUG_ON(!dlm);
1146 1271
@@ -1172,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1172 } 1297 }
1173 1298
1174 do { 1299 do {
1175 unsigned int backoff;
1176 status = dlm_try_to_join_domain(dlm); 1300 status = dlm_try_to_join_domain(dlm);
1177 1301
1178 /* If we're racing another node to the join, then we 1302 /* If we're racing another node to the join, then we
1179 * need to back off temporarily and let them 1303 * need to back off temporarily and let them
1180 * complete. */ 1304 * complete. */
1305#define DLM_JOIN_TIMEOUT_MSECS 90000
1181 if (status == -EAGAIN) { 1306 if (status == -EAGAIN) {
1182 if (signal_pending(current)) { 1307 if (signal_pending(current)) {
1183 status = -ERESTARTSYS; 1308 status = -ERESTARTSYS;
1184 goto bail; 1309 goto bail;
1185 } 1310 }
1186 1311
1312 if (total_backoff >
1313 msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
1314 status = -ERESTARTSYS;
1315 mlog(ML_NOTICE, "Timed out joining dlm domain "
1316 "%s after %u msecs\n", dlm->name,
1317 jiffies_to_msecs(total_backoff));
1318 goto bail;
1319 }
1320
1187 /* 1321 /*
1188 * <chip> After you! 1322 * <chip> After you!
1189 * <dale> No, after you! 1323 * <dale> No, after you!
@@ -1193,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1193 */ 1327 */
1194 backoff = (unsigned int)(jiffies & 0x3); 1328 backoff = (unsigned int)(jiffies & 0x3);
1195 backoff *= DLM_DOMAIN_BACKOFF_MS; 1329 backoff *= DLM_DOMAIN_BACKOFF_MS;
1330 total_backoff += backoff;
1196 mlog(0, "backoff %d\n", backoff); 1331 mlog(0, "backoff %d\n", backoff);
1197 msleep(backoff); 1332 msleep(backoff);
1198 } 1333 }
@@ -1421,21 +1556,21 @@ static int dlm_register_net_handlers(void)
1421 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, 1556 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1422 sizeof(struct dlm_query_join_request), 1557 sizeof(struct dlm_query_join_request),
1423 dlm_query_join_handler, 1558 dlm_query_join_handler,
1424 NULL, &dlm_join_handlers); 1559 NULL, NULL, &dlm_join_handlers);
1425 if (status) 1560 if (status)
1426 goto bail; 1561 goto bail;
1427 1562
1428 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, 1563 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1429 sizeof(struct dlm_assert_joined), 1564 sizeof(struct dlm_assert_joined),
1430 dlm_assert_joined_handler, 1565 dlm_assert_joined_handler,
1431 NULL, &dlm_join_handlers); 1566 NULL, NULL, &dlm_join_handlers);
1432 if (status) 1567 if (status)
1433 goto bail; 1568 goto bail;
1434 1569
1435 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, 1570 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
1436 sizeof(struct dlm_cancel_join), 1571 sizeof(struct dlm_cancel_join),
1437 dlm_cancel_join_handler, 1572 dlm_cancel_join_handler,
1438 NULL, &dlm_join_handlers); 1573 NULL, NULL, &dlm_join_handlers);
1439 1574
1440bail: 1575bail:
1441 if (status < 0) 1576 if (status < 0)
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index e5ca3db197f6..52578d907d9a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
163 kick_thread = 1; 163 kick_thread = 1;
164 } 164 }
165 } 165 }
166 /* reduce the inflight count, this may result in the lockres
167 * being purged below during calc_usage */
168 if (lock->ml.node == dlm->node_num)
169 dlm_lockres_drop_inflight_ref(dlm, res);
166 170
167 spin_unlock(&res->spinlock); 171 spin_unlock(&res->spinlock);
168 wake_up(&res->wq); 172 wake_up(&res->wq);
@@ -437,7 +441,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
437 * held on exit: none 441 * held on exit: none
438 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED 442 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
439 */ 443 */
440int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) 444int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
445 void **ret_data)
441{ 446{
442 struct dlm_ctxt *dlm = data; 447 struct dlm_ctxt *dlm = data;
443 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; 448 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0ad872055cb3..77e4e6169a0d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
99 int idx); 99 int idx);
100 100
101static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 101static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
102static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, 102static int dlm_do_assert_master(struct dlm_ctxt *dlm,
103 unsigned int namelen, void *nodemap, 103 struct dlm_lock_resource *res,
104 u32 flags); 104 void *nodemap, u32 flags);
105static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
105 106
106static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 107static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
107 struct dlm_master_list_entry *mle, 108 struct dlm_master_list_entry *mle,
@@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
237 struct dlm_master_list_entry **mle, 238 struct dlm_master_list_entry **mle,
238 char *name, unsigned int namelen); 239 char *name, unsigned int namelen);
239 240
240static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); 241static int dlm_do_master_request(struct dlm_lock_resource *res,
242 struct dlm_master_list_entry *mle, int to);
241 243
242 244
243static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 245static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
@@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
687 INIT_LIST_HEAD(&res->purge); 689 INIT_LIST_HEAD(&res->purge);
688 atomic_set(&res->asts_reserved, 0); 690 atomic_set(&res->asts_reserved, 0);
689 res->migration_pending = 0; 691 res->migration_pending = 0;
692 res->inflight_locks = 0;
690 693
691 kref_init(&res->refs); 694 kref_init(&res->refs);
692 695
@@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
700 res->last_used = 0; 703 res->last_used = 0;
701 704
702 memset(res->lvb, 0, DLM_LVB_LEN); 705 memset(res->lvb, 0, DLM_LVB_LEN);
706 memset(res->refmap, 0, sizeof(res->refmap));
703} 707}
704 708
705struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 709struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
722 return res; 726 return res;
723} 727}
724 728
729void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
730 struct dlm_lock_resource *res,
731 int new_lockres,
732 const char *file,
733 int line)
734{
735 if (!new_lockres)
736 assert_spin_locked(&res->spinlock);
737
738 if (!test_bit(dlm->node_num, res->refmap)) {
739 BUG_ON(res->inflight_locks != 0);
740 dlm_lockres_set_refmap_bit(dlm->node_num, res);
741 }
742 res->inflight_locks++;
743 mlog(0, "%s:%.*s: inflight++: now %u\n",
744 dlm->name, res->lockname.len, res->lockname.name,
745 res->inflight_locks);
746}
747
748void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
749 struct dlm_lock_resource *res,
750 const char *file,
751 int line)
752{
753 assert_spin_locked(&res->spinlock);
754
755 BUG_ON(res->inflight_locks == 0);
756 res->inflight_locks--;
757 mlog(0, "%s:%.*s: inflight--: now %u\n",
758 dlm->name, res->lockname.len, res->lockname.name,
759 res->inflight_locks);
760 if (res->inflight_locks == 0)
761 dlm_lockres_clear_refmap_bit(dlm->node_num, res);
762 wake_up(&res->wq);
763}
764
725/* 765/*
726 * lookup a lock resource by name. 766 * lookup a lock resource by name.
727 * may already exist in the hashtable. 767 * may already exist in the hashtable.
@@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
752 unsigned int hash; 792 unsigned int hash;
753 int tries = 0; 793 int tries = 0;
754 int bit, wait_on_recovery = 0; 794 int bit, wait_on_recovery = 0;
795 int drop_inflight_if_nonlocal = 0;
755 796
756 BUG_ON(!lockid); 797 BUG_ON(!lockid);
757 798
@@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
761 802
762lookup: 803lookup:
763 spin_lock(&dlm->spinlock); 804 spin_lock(&dlm->spinlock);
764 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); 805 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
765 if (tmpres) { 806 if (tmpres) {
807 int dropping_ref = 0;
808
809 spin_lock(&tmpres->spinlock);
810 if (tmpres->owner == dlm->node_num) {
811 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
812 dlm_lockres_grab_inflight_ref(dlm, tmpres);
813 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
814 dropping_ref = 1;
815 spin_unlock(&tmpres->spinlock);
766 spin_unlock(&dlm->spinlock); 816 spin_unlock(&dlm->spinlock);
817
818 /* wait until done messaging the master, drop our ref to allow
819 * the lockres to be purged, start over. */
820 if (dropping_ref) {
821 spin_lock(&tmpres->spinlock);
822 __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
823 spin_unlock(&tmpres->spinlock);
824 dlm_lockres_put(tmpres);
825 tmpres = NULL;
826 goto lookup;
827 }
828
767 mlog(0, "found in hash!\n"); 829 mlog(0, "found in hash!\n");
768 if (res) 830 if (res)
769 dlm_lockres_put(res); 831 dlm_lockres_put(res);
@@ -793,6 +855,7 @@ lookup:
793 spin_lock(&res->spinlock); 855 spin_lock(&res->spinlock);
794 dlm_change_lockres_owner(dlm, res, dlm->node_num); 856 dlm_change_lockres_owner(dlm, res, dlm->node_num);
795 __dlm_insert_lockres(dlm, res); 857 __dlm_insert_lockres(dlm, res);
858 dlm_lockres_grab_inflight_ref(dlm, res);
796 spin_unlock(&res->spinlock); 859 spin_unlock(&res->spinlock);
797 spin_unlock(&dlm->spinlock); 860 spin_unlock(&dlm->spinlock);
798 /* lockres still marked IN_PROGRESS */ 861 /* lockres still marked IN_PROGRESS */
@@ -805,29 +868,40 @@ lookup:
805 /* if we found a block, wait for lock to be mastered by another node */ 868 /* if we found a block, wait for lock to be mastered by another node */
806 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); 869 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
807 if (blocked) { 870 if (blocked) {
871 int mig;
808 if (mle->type == DLM_MLE_MASTER) { 872 if (mle->type == DLM_MLE_MASTER) {
809 mlog(ML_ERROR, "master entry for nonexistent lock!\n"); 873 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
810 BUG(); 874 BUG();
811 } else if (mle->type == DLM_MLE_MIGRATION) { 875 }
812 /* migration is in progress! */ 876 mig = (mle->type == DLM_MLE_MIGRATION);
813 /* the good news is that we now know the 877 /* if there is a migration in progress, let the migration
814 * "current" master (mle->master). */ 878 * finish before continuing. we can wait for the absence
815 879 * of the MIGRATION mle: either the migrate finished or
880 * one of the nodes died and the mle was cleaned up.
881 * if there is a BLOCK here, but it already has a master
882 * set, we are too late. the master does not have a ref
883 * for us in the refmap. detach the mle and drop it.
884 * either way, go back to the top and start over. */
885 if (mig || mle->master != O2NM_MAX_NODES) {
886 BUG_ON(mig && mle->master == dlm->node_num);
887 /* we arrived too late. the master does not
888 * have a ref for us. retry. */
889 mlog(0, "%s:%.*s: late on %s\n",
890 dlm->name, namelen, lockid,
891 mig ? "MIGRATION" : "BLOCK");
816 spin_unlock(&dlm->master_lock); 892 spin_unlock(&dlm->master_lock);
817 assert_spin_locked(&dlm->spinlock);
818
819 /* set the lockres owner and hash it */
820 spin_lock(&res->spinlock);
821 dlm_set_lockres_owner(dlm, res, mle->master);
822 __dlm_insert_lockres(dlm, res);
823 spin_unlock(&res->spinlock);
824 spin_unlock(&dlm->spinlock); 893 spin_unlock(&dlm->spinlock);
825 894
826 /* master is known, detach */ 895 /* master is known, detach */
827 dlm_mle_detach_hb_events(dlm, mle); 896 if (!mig)
897 dlm_mle_detach_hb_events(dlm, mle);
828 dlm_put_mle(mle); 898 dlm_put_mle(mle);
829 mle = NULL; 899 mle = NULL;
830 goto wake_waiters; 900 /* this is lame, but we cant wait on either
901 * the mle or lockres waitqueue here */
902 if (mig)
903 msleep(100);
904 goto lookup;
831 } 905 }
832 } else { 906 } else {
833 /* go ahead and try to master lock on this node */ 907 /* go ahead and try to master lock on this node */
@@ -858,6 +932,13 @@ lookup:
858 932
859 /* finally add the lockres to its hash bucket */ 933 /* finally add the lockres to its hash bucket */
860 __dlm_insert_lockres(dlm, res); 934 __dlm_insert_lockres(dlm, res);
935 /* since this lockres is new it doesnt not require the spinlock */
936 dlm_lockres_grab_inflight_ref_new(dlm, res);
937
938 /* if this node does not become the master make sure to drop
939 * this inflight reference below */
940 drop_inflight_if_nonlocal = 1;
941
861 /* get an extra ref on the mle in case this is a BLOCK 942 /* get an extra ref on the mle in case this is a BLOCK
862 * if so, the creator of the BLOCK may try to put the last 943 * if so, the creator of the BLOCK may try to put the last
863 * ref at this time in the assert master handler, so we 944 * ref at this time in the assert master handler, so we
@@ -910,7 +991,7 @@ redo_request:
910 ret = -EINVAL; 991 ret = -EINVAL;
911 dlm_node_iter_init(mle->vote_map, &iter); 992 dlm_node_iter_init(mle->vote_map, &iter);
912 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 993 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
913 ret = dlm_do_master_request(mle, nodenum); 994 ret = dlm_do_master_request(res, mle, nodenum);
914 if (ret < 0) 995 if (ret < 0)
915 mlog_errno(ret); 996 mlog_errno(ret);
916 if (mle->master != O2NM_MAX_NODES) { 997 if (mle->master != O2NM_MAX_NODES) {
@@ -960,6 +1041,8 @@ wait:
960 1041
961wake_waiters: 1042wake_waiters:
962 spin_lock(&res->spinlock); 1043 spin_lock(&res->spinlock);
1044 if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
1045 dlm_lockres_drop_inflight_ref(dlm, res);
963 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1046 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
964 spin_unlock(&res->spinlock); 1047 spin_unlock(&res->spinlock);
965 wake_up(&res->wq); 1048 wake_up(&res->wq);
@@ -998,7 +1081,7 @@ recheck:
998 /* this will cause the master to re-assert across 1081 /* this will cause the master to re-assert across
999 * the whole cluster, freeing up mles */ 1082 * the whole cluster, freeing up mles */
1000 if (res->owner != dlm->node_num) { 1083 if (res->owner != dlm->node_num) {
1001 ret = dlm_do_master_request(mle, res->owner); 1084 ret = dlm_do_master_request(res, mle, res->owner);
1002 if (ret < 0) { 1085 if (ret < 0) {
1003 /* give recovery a chance to run */ 1086 /* give recovery a chance to run */
1004 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 1087 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
@@ -1062,6 +1145,8 @@ recheck:
1062 * now tell other nodes that I am 1145 * now tell other nodes that I am
1063 * mastering this. */ 1146 * mastering this. */
1064 mle->master = dlm->node_num; 1147 mle->master = dlm->node_num;
1148 /* ref was grabbed in get_lock_resource
1149 * will be dropped in dlmlock_master */
1065 assert = 1; 1150 assert = 1;
1066 sleep = 0; 1151 sleep = 0;
1067 } 1152 }
@@ -1087,7 +1172,8 @@ recheck:
1087 (atomic_read(&mle->woken) == 1), 1172 (atomic_read(&mle->woken) == 1),
1088 timeo); 1173 timeo);
1089 if (res->owner == O2NM_MAX_NODES) { 1174 if (res->owner == O2NM_MAX_NODES) {
1090 mlog(0, "waiting again\n"); 1175 mlog(0, "%s:%.*s: waiting again\n", dlm->name,
1176 res->lockname.len, res->lockname.name);
1091 goto recheck; 1177 goto recheck;
1092 } 1178 }
1093 mlog(0, "done waiting, master is %u\n", res->owner); 1179 mlog(0, "done waiting, master is %u\n", res->owner);
@@ -1100,8 +1186,7 @@ recheck:
1100 m = dlm->node_num; 1186 m = dlm->node_num;
1101 mlog(0, "about to master %.*s here, this=%u\n", 1187 mlog(0, "about to master %.*s here, this=%u\n",
1102 res->lockname.len, res->lockname.name, m); 1188 res->lockname.len, res->lockname.name, m);
1103 ret = dlm_do_assert_master(dlm, res->lockname.name, 1189 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
1104 res->lockname.len, mle->vote_map, 0);
1105 if (ret) { 1190 if (ret) {
1106 /* This is a failure in the network path, 1191 /* This is a failure in the network path,
1107 * not in the response to the assert_master 1192 * not in the response to the assert_master
@@ -1117,6 +1202,8 @@ recheck:
1117 1202
1118 /* set the lockres owner */ 1203 /* set the lockres owner */
1119 spin_lock(&res->spinlock); 1204 spin_lock(&res->spinlock);
1205 /* mastery reference obtained either during
1206 * assert_master_handler or in get_lock_resource */
1120 dlm_change_lockres_owner(dlm, res, m); 1207 dlm_change_lockres_owner(dlm, res, m);
1121 spin_unlock(&res->spinlock); 1208 spin_unlock(&res->spinlock);
1122 1209
@@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1283 * 1370 *
1284 */ 1371 */
1285 1372
1286static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) 1373static int dlm_do_master_request(struct dlm_lock_resource *res,
1374 struct dlm_master_list_entry *mle, int to)
1287{ 1375{
1288 struct dlm_ctxt *dlm = mle->dlm; 1376 struct dlm_ctxt *dlm = mle->dlm;
1289 struct dlm_master_request request; 1377 struct dlm_master_request request;
@@ -1339,6 +1427,9 @@ again:
1339 case DLM_MASTER_RESP_YES: 1427 case DLM_MASTER_RESP_YES:
1340 set_bit(to, mle->response_map); 1428 set_bit(to, mle->response_map);
1341 mlog(0, "node %u is the master, response=YES\n", to); 1429 mlog(0, "node %u is the master, response=YES\n", to);
1430 mlog(0, "%s:%.*s: master node %u now knows I have a "
1431 "reference\n", dlm->name, res->lockname.len,
1432 res->lockname.name, to);
1342 mle->master = to; 1433 mle->master = to;
1343 break; 1434 break;
1344 case DLM_MASTER_RESP_NO: 1435 case DLM_MASTER_RESP_NO:
@@ -1379,7 +1470,8 @@ out:
1379 * 1470 *
1380 * if possible, TRIM THIS DOWN!!! 1471 * if possible, TRIM THIS DOWN!!!
1381 */ 1472 */
1382int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) 1473int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1474 void **ret_data)
1383{ 1475{
1384 u8 response = DLM_MASTER_RESP_MAYBE; 1476 u8 response = DLM_MASTER_RESP_MAYBE;
1385 struct dlm_ctxt *dlm = data; 1477 struct dlm_ctxt *dlm = data;
@@ -1417,10 +1509,11 @@ way_up_top:
1417 1509
1418 /* take care of the easy cases up front */ 1510 /* take care of the easy cases up front */
1419 spin_lock(&res->spinlock); 1511 spin_lock(&res->spinlock);
1420 if (res->state & DLM_LOCK_RES_RECOVERING) { 1512 if (res->state & (DLM_LOCK_RES_RECOVERING|
1513 DLM_LOCK_RES_MIGRATING)) {
1421 spin_unlock(&res->spinlock); 1514 spin_unlock(&res->spinlock);
1422 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 1515 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1423 "being recovered\n"); 1516 "being recovered/migrated\n");
1424 response = DLM_MASTER_RESP_ERROR; 1517 response = DLM_MASTER_RESP_ERROR;
1425 if (mle) 1518 if (mle)
1426 kmem_cache_free(dlm_mle_cache, mle); 1519 kmem_cache_free(dlm_mle_cache, mle);
@@ -1428,8 +1521,10 @@ way_up_top:
1428 } 1521 }
1429 1522
1430 if (res->owner == dlm->node_num) { 1523 if (res->owner == dlm->node_num) {
1524 mlog(0, "%s:%.*s: setting bit %u in refmap\n",
1525 dlm->name, namelen, name, request->node_idx);
1526 dlm_lockres_set_refmap_bit(request->node_idx, res);
1431 spin_unlock(&res->spinlock); 1527 spin_unlock(&res->spinlock);
1432 // mlog(0, "this node is the master\n");
1433 response = DLM_MASTER_RESP_YES; 1528 response = DLM_MASTER_RESP_YES;
1434 if (mle) 1529 if (mle)
1435 kmem_cache_free(dlm_mle_cache, mle); 1530 kmem_cache_free(dlm_mle_cache, mle);
@@ -1477,7 +1572,6 @@ way_up_top:
1477 mlog(0, "node %u is master, but trying to migrate to " 1572 mlog(0, "node %u is master, but trying to migrate to "
1478 "node %u.\n", tmpmle->master, tmpmle->new_master); 1573 "node %u.\n", tmpmle->master, tmpmle->new_master);
1479 if (tmpmle->master == dlm->node_num) { 1574 if (tmpmle->master == dlm->node_num) {
1480 response = DLM_MASTER_RESP_YES;
1481 mlog(ML_ERROR, "no owner on lockres, but this " 1575 mlog(ML_ERROR, "no owner on lockres, but this "
1482 "node is trying to migrate it to %u?!\n", 1576 "node is trying to migrate it to %u?!\n",
1483 tmpmle->new_master); 1577 tmpmle->new_master);
@@ -1494,6 +1588,10 @@ way_up_top:
1494 * go back and clean the mles on any 1588 * go back and clean the mles on any
1495 * other nodes */ 1589 * other nodes */
1496 dispatch_assert = 1; 1590 dispatch_assert = 1;
1591 dlm_lockres_set_refmap_bit(request->node_idx, res);
1592 mlog(0, "%s:%.*s: setting bit %u in refmap\n",
1593 dlm->name, namelen, name,
1594 request->node_idx);
1497 } else 1595 } else
1498 response = DLM_MASTER_RESP_NO; 1596 response = DLM_MASTER_RESP_NO;
1499 } else { 1597 } else {
@@ -1607,17 +1705,24 @@ send_response:
1607 * can periodically run all locks owned by this node 1705 * can periodically run all locks owned by this node
1608 * and re-assert across the cluster... 1706 * and re-assert across the cluster...
1609 */ 1707 */
1610static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, 1708int dlm_do_assert_master(struct dlm_ctxt *dlm,
1611 unsigned int namelen, void *nodemap, 1709 struct dlm_lock_resource *res,
1612 u32 flags) 1710 void *nodemap, u32 flags)
1613{ 1711{
1614 struct dlm_assert_master assert; 1712 struct dlm_assert_master assert;
1615 int to, tmpret; 1713 int to, tmpret;
1616 struct dlm_node_iter iter; 1714 struct dlm_node_iter iter;
1617 int ret = 0; 1715 int ret = 0;
1618 int reassert; 1716 int reassert;
1717 const char *lockname = res->lockname.name;
1718 unsigned int namelen = res->lockname.len;
1619 1719
1620 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 1720 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1721
1722 spin_lock(&res->spinlock);
1723 res->state |= DLM_LOCK_RES_SETREF_INPROG;
1724 spin_unlock(&res->spinlock);
1725
1621again: 1726again:
1622 reassert = 0; 1727 reassert = 0;
1623 1728
@@ -1647,6 +1752,7 @@ again:
1647 mlog(0, "link to %d went down!\n", to); 1752 mlog(0, "link to %d went down!\n", to);
1648 /* any nonzero status return will do */ 1753 /* any nonzero status return will do */
1649 ret = tmpret; 1754 ret = tmpret;
1755 r = 0;
1650 } else if (r < 0) { 1756 } else if (r < 0) {
1651 /* ok, something horribly messed. kill thyself. */ 1757 /* ok, something horribly messed. kill thyself. */
1652 mlog(ML_ERROR,"during assert master of %.*s to %u, " 1758 mlog(ML_ERROR,"during assert master of %.*s to %u, "
@@ -1661,17 +1767,39 @@ again:
1661 spin_unlock(&dlm->master_lock); 1767 spin_unlock(&dlm->master_lock);
1662 spin_unlock(&dlm->spinlock); 1768 spin_unlock(&dlm->spinlock);
1663 BUG(); 1769 BUG();
1664 } else if (r == EAGAIN) { 1770 }
1771
1772 if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1773 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1774 mlog(ML_ERROR, "%.*s: very strange, "
1775 "master MLE but no lockres on %u\n",
1776 namelen, lockname, to);
1777 }
1778
1779 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1665 mlog(0, "%.*s: node %u create mles on other " 1780 mlog(0, "%.*s: node %u create mles on other "
1666 "nodes and requests a re-assert\n", 1781 "nodes and requests a re-assert\n",
1667 namelen, lockname, to); 1782 namelen, lockname, to);
1668 reassert = 1; 1783 reassert = 1;
1669 } 1784 }
1785 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1786 mlog(0, "%.*s: node %u has a reference to this "
1787 "lockres, set the bit in the refmap\n",
1788 namelen, lockname, to);
1789 spin_lock(&res->spinlock);
1790 dlm_lockres_set_refmap_bit(to, res);
1791 spin_unlock(&res->spinlock);
1792 }
1670 } 1793 }
1671 1794
1672 if (reassert) 1795 if (reassert)
1673 goto again; 1796 goto again;
1674 1797
1798 spin_lock(&res->spinlock);
1799 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1800 spin_unlock(&res->spinlock);
1801 wake_up(&res->wq);
1802
1675 return ret; 1803 return ret;
1676} 1804}
1677 1805
@@ -1684,7 +1812,8 @@ again:
1684 * 1812 *
1685 * if possible, TRIM THIS DOWN!!! 1813 * if possible, TRIM THIS DOWN!!!
1686 */ 1814 */
1687int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) 1815int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1816 void **ret_data)
1688{ 1817{
1689 struct dlm_ctxt *dlm = data; 1818 struct dlm_ctxt *dlm = data;
1690 struct dlm_master_list_entry *mle = NULL; 1819 struct dlm_master_list_entry *mle = NULL;
@@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1693 char *name; 1822 char *name;
1694 unsigned int namelen, hash; 1823 unsigned int namelen, hash;
1695 u32 flags; 1824 u32 flags;
1696 int master_request = 0; 1825 int master_request = 0, have_lockres_ref = 0;
1697 int ret = 0; 1826 int ret = 0;
1698 1827
1699 if (!dlm_grab(dlm)) 1828 if (!dlm_grab(dlm))
@@ -1851,6 +1980,7 @@ ok:
1851 spin_unlock(&mle->spinlock); 1980 spin_unlock(&mle->spinlock);
1852 1981
1853 if (res) { 1982 if (res) {
1983 int wake = 0;
1854 spin_lock(&res->spinlock); 1984 spin_lock(&res->spinlock);
1855 if (mle->type == DLM_MLE_MIGRATION) { 1985 if (mle->type == DLM_MLE_MIGRATION) {
1856 mlog(0, "finishing off migration of lockres %.*s, " 1986 mlog(0, "finishing off migration of lockres %.*s, "
@@ -1858,12 +1988,16 @@ ok:
1858 res->lockname.len, res->lockname.name, 1988 res->lockname.len, res->lockname.name,
1859 dlm->node_num, mle->new_master); 1989 dlm->node_num, mle->new_master);
1860 res->state &= ~DLM_LOCK_RES_MIGRATING; 1990 res->state &= ~DLM_LOCK_RES_MIGRATING;
1991 wake = 1;
1861 dlm_change_lockres_owner(dlm, res, mle->new_master); 1992 dlm_change_lockres_owner(dlm, res, mle->new_master);
1862 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1993 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1863 } else { 1994 } else {
1864 dlm_change_lockres_owner(dlm, res, mle->master); 1995 dlm_change_lockres_owner(dlm, res, mle->master);
1865 } 1996 }
1866 spin_unlock(&res->spinlock); 1997 spin_unlock(&res->spinlock);
1998 have_lockres_ref = 1;
1999 if (wake)
2000 wake_up(&res->wq);
1867 } 2001 }
1868 2002
1869 /* master is known, detach if not already detached. 2003 /* master is known, detach if not already detached.
@@ -1913,12 +2047,28 @@ ok:
1913 2047
1914done: 2048done:
1915 ret = 0; 2049 ret = 0;
1916 if (res) 2050 if (res) {
1917 dlm_lockres_put(res); 2051 spin_lock(&res->spinlock);
2052 res->state |= DLM_LOCK_RES_SETREF_INPROG;
2053 spin_unlock(&res->spinlock);
2054 *ret_data = (void *)res;
2055 }
1918 dlm_put(dlm); 2056 dlm_put(dlm);
1919 if (master_request) { 2057 if (master_request) {
1920 mlog(0, "need to tell master to reassert\n"); 2058 mlog(0, "need to tell master to reassert\n");
1921 ret = EAGAIN; // positive. negative would shoot down the node. 2059 /* positive. negative would shoot down the node. */
2060 ret |= DLM_ASSERT_RESPONSE_REASSERT;
2061 if (!have_lockres_ref) {
2062 mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2063 "mle present here for %s:%.*s, but no lockres!\n",
2064 assert->node_idx, dlm->name, namelen, name);
2065 }
2066 }
2067 if (have_lockres_ref) {
2068 /* let the master know we have a reference to the lockres */
2069 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2070 mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2071 dlm->name, namelen, name, assert->node_idx);
1922 } 2072 }
1923 return ret; 2073 return ret;
1924 2074
@@ -1929,11 +2079,25 @@ kill:
1929 __dlm_print_one_lock_resource(res); 2079 __dlm_print_one_lock_resource(res);
1930 spin_unlock(&res->spinlock); 2080 spin_unlock(&res->spinlock);
1931 spin_unlock(&dlm->spinlock); 2081 spin_unlock(&dlm->spinlock);
1932 dlm_lockres_put(res); 2082 *ret_data = (void *)res;
1933 dlm_put(dlm); 2083 dlm_put(dlm);
1934 return -EINVAL; 2084 return -EINVAL;
1935} 2085}
1936 2086
2087void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2088{
2089 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2090
2091 if (ret_data) {
2092 spin_lock(&res->spinlock);
2093 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2094 spin_unlock(&res->spinlock);
2095 wake_up(&res->wq);
2096 dlm_lockres_put(res);
2097 }
2098 return;
2099}
2100
1937int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, 2101int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1938 struct dlm_lock_resource *res, 2102 struct dlm_lock_resource *res,
1939 int ignore_higher, u8 request_from, u32 flags) 2103 int ignore_higher, u8 request_from, u32 flags)
@@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2023 * even if one or more nodes die */ 2187 * even if one or more nodes die */
2024 mlog(0, "worker about to master %.*s here, this=%u\n", 2188 mlog(0, "worker about to master %.*s here, this=%u\n",
2025 res->lockname.len, res->lockname.name, dlm->node_num); 2189 res->lockname.len, res->lockname.name, dlm->node_num);
2026 ret = dlm_do_assert_master(dlm, res->lockname.name, 2190 ret = dlm_do_assert_master(dlm, res, nodemap, flags);
2027 res->lockname.len,
2028 nodemap, flags);
2029 if (ret < 0) { 2191 if (ret < 0) {
2030 /* no need to restart, we are done */ 2192 /* no need to restart, we are done */
2031 if (!dlm_is_host_down(ret)) 2193 if (!dlm_is_host_down(ret))
@@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2097 return ret; 2259 return ret;
2098} 2260}
2099 2261
2262/*
2263 * DLM_DEREF_LOCKRES_MSG
2264 */
2265
2266int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2267{
2268 struct dlm_deref_lockres deref;
2269 int ret = 0, r;
2270 const char *lockname;
2271 unsigned int namelen;
2272
2273 lockname = res->lockname.name;
2274 namelen = res->lockname.len;
2275 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2276
2277 mlog(0, "%s:%.*s: sending deref to %d\n",
2278 dlm->name, namelen, lockname, res->owner);
2279 memset(&deref, 0, sizeof(deref));
2280 deref.node_idx = dlm->node_num;
2281 deref.namelen = namelen;
2282 memcpy(deref.name, lockname, namelen);
2283
2284 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2285 &deref, sizeof(deref), res->owner, &r);
2286 if (ret < 0)
2287 mlog_errno(ret);
2288 else if (r < 0) {
2289 /* BAD. other node says I did not have a ref. */
2290 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
2291 "(master=%u) got %d.\n", dlm->name, namelen,
2292 lockname, res->owner, r);
2293 dlm_print_one_lock_resource(res);
2294 BUG();
2295 }
2296 return ret;
2297}
2298
2299int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2300 void **ret_data)
2301{
2302 struct dlm_ctxt *dlm = data;
2303 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2304 struct dlm_lock_resource *res = NULL;
2305 char *name;
2306 unsigned int namelen;
2307 int ret = -EINVAL;
2308 u8 node;
2309 unsigned int hash;
2310 struct dlm_work_item *item;
2311 int cleared = 0;
2312 int dispatch = 0;
2313
2314 if (!dlm_grab(dlm))
2315 return 0;
2316
2317 name = deref->name;
2318 namelen = deref->namelen;
2319 node = deref->node_idx;
2320
2321 if (namelen > DLM_LOCKID_NAME_MAX) {
2322 mlog(ML_ERROR, "Invalid name length!");
2323 goto done;
2324 }
2325 if (deref->node_idx >= O2NM_MAX_NODES) {
2326 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2327 goto done;
2328 }
2329
2330 hash = dlm_lockid_hash(name, namelen);
2331
2332 spin_lock(&dlm->spinlock);
2333 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2334 if (!res) {
2335 spin_unlock(&dlm->spinlock);
2336 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2337 dlm->name, namelen, name);
2338 goto done;
2339 }
2340 spin_unlock(&dlm->spinlock);
2341
2342 spin_lock(&res->spinlock);
2343 if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2344 dispatch = 1;
2345 else {
2346 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2347 if (test_bit(node, res->refmap)) {
2348 dlm_lockres_clear_refmap_bit(node, res);
2349 cleared = 1;
2350 }
2351 }
2352 spin_unlock(&res->spinlock);
2353
2354 if (!dispatch) {
2355 if (cleared)
2356 dlm_lockres_calc_usage(dlm, res);
2357 else {
2358 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2359 "but it is already dropped!\n", dlm->name,
2360 res->lockname.len, res->lockname.name, node);
2361 __dlm_print_one_lock_resource(res);
2362 }
2363 ret = 0;
2364 goto done;
2365 }
2366
2367 item = kzalloc(sizeof(*item), GFP_NOFS);
2368 if (!item) {
2369 ret = -ENOMEM;
2370 mlog_errno(ret);
2371 goto done;
2372 }
2373
2374 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2375 item->u.dl.deref_res = res;
2376 item->u.dl.deref_node = node;
2377
2378 spin_lock(&dlm->work_lock);
2379 list_add_tail(&item->list, &dlm->work_list);
2380 spin_unlock(&dlm->work_lock);
2381
2382 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2383 return 0;
2384
2385done:
2386 if (res)
2387 dlm_lockres_put(res);
2388 dlm_put(dlm);
2389
2390 return ret;
2391}
2392
2393static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2394{
2395 struct dlm_ctxt *dlm;
2396 struct dlm_lock_resource *res;
2397 u8 node;
2398 u8 cleared = 0;
2399
2400 dlm = item->dlm;
2401 res = item->u.dl.deref_res;
2402 node = item->u.dl.deref_node;
2403
2404 spin_lock(&res->spinlock);
2405 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2406 if (test_bit(node, res->refmap)) {
2407 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2408 dlm_lockres_clear_refmap_bit(node, res);
2409 cleared = 1;
2410 }
2411 spin_unlock(&res->spinlock);
2412
2413 if (cleared) {
2414 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2415 dlm->name, res->lockname.len, res->lockname.name, node);
2416 dlm_lockres_calc_usage(dlm, res);
2417 } else {
2418 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2419 "but it is already dropped!\n", dlm->name,
2420 res->lockname.len, res->lockname.name, node);
2421 __dlm_print_one_lock_resource(res);
2422 }
2423
2424 dlm_lockres_put(res);
2425}
2426
2100 2427
2101/* 2428/*
2102 * DLM_MIGRATE_LOCKRES 2429 * DLM_MIGRATE_LOCKRES
2103 */ 2430 */
2104 2431
2105 2432
2106int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 2433static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2107 u8 target) 2434 struct dlm_lock_resource *res,
2435 u8 target)
2108{ 2436{
2109 struct dlm_master_list_entry *mle = NULL; 2437 struct dlm_master_list_entry *mle = NULL;
2110 struct dlm_master_list_entry *oldmle = NULL; 2438 struct dlm_master_list_entry *oldmle = NULL;
@@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2116 struct list_head *queue, *iter; 2444 struct list_head *queue, *iter;
2117 int i; 2445 int i;
2118 struct dlm_lock *lock; 2446 struct dlm_lock *lock;
2119 int empty = 1; 2447 int empty = 1, wake = 0;
2120 2448
2121 if (!dlm_grab(dlm)) 2449 if (!dlm_grab(dlm))
2122 return -EINVAL; 2450 return -EINVAL;
@@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2241 res->lockname.name, target); 2569 res->lockname.name, target);
2242 spin_lock(&res->spinlock); 2570 spin_lock(&res->spinlock);
2243 res->state &= ~DLM_LOCK_RES_MIGRATING; 2571 res->state &= ~DLM_LOCK_RES_MIGRATING;
2572 wake = 1;
2244 spin_unlock(&res->spinlock); 2573 spin_unlock(&res->spinlock);
2245 ret = -EINVAL; 2574 ret = -EINVAL;
2246 } 2575 }
@@ -2268,6 +2597,9 @@ fail:
2268 * the lockres 2597 * the lockres
2269 */ 2598 */
2270 2599
2600 /* now that remote nodes are spinning on the MIGRATING flag,
2601 * ensure that all assert_master work is flushed. */
2602 flush_workqueue(dlm->dlm_worker);
2271 2603
2272 /* get an extra reference on the mle. 2604 /* get an extra reference on the mle.
2273 * otherwise the assert_master from the new 2605 * otherwise the assert_master from the new
@@ -2296,6 +2628,7 @@ fail:
2296 dlm_put_mle_inuse(mle); 2628 dlm_put_mle_inuse(mle);
2297 spin_lock(&res->spinlock); 2629 spin_lock(&res->spinlock);
2298 res->state &= ~DLM_LOCK_RES_MIGRATING; 2630 res->state &= ~DLM_LOCK_RES_MIGRATING;
2631 wake = 1;
2299 spin_unlock(&res->spinlock); 2632 spin_unlock(&res->spinlock);
2300 goto leave; 2633 goto leave;
2301 } 2634 }
@@ -2322,7 +2655,8 @@ fail:
2322 res->owner == target) 2655 res->owner == target)
2323 break; 2656 break;
2324 2657
2325 mlog(0, "timed out during migration\n"); 2658 mlog(0, "%s:%.*s: timed out during migration\n",
2659 dlm->name, res->lockname.len, res->lockname.name);
2326 /* avoid hang during shutdown when migrating lockres 2660 /* avoid hang during shutdown when migrating lockres
2327 * to a node which also goes down */ 2661 * to a node which also goes down */
2328 if (dlm_is_node_dead(dlm, target)) { 2662 if (dlm_is_node_dead(dlm, target)) {
@@ -2330,20 +2664,20 @@ fail:
2330 "target %u is no longer up, restarting\n", 2664 "target %u is no longer up, restarting\n",
2331 dlm->name, res->lockname.len, 2665 dlm->name, res->lockname.len,
2332 res->lockname.name, target); 2666 res->lockname.name, target);
2333 ret = -ERESTARTSYS; 2667 ret = -EINVAL;
2668 /* migration failed, detach and clean up mle */
2669 dlm_mle_detach_hb_events(dlm, mle);
2670 dlm_put_mle(mle);
2671 dlm_put_mle_inuse(mle);
2672 spin_lock(&res->spinlock);
2673 res->state &= ~DLM_LOCK_RES_MIGRATING;
2674 wake = 1;
2675 spin_unlock(&res->spinlock);
2676 goto leave;
2334 } 2677 }
2335 } 2678 } else
2336 if (ret == -ERESTARTSYS) { 2679 mlog(0, "%s:%.*s: caught signal during migration\n",
2337 /* migration failed, detach and clean up mle */ 2680 dlm->name, res->lockname.len, res->lockname.name);
2338 dlm_mle_detach_hb_events(dlm, mle);
2339 dlm_put_mle(mle);
2340 dlm_put_mle_inuse(mle);
2341 spin_lock(&res->spinlock);
2342 res->state &= ~DLM_LOCK_RES_MIGRATING;
2343 spin_unlock(&res->spinlock);
2344 goto leave;
2345 }
2346 /* TODO: if node died: stop, clean up, return error */
2347 } 2681 }
2348 2682
2349 /* all done, set the owner, clear the flag */ 2683 /* all done, set the owner, clear the flag */
@@ -2366,6 +2700,11 @@ leave:
2366 if (ret < 0) 2700 if (ret < 0)
2367 dlm_kick_thread(dlm, res); 2701 dlm_kick_thread(dlm, res);
2368 2702
2703 /* wake up waiters if the MIGRATING flag got set
2704 * but migration failed */
2705 if (wake)
2706 wake_up(&res->wq);
2707
2369 /* TODO: cleanup */ 2708 /* TODO: cleanup */
2370 if (mres) 2709 if (mres)
2371 free_page((unsigned long)mres); 2710 free_page((unsigned long)mres);
@@ -2376,6 +2715,53 @@ leave:
2376 return ret; 2715 return ret;
2377} 2716}
2378 2717
2718#define DLM_MIGRATION_RETRY_MS 100
2719
2720/* Should be called only after beginning the domain leave process.
2721 * There should not be any remaining locks on nonlocal lock resources,
2722 * and there should be no local locks left on locally mastered resources.
2723 *
2724 * Called with the dlm spinlock held, may drop it to do migration, but
2725 * will re-acquire before exit.
2726 *
2727 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
2728int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2729{
2730 int ret;
2731 int lock_dropped = 0;
2732
2733 if (res->owner != dlm->node_num) {
2734 if (!__dlm_lockres_unused(res)) {
2735 mlog(ML_ERROR, "%s:%.*s: this node is not master, "
2736 "trying to free this but locks remain\n",
2737 dlm->name, res->lockname.len, res->lockname.name);
2738 }
2739 goto leave;
2740 }
2741
2742 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2743 spin_unlock(&dlm->spinlock);
2744 lock_dropped = 1;
2745 while (1) {
2746 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
2747 if (ret >= 0)
2748 break;
2749 if (ret == -ENOTEMPTY) {
2750 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2751 res->lockname.len, res->lockname.name);
2752 BUG();
2753 }
2754
2755 mlog(0, "lockres %.*s: migrate failed, "
2756 "retrying\n", res->lockname.len,
2757 res->lockname.name);
2758 msleep(DLM_MIGRATION_RETRY_MS);
2759 }
2760 spin_lock(&dlm->spinlock);
2761leave:
2762 return lock_dropped;
2763}
2764
2379int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) 2765int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2380{ 2766{
2381 int ret; 2767 int ret;
@@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2405 return can_proceed; 2791 return can_proceed;
2406} 2792}
2407 2793
2408int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2794static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2795 struct dlm_lock_resource *res)
2409{ 2796{
2410 int ret; 2797 int ret;
2411 spin_lock(&res->spinlock); 2798 spin_lock(&res->spinlock);
@@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2434 __dlm_lockres_reserve_ast(res); 2821 __dlm_lockres_reserve_ast(res);
2435 spin_unlock(&res->spinlock); 2822 spin_unlock(&res->spinlock);
2436 2823
2437 /* now flush all the pending asts.. hang out for a bit */ 2824 /* now flush all the pending asts */
2438 dlm_kick_thread(dlm, res); 2825 dlm_kick_thread(dlm, res);
2826 /* before waiting on DIRTY, block processes which may
2827 * try to dirty the lockres before MIGRATING is set */
2828 spin_lock(&res->spinlock);
2829 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2830 res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2831 spin_unlock(&res->spinlock);
2832 /* now wait on any pending asts and the DIRTY state */
2439 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 2833 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2440 dlm_lockres_release_ast(dlm, res); 2834 dlm_lockres_release_ast(dlm, res);
2441 2835
@@ -2461,6 +2855,13 @@ again:
2461 mlog(0, "trying again...\n"); 2855 mlog(0, "trying again...\n");
2462 goto again; 2856 goto again;
2463 } 2857 }
2858 /* now that we are sure the MIGRATING state is there, drop
2859 * the unneded state which blocked threads trying to DIRTY */
2860 spin_lock(&res->spinlock);
2861 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2862 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2863 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2864 spin_unlock(&res->spinlock);
2464 2865
2465 /* did the target go down or die? */ 2866 /* did the target go down or die? */
2466 spin_lock(&dlm->spinlock); 2867 spin_lock(&dlm->spinlock);
@@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2490{ 2891{
2491 struct list_head *iter, *iter2; 2892 struct list_head *iter, *iter2;
2492 struct list_head *queue = &res->granted; 2893 struct list_head *queue = &res->granted;
2493 int i; 2894 int i, bit;
2494 struct dlm_lock *lock; 2895 struct dlm_lock *lock;
2495 2896
2496 assert_spin_locked(&res->spinlock); 2897 assert_spin_locked(&res->spinlock);
@@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2508 BUG_ON(!list_empty(&lock->bast_list)); 2909 BUG_ON(!list_empty(&lock->bast_list));
2509 BUG_ON(lock->ast_pending); 2910 BUG_ON(lock->ast_pending);
2510 BUG_ON(lock->bast_pending); 2911 BUG_ON(lock->bast_pending);
2912 dlm_lockres_clear_refmap_bit(lock->ml.node, res);
2511 list_del_init(&lock->list); 2913 list_del_init(&lock->list);
2512 dlm_lock_put(lock); 2914 dlm_lock_put(lock);
2513 } 2915 }
2514 } 2916 }
2515 queue++; 2917 queue++;
2516 } 2918 }
2919 bit = 0;
2920 while (1) {
2921 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2922 if (bit >= O2NM_MAX_NODES)
2923 break;
2924 /* do not clear the local node reference, if there is a
2925 * process holding this, let it drop the ref itself */
2926 if (bit != dlm->node_num) {
2927 mlog(0, "%s:%.*s: node %u had a ref to this "
2928 "migrating lockres, clearing\n", dlm->name,
2929 res->lockname.len, res->lockname.name, bit);
2930 dlm_lockres_clear_refmap_bit(bit, res);
2931 }
2932 bit++;
2933 }
2517} 2934}
2518 2935
2519/* for now this is not too intelligent. we will 2936/* for now this is not too intelligent. we will
@@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2601 mlog(0, "migrate request (node %u) returned %d!\n", 3018 mlog(0, "migrate request (node %u) returned %d!\n",
2602 nodenum, status); 3019 nodenum, status);
2603 ret = status; 3020 ret = status;
3021 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
3022 /* during the migration request we short-circuited
3023 * the mastery of the lockres. make sure we have
3024 * a mastery ref for nodenum */
3025 mlog(0, "%s:%.*s: need ref for node %u\n",
3026 dlm->name, res->lockname.len, res->lockname.name,
3027 nodenum);
3028 spin_lock(&res->spinlock);
3029 dlm_lockres_set_refmap_bit(nodenum, res);
3030 spin_unlock(&res->spinlock);
2604 } 3031 }
2605 } 3032 }
2606 3033
@@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2619 * we will have no mle in the list to start with. now we can add an mle for 3046 * we will have no mle in the list to start with. now we can add an mle for
2620 * the migration and this should be the only one found for those scanning the 3047 * the migration and this should be the only one found for those scanning the
2621 * list. */ 3048 * list. */
2622int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) 3049int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3050 void **ret_data)
2623{ 3051{
2624 struct dlm_ctxt *dlm = data; 3052 struct dlm_ctxt *dlm = data;
2625 struct dlm_lock_resource *res = NULL; 3053 struct dlm_lock_resource *res = NULL;
@@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2745 /* remove it from the list so that only one 3173 /* remove it from the list so that only one
2746 * mle will be found */ 3174 * mle will be found */
2747 list_del_init(&tmp->list); 3175 list_del_init(&tmp->list);
2748 __dlm_mle_detach_hb_events(dlm, mle); 3176 /* this was obviously WRONG. mle is uninited here. should be tmp. */
3177 __dlm_mle_detach_hb_events(dlm, tmp);
3178 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3179 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3180 "telling master to get ref for cleared out mle "
3181 "during migration\n", dlm->name, namelen, name,
3182 master, new_master);
2749 } 3183 }
2750 spin_unlock(&tmp->spinlock); 3184 spin_unlock(&tmp->spinlock);
2751 } 3185 }
@@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2753 /* now add a migration mle to the tail of the list */ 3187 /* now add a migration mle to the tail of the list */
2754 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); 3188 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
2755 mle->new_master = new_master; 3189 mle->new_master = new_master;
3190 /* the new master will be sending an assert master for this.
3191 * at that point we will get the refmap reference */
2756 mle->master = master; 3192 mle->master = master;
2757 /* do this for consistency with other mle types */ 3193 /* do this for consistency with other mle types */
2758 set_bit(new_master, mle->maybe_map); 3194 set_bit(new_master, mle->maybe_map);
@@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2902 clear_bit(dlm->node_num, iter.node_map); 3338 clear_bit(dlm->node_num, iter.node_map);
2903 spin_unlock(&dlm->spinlock); 3339 spin_unlock(&dlm->spinlock);
2904 3340
3341 /* ownership of the lockres is changing. account for the
3342 * mastery reference here since old_master will briefly have
3343 * a reference after the migration completes */
3344 spin_lock(&res->spinlock);
3345 dlm_lockres_set_refmap_bit(old_master, res);
3346 spin_unlock(&res->spinlock);
3347
2905 mlog(0, "now time to do a migrate request to other nodes\n"); 3348 mlog(0, "now time to do a migrate request to other nodes\n");
2906 ret = dlm_do_migrate_request(dlm, res, old_master, 3349 ret = dlm_do_migrate_request(dlm, res, old_master,
2907 dlm->node_num, &iter); 3350 dlm->node_num, &iter);
@@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2914 res->lockname.len, res->lockname.name); 3357 res->lockname.len, res->lockname.name);
2915 /* this call now finishes out the nodemap 3358 /* this call now finishes out the nodemap
2916 * even if one or more nodes die */ 3359 * even if one or more nodes die */
2917 ret = dlm_do_assert_master(dlm, res->lockname.name, 3360 ret = dlm_do_assert_master(dlm, res, iter.node_map,
2918 res->lockname.len, iter.node_map,
2919 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3361 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2920 if (ret < 0) { 3362 if (ret < 0) {
2921 /* no longer need to retry. all living nodes contacted. */ 3363 /* no longer need to retry. all living nodes contacted. */
@@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2927 set_bit(old_master, iter.node_map); 3369 set_bit(old_master, iter.node_map);
2928 mlog(0, "doing assert master of %.*s back to %u\n", 3370 mlog(0, "doing assert master of %.*s back to %u\n",
2929 res->lockname.len, res->lockname.name, old_master); 3371 res->lockname.len, res->lockname.name, old_master);
2930 ret = dlm_do_assert_master(dlm, res->lockname.name, 3372 ret = dlm_do_assert_master(dlm, res, iter.node_map,
2931 res->lockname.len, iter.node_map,
2932 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3373 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2933 if (ret < 0) { 3374 if (ret < 0) {
2934 mlog(0, "assert master to original master failed " 3375 mlog(0, "assert master to original master failed "
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 367a11e9e2ed..6d4a83d50152 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -163,9 +163,6 @@ void dlm_dispatch_work(struct work_struct *work)
163 dlm_workfunc_t *workfunc; 163 dlm_workfunc_t *workfunc;
164 int tot=0; 164 int tot=0;
165 165
166 if (!dlm_joined(dlm))
167 return;
168
169 spin_lock(&dlm->work_lock); 166 spin_lock(&dlm->work_lock);
170 list_splice_init(&dlm->work_list, &tmp_list); 167 list_splice_init(&dlm->work_list, &tmp_list);
171 spin_unlock(&dlm->work_lock); 168 spin_unlock(&dlm->work_lock);
@@ -821,7 +818,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
821 818
822} 819}
823 820
824int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) 821int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
822 void **ret_data)
825{ 823{
826 struct dlm_ctxt *dlm = data; 824 struct dlm_ctxt *dlm = data;
827 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; 825 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
@@ -978,7 +976,8 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
978} 976}
979 977
980 978
981int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) 979int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
980 void **ret_data)
982{ 981{
983 struct dlm_ctxt *dlm = data; 982 struct dlm_ctxt *dlm = data;
984 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; 983 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
@@ -1129,6 +1128,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1129 if (total_locks == mres_total_locks) 1128 if (total_locks == mres_total_locks)
1130 mres->flags |= DLM_MRES_ALL_DONE; 1129 mres->flags |= DLM_MRES_ALL_DONE;
1131 1130
1131 mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
1132 dlm->name, res->lockname.len, res->lockname.name,
1133 orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
1134 send_to);
1135
1132 /* send it */ 1136 /* send it */
1133 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, 1137 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
1134 sz, send_to, &status); 1138 sz, send_to, &status);
@@ -1213,6 +1217,34 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
1213 return 0; 1217 return 0;
1214} 1218}
1215 1219
1220static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
1221 struct dlm_migratable_lockres *mres)
1222{
1223 struct dlm_lock dummy;
1224 memset(&dummy, 0, sizeof(dummy));
1225 dummy.ml.cookie = 0;
1226 dummy.ml.type = LKM_IVMODE;
1227 dummy.ml.convert_type = LKM_IVMODE;
1228 dummy.ml.highest_blocked = LKM_IVMODE;
1229 dummy.lksb = NULL;
1230 dummy.ml.node = dlm->node_num;
1231 dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);
1232}
1233
1234static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
1235 struct dlm_migratable_lock *ml,
1236 u8 *nodenum)
1237{
1238 if (unlikely(ml->cookie == 0 &&
1239 ml->type == LKM_IVMODE &&
1240 ml->convert_type == LKM_IVMODE &&
1241 ml->highest_blocked == LKM_IVMODE &&
1242 ml->list == DLM_BLOCKED_LIST)) {
1243 *nodenum = ml->node;
1244 return 1;
1245 }
1246 return 0;
1247}
1216 1248
1217int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 1249int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1218 struct dlm_migratable_lockres *mres, 1250 struct dlm_migratable_lockres *mres,
@@ -1260,6 +1292,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1260 goto error; 1292 goto error;
1261 } 1293 }
1262 } 1294 }
1295 if (total_locks == 0) {
1296 /* send a dummy lock to indicate a mastery reference only */
1297 mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
1298 dlm->name, res->lockname.len, res->lockname.name,
1299 send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
1300 "migration");
1301 dlm_add_dummy_lock(dlm, mres);
1302 }
1263 /* flush any remaining locks */ 1303 /* flush any remaining locks */
1264 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); 1304 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1265 if (ret < 0) 1305 if (ret < 0)
@@ -1293,7 +1333,8 @@ error:
1293 * do we spin? returning an error only delays the problem really 1333 * do we spin? returning an error only delays the problem really
1294 */ 1334 */
1295 1335
1296int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) 1336int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1337 void **ret_data)
1297{ 1338{
1298 struct dlm_ctxt *dlm = data; 1339 struct dlm_ctxt *dlm = data;
1299 struct dlm_migratable_lockres *mres = 1340 struct dlm_migratable_lockres *mres =
@@ -1382,17 +1423,21 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
1382 spin_lock(&res->spinlock); 1423 spin_lock(&res->spinlock);
1383 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1424 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1384 spin_unlock(&res->spinlock); 1425 spin_unlock(&res->spinlock);
1426 wake_up(&res->wq);
1385 1427
1386 /* add an extra ref for just-allocated lockres 1428 /* add an extra ref for just-allocated lockres
1387 * otherwise the lockres will be purged immediately */ 1429 * otherwise the lockres will be purged immediately */
1388 dlm_lockres_get(res); 1430 dlm_lockres_get(res);
1389
1390 } 1431 }
1391 1432
1392 /* at this point we have allocated everything we need, 1433 /* at this point we have allocated everything we need,
1393 * and we have a hashed lockres with an extra ref and 1434 * and we have a hashed lockres with an extra ref and
1394 * the proper res->state flags. */ 1435 * the proper res->state flags. */
1395 ret = 0; 1436 ret = 0;
1437 spin_lock(&res->spinlock);
1438 /* drop this either when master requery finds a different master
1439 * or when a lock is added by the recovery worker */
1440 dlm_lockres_grab_inflight_ref(dlm, res);
1396 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { 1441 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1397 /* migration cannot have an unknown master */ 1442 /* migration cannot have an unknown master */
1398 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); 1443 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
@@ -1400,10 +1445,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
1400 "unknown owner.. will need to requery: " 1445 "unknown owner.. will need to requery: "
1401 "%.*s\n", mres->lockname_len, mres->lockname); 1446 "%.*s\n", mres->lockname_len, mres->lockname);
1402 } else { 1447 } else {
1403 spin_lock(&res->spinlock); 1448 /* take a reference now to pin the lockres, drop it
1449 * when locks are added in the worker */
1404 dlm_change_lockres_owner(dlm, res, dlm->node_num); 1450 dlm_change_lockres_owner(dlm, res, dlm->node_num);
1405 spin_unlock(&res->spinlock);
1406 } 1451 }
1452 spin_unlock(&res->spinlock);
1407 1453
1408 /* queue up work for dlm_mig_lockres_worker */ 1454 /* queue up work for dlm_mig_lockres_worker */
1409 dlm_grab(dlm); /* get an extra ref for the work item */ 1455 dlm_grab(dlm); /* get an extra ref for the work item */
@@ -1459,6 +1505,9 @@ again:
1459 "this node will take it.\n", 1505 "this node will take it.\n",
1460 res->lockname.len, res->lockname.name); 1506 res->lockname.len, res->lockname.name);
1461 } else { 1507 } else {
1508 spin_lock(&res->spinlock);
1509 dlm_lockres_drop_inflight_ref(dlm, res);
1510 spin_unlock(&res->spinlock);
1462 mlog(0, "master needs to respond to sender " 1511 mlog(0, "master needs to respond to sender "
1463 "that node %u still owns %.*s\n", 1512 "that node %u still owns %.*s\n",
1464 real_master, res->lockname.len, 1513 real_master, res->lockname.len,
@@ -1578,7 +1627,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1578/* this function cannot error, so unless the sending 1627/* this function cannot error, so unless the sending
1579 * or receiving of the message failed, the owner can 1628 * or receiving of the message failed, the owner can
1580 * be trusted */ 1629 * be trusted */
1581int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) 1630int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1631 void **ret_data)
1582{ 1632{
1583 struct dlm_ctxt *dlm = data; 1633 struct dlm_ctxt *dlm = data;
1584 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; 1634 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
@@ -1660,21 +1710,38 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1660{ 1710{
1661 struct dlm_migratable_lock *ml; 1711 struct dlm_migratable_lock *ml;
1662 struct list_head *queue; 1712 struct list_head *queue;
1713 struct list_head *tmpq = NULL;
1663 struct dlm_lock *newlock = NULL; 1714 struct dlm_lock *newlock = NULL;
1664 struct dlm_lockstatus *lksb = NULL; 1715 struct dlm_lockstatus *lksb = NULL;
1665 int ret = 0; 1716 int ret = 0;
1666 int i, bad; 1717 int i, j, bad;
1667 struct list_head *iter; 1718 struct list_head *iter;
1668 struct dlm_lock *lock = NULL; 1719 struct dlm_lock *lock = NULL;
1720 u8 from = O2NM_MAX_NODES;
1721 unsigned int added = 0;
1669 1722
1670 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1723 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1671 for (i=0; i<mres->num_locks; i++) { 1724 for (i=0; i<mres->num_locks; i++) {
1672 ml = &(mres->ml[i]); 1725 ml = &(mres->ml[i]);
1726
1727 if (dlm_is_dummy_lock(dlm, ml, &from)) {
1728 /* placeholder, just need to set the refmap bit */
1729 BUG_ON(mres->num_locks != 1);
1730 mlog(0, "%s:%.*s: dummy lock for %u\n",
1731 dlm->name, mres->lockname_len, mres->lockname,
1732 from);
1733 spin_lock(&res->spinlock);
1734 dlm_lockres_set_refmap_bit(from, res);
1735 spin_unlock(&res->spinlock);
1736 added++;
1737 break;
1738 }
1673 BUG_ON(ml->highest_blocked != LKM_IVMODE); 1739 BUG_ON(ml->highest_blocked != LKM_IVMODE);
1674 newlock = NULL; 1740 newlock = NULL;
1675 lksb = NULL; 1741 lksb = NULL;
1676 1742
1677 queue = dlm_list_num_to_pointer(res, ml->list); 1743 queue = dlm_list_num_to_pointer(res, ml->list);
1744 tmpq = NULL;
1678 1745
1679 /* if the lock is for the local node it needs to 1746 /* if the lock is for the local node it needs to
1680 * be moved to the proper location within the queue. 1747 * be moved to the proper location within the queue.
@@ -1684,11 +1751,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1684 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); 1751 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1685 1752
1686 spin_lock(&res->spinlock); 1753 spin_lock(&res->spinlock);
1687 list_for_each(iter, queue) { 1754 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1688 lock = list_entry (iter, struct dlm_lock, list); 1755 tmpq = dlm_list_idx_to_ptr(res, j);
1689 if (lock->ml.cookie != ml->cookie) 1756 list_for_each(iter, tmpq) {
1690 lock = NULL; 1757 lock = list_entry (iter, struct dlm_lock, list);
1691 else 1758 if (lock->ml.cookie != ml->cookie)
1759 lock = NULL;
1760 else
1761 break;
1762 }
1763 if (lock)
1692 break; 1764 break;
1693 } 1765 }
1694 1766
@@ -1698,12 +1770,20 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1698 u64 c = ml->cookie; 1770 u64 c = ml->cookie;
1699 mlog(ML_ERROR, "could not find local lock " 1771 mlog(ML_ERROR, "could not find local lock "
1700 "with cookie %u:%llu!\n", 1772 "with cookie %u:%llu!\n",
1701 dlm_get_lock_cookie_node(c), 1773 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1702 dlm_get_lock_cookie_seq(c)); 1774 dlm_get_lock_cookie_seq(be64_to_cpu(c)));
1775 __dlm_print_one_lock_resource(res);
1703 BUG(); 1776 BUG();
1704 } 1777 }
1705 BUG_ON(lock->ml.node != ml->node); 1778 BUG_ON(lock->ml.node != ml->node);
1706 1779
1780 if (tmpq != queue) {
1781 mlog(0, "lock was on %u instead of %u for %.*s\n",
1782 j, ml->list, res->lockname.len, res->lockname.name);
1783 spin_unlock(&res->spinlock);
1784 continue;
1785 }
1786
1707 /* see NOTE above about why we do not update 1787 /* see NOTE above about why we do not update
1708 * to match the master here */ 1788 * to match the master here */
1709 1789
@@ -1711,6 +1791,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1711 /* do not alter lock refcount. switching lists. */ 1791 /* do not alter lock refcount. switching lists. */
1712 list_move_tail(&lock->list, queue); 1792 list_move_tail(&lock->list, queue);
1713 spin_unlock(&res->spinlock); 1793 spin_unlock(&res->spinlock);
1794 added++;
1714 1795
1715 mlog(0, "just reordered a local lock!\n"); 1796 mlog(0, "just reordered a local lock!\n");
1716 continue; 1797 continue;
@@ -1799,14 +1880,14 @@ skip_lvb:
1799 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1880 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1800 "exists on this lockres!\n", dlm->name, 1881 "exists on this lockres!\n", dlm->name,
1801 res->lockname.len, res->lockname.name, 1882 res->lockname.len, res->lockname.name,
1802 dlm_get_lock_cookie_node(c), 1883 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1803 dlm_get_lock_cookie_seq(c)); 1884 dlm_get_lock_cookie_seq(be64_to_cpu(c)));
1804 1885
1805 mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " 1886 mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
1806 "node=%u, cookie=%u:%llu, queue=%d\n", 1887 "node=%u, cookie=%u:%llu, queue=%d\n",
1807 ml->type, ml->convert_type, ml->node, 1888 ml->type, ml->convert_type, ml->node,
1808 dlm_get_lock_cookie_node(ml->cookie), 1889 dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
1809 dlm_get_lock_cookie_seq(ml->cookie), 1890 dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
1810 ml->list); 1891 ml->list);
1811 1892
1812 __dlm_print_one_lock_resource(res); 1893 __dlm_print_one_lock_resource(res);
@@ -1817,12 +1898,22 @@ skip_lvb:
1817 if (!bad) { 1898 if (!bad) {
1818 dlm_lock_get(newlock); 1899 dlm_lock_get(newlock);
1819 list_add_tail(&newlock->list, queue); 1900 list_add_tail(&newlock->list, queue);
1901 mlog(0, "%s:%.*s: added lock for node %u, "
1902 "setting refmap bit\n", dlm->name,
1903 res->lockname.len, res->lockname.name, ml->node);
1904 dlm_lockres_set_refmap_bit(ml->node, res);
1905 added++;
1820 } 1906 }
1821 spin_unlock(&res->spinlock); 1907 spin_unlock(&res->spinlock);
1822 } 1908 }
1823 mlog(0, "done running all the locks\n"); 1909 mlog(0, "done running all the locks\n");
1824 1910
1825leave: 1911leave:
1912 /* balance the ref taken when the work was queued */
1913 spin_lock(&res->spinlock);
1914 dlm_lockres_drop_inflight_ref(dlm, res);
1915 spin_unlock(&res->spinlock);
1916
1826 if (ret < 0) { 1917 if (ret < 0) {
1827 mlog_errno(ret); 1918 mlog_errno(ret);
1828 if (newlock) 1919 if (newlock)
@@ -1935,9 +2026,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1935 if (res->owner == dead_node) { 2026 if (res->owner == dead_node) {
1936 list_del_init(&res->recovering); 2027 list_del_init(&res->recovering);
1937 spin_lock(&res->spinlock); 2028 spin_lock(&res->spinlock);
2029 /* new_master has our reference from
2030 * the lock state sent during recovery */
1938 dlm_change_lockres_owner(dlm, res, new_master); 2031 dlm_change_lockres_owner(dlm, res, new_master);
1939 res->state &= ~DLM_LOCK_RES_RECOVERING; 2032 res->state &= ~DLM_LOCK_RES_RECOVERING;
1940 if (!__dlm_lockres_unused(res)) 2033 if (__dlm_lockres_has_locks(res))
1941 __dlm_dirty_lockres(dlm, res); 2034 __dlm_dirty_lockres(dlm, res);
1942 spin_unlock(&res->spinlock); 2035 spin_unlock(&res->spinlock);
1943 wake_up(&res->wq); 2036 wake_up(&res->wq);
@@ -1977,9 +2070,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1977 dlm_lockres_put(res); 2070 dlm_lockres_put(res);
1978 } 2071 }
1979 spin_lock(&res->spinlock); 2072 spin_lock(&res->spinlock);
2073 /* new_master has our reference from
2074 * the lock state sent during recovery */
1980 dlm_change_lockres_owner(dlm, res, new_master); 2075 dlm_change_lockres_owner(dlm, res, new_master);
1981 res->state &= ~DLM_LOCK_RES_RECOVERING; 2076 res->state &= ~DLM_LOCK_RES_RECOVERING;
1982 if (!__dlm_lockres_unused(res)) 2077 if (__dlm_lockres_has_locks(res))
1983 __dlm_dirty_lockres(dlm, res); 2078 __dlm_dirty_lockres(dlm, res);
1984 spin_unlock(&res->spinlock); 2079 spin_unlock(&res->spinlock);
1985 wake_up(&res->wq); 2080 wake_up(&res->wq);
@@ -2048,6 +2143,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2048{ 2143{
2049 struct list_head *iter, *tmpiter; 2144 struct list_head *iter, *tmpiter;
2050 struct dlm_lock *lock; 2145 struct dlm_lock *lock;
2146 unsigned int freed = 0;
2051 2147
2052 /* this node is the lockres master: 2148 /* this node is the lockres master:
2053 * 1) remove any stale locks for the dead node 2149 * 1) remove any stale locks for the dead node
@@ -2062,6 +2158,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2062 if (lock->ml.node == dead_node) { 2158 if (lock->ml.node == dead_node) {
2063 list_del_init(&lock->list); 2159 list_del_init(&lock->list);
2064 dlm_lock_put(lock); 2160 dlm_lock_put(lock);
2161 freed++;
2065 } 2162 }
2066 } 2163 }
2067 list_for_each_safe(iter, tmpiter, &res->converting) { 2164 list_for_each_safe(iter, tmpiter, &res->converting) {
@@ -2069,6 +2166,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2069 if (lock->ml.node == dead_node) { 2166 if (lock->ml.node == dead_node) {
2070 list_del_init(&lock->list); 2167 list_del_init(&lock->list);
2071 dlm_lock_put(lock); 2168 dlm_lock_put(lock);
2169 freed++;
2072 } 2170 }
2073 } 2171 }
2074 list_for_each_safe(iter, tmpiter, &res->blocked) { 2172 list_for_each_safe(iter, tmpiter, &res->blocked) {
@@ -2076,9 +2174,23 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2076 if (lock->ml.node == dead_node) { 2174 if (lock->ml.node == dead_node) {
2077 list_del_init(&lock->list); 2175 list_del_init(&lock->list);
2078 dlm_lock_put(lock); 2176 dlm_lock_put(lock);
2177 freed++;
2079 } 2178 }
2080 } 2179 }
2081 2180
2181 if (freed) {
2182 mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2183 "dropping ref from lockres\n", dlm->name,
2184 res->lockname.len, res->lockname.name, freed, dead_node);
2185 BUG_ON(!test_bit(dead_node, res->refmap));
2186 dlm_lockres_clear_refmap_bit(dead_node, res);
2187 } else if (test_bit(dead_node, res->refmap)) {
2188 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2189 "no locks and had not purged before dying\n", dlm->name,
2190 res->lockname.len, res->lockname.name, dead_node);
2191 dlm_lockres_clear_refmap_bit(dead_node, res);
2192 }
2193
2082 /* do not kick thread yet */ 2194 /* do not kick thread yet */
2083 __dlm_dirty_lockres(dlm, res); 2195 __dlm_dirty_lockres(dlm, res);
2084} 2196}
@@ -2141,9 +2253,21 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2141 spin_lock(&res->spinlock); 2253 spin_lock(&res->spinlock);
2142 /* zero the lvb if necessary */ 2254 /* zero the lvb if necessary */
2143 dlm_revalidate_lvb(dlm, res, dead_node); 2255 dlm_revalidate_lvb(dlm, res, dead_node);
2144 if (res->owner == dead_node) 2256 if (res->owner == dead_node) {
2257 if (res->state & DLM_LOCK_RES_DROPPING_REF)
2258 mlog(0, "%s:%.*s: owned by "
2259 "dead node %u, this node was "
2260 "dropping its ref when it died. "
2261 "continue, dropping the flag.\n",
2262 dlm->name, res->lockname.len,
2263 res->lockname.name, dead_node);
2264
2265 /* the wake_up for this will happen when the
2266 * RECOVERING flag is dropped later */
2267 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2268
2145 dlm_move_lockres_to_recovery_list(dlm, res); 2269 dlm_move_lockres_to_recovery_list(dlm, res);
2146 else if (res->owner == dlm->node_num) { 2270 } else if (res->owner == dlm->node_num) {
2147 dlm_free_dead_locks(dlm, res, dead_node); 2271 dlm_free_dead_locks(dlm, res, dead_node);
2148 __dlm_lockres_calc_usage(dlm, res); 2272 __dlm_lockres_calc_usage(dlm, res);
2149 } 2273 }
@@ -2480,7 +2604,8 @@ retry:
2480 return ret; 2604 return ret;
2481} 2605}
2482 2606
2483int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) 2607int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2608 void **ret_data)
2484{ 2609{
2485 struct dlm_ctxt *dlm = data; 2610 struct dlm_ctxt *dlm = data;
2486 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; 2611 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
@@ -2608,7 +2733,8 @@ stage2:
2608 return ret; 2733 return ret;
2609} 2734}
2610 2735
2611int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) 2736int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2737 void **ret_data)
2612{ 2738{
2613 struct dlm_ctxt *dlm = data; 2739 struct dlm_ctxt *dlm = data;
2614 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; 2740 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 0c822f3ffb05..8ffa0916eb86 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -54,9 +54,6 @@
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static int dlm_thread(void *data); 56static int dlm_thread(void *data);
57static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
58 struct dlm_lock_resource *lockres);
59
60static void dlm_flush_asts(struct dlm_ctxt *dlm); 57static void dlm_flush_asts(struct dlm_ctxt *dlm);
61 58
62#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) 59#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
@@ -82,14 +79,33 @@ repeat:
82 current->state = TASK_RUNNING; 79 current->state = TASK_RUNNING;
83} 80}
84 81
85 82int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
86int __dlm_lockres_unused(struct dlm_lock_resource *res)
87{ 83{
88 if (list_empty(&res->granted) && 84 if (list_empty(&res->granted) &&
89 list_empty(&res->converting) && 85 list_empty(&res->converting) &&
90 list_empty(&res->blocked) && 86 list_empty(&res->blocked))
91 list_empty(&res->dirty)) 87 return 0;
92 return 1; 88 return 1;
89}
90
91/* "unused": the lockres has no locks, is not on the dirty list,
92 * has no inflight locks (in the gap between mastery and acquiring
93 * the first lock), and has no bits in its refmap.
94 * truly ready to be freed. */
95int __dlm_lockres_unused(struct dlm_lock_resource *res)
96{
97 if (!__dlm_lockres_has_locks(res) &&
98 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
99 /* try not to scan the bitmap unless the first two
100 * conditions are already true */
101 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
102 if (bit >= O2NM_MAX_NODES) {
103 /* since the bit for dlm->node_num is not
104 * set, inflight_locks better be zero */
105 BUG_ON(res->inflight_locks != 0);
106 return 1;
107 }
108 }
93 return 0; 109 return 0;
94} 110}
95 111
@@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
106 assert_spin_locked(&res->spinlock); 122 assert_spin_locked(&res->spinlock);
107 123
108 if (__dlm_lockres_unused(res)){ 124 if (__dlm_lockres_unused(res)){
109 /* For now, just keep any resource we master */
110 if (res->owner == dlm->node_num)
111 {
112 if (!list_empty(&res->purge)) {
113 mlog(0, "we master %s:%.*s, but it is on "
114 "the purge list. Removing\n",
115 dlm->name, res->lockname.len,
116 res->lockname.name);
117 list_del_init(&res->purge);
118 dlm->purge_count--;
119 }
120 return;
121 }
122
123 if (list_empty(&res->purge)) { 125 if (list_empty(&res->purge)) {
124 mlog(0, "putting lockres %.*s from purge list\n", 126 mlog(0, "putting lockres %.*s:%p onto purge list\n",
125 res->lockname.len, res->lockname.name); 127 res->lockname.len, res->lockname.name, res);
126 128
127 res->last_used = jiffies; 129 res->last_used = jiffies;
130 dlm_lockres_get(res);
128 list_add_tail(&res->purge, &dlm->purge_list); 131 list_add_tail(&res->purge, &dlm->purge_list);
129 dlm->purge_count++; 132 dlm->purge_count++;
130
131 /* if this node is not the owner, there is
132 * no way to keep track of who the owner could be.
133 * unhash it to avoid serious problems. */
134 if (res->owner != dlm->node_num) {
135 mlog(0, "%s:%.*s: doing immediate "
136 "purge of lockres owned by %u\n",
137 dlm->name, res->lockname.len,
138 res->lockname.name, res->owner);
139
140 dlm_purge_lockres_now(dlm, res);
141 }
142 } 133 }
143 } else if (!list_empty(&res->purge)) { 134 } else if (!list_empty(&res->purge)) {
144 mlog(0, "removing lockres %.*s from purge list, " 135 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
145 "owner=%u\n", res->lockname.len, res->lockname.name, 136 res->lockname.len, res->lockname.name, res, res->owner);
146 res->owner);
147 137
148 list_del_init(&res->purge); 138 list_del_init(&res->purge);
139 dlm_lockres_put(res);
149 dlm->purge_count--; 140 dlm->purge_count--;
150 } 141 }
151} 142}
@@ -163,68 +154,65 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
163 spin_unlock(&dlm->spinlock); 154 spin_unlock(&dlm->spinlock);
164} 155}
165 156
166/* TODO: Eventual API: Called with the dlm spinlock held, may drop it 157static int dlm_purge_lockres(struct dlm_ctxt *dlm,
167 * to do migration, but will re-acquire before exit. */ 158 struct dlm_lock_resource *res)
168void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
169{ 159{
170 int master; 160 int master;
171 int ret; 161 int ret = 0;
172
173 spin_lock(&lockres->spinlock);
174 master = lockres->owner == dlm->node_num;
175 spin_unlock(&lockres->spinlock);
176 162
177 mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, 163 spin_lock(&res->spinlock);
178 lockres->lockname.name, master); 164 if (!__dlm_lockres_unused(res)) {
179 165 spin_unlock(&res->spinlock);
180 /* Non master is the easy case -- no migration required, just 166 mlog(0, "%s:%.*s: tried to purge but not unused\n",
181 * quit. */ 167 dlm->name, res->lockname.len, res->lockname.name);
168 return -ENOTEMPTY;
169 }
170 master = (res->owner == dlm->node_num);
182 if (!master) 171 if (!master)
183 goto finish; 172 res->state |= DLM_LOCK_RES_DROPPING_REF;
184 173 spin_unlock(&res->spinlock);
185 /* Wheee! Migrate lockres here! */
186 spin_unlock(&dlm->spinlock);
187again:
188 174
189 ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 if (ret == -ENOTEMPTY) { 176 res->lockname.name, master);
191 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
192 lockres->lockname.len, lockres->lockname.name);
193 177
194 BUG(); 178 if (!master) {
195 } else if (ret < 0) { 179 spin_lock(&res->spinlock);
196 mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", 180 /* This ensures that clear refmap is sent after the set */
197 lockres->lockname.len, lockres->lockname.name); 181 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
198 msleep(100); 182 spin_unlock(&res->spinlock);
199 goto again; 183 /* drop spinlock to do messaging, retake below */
184 spin_unlock(&dlm->spinlock);
185 /* clear our bit from the master's refmap, ignore errors */
186 ret = dlm_drop_lockres_ref(dlm, res);
187 if (ret < 0) {
188 mlog_errno(ret);
189 if (!dlm_is_host_down(ret))
190 BUG();
191 }
192 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
193 dlm->name, res->lockname.len, res->lockname.name, ret);
194 spin_lock(&dlm->spinlock);
200 } 195 }
201 196
202 spin_lock(&dlm->spinlock); 197 if (!list_empty(&res->purge)) {
203 198 mlog(0, "removing lockres %.*s:%p from purgelist, "
204finish: 199 "master = %d\n", res->lockname.len, res->lockname.name,
205 if (!list_empty(&lockres->purge)) { 200 res, master);
206 list_del_init(&lockres->purge); 201 list_del_init(&res->purge);
202 dlm_lockres_put(res);
207 dlm->purge_count--; 203 dlm->purge_count--;
208 } 204 }
209 __dlm_unhash_lockres(lockres); 205 __dlm_unhash_lockres(res);
210}
211
212/* make an unused lockres go away immediately.
213 * as soon as the dlm spinlock is dropped, this lockres
214 * will not be found. kfree still happens on last put. */
215static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
216 struct dlm_lock_resource *lockres)
217{
218 assert_spin_locked(&dlm->spinlock);
219 assert_spin_locked(&lockres->spinlock);
220 206
221 BUG_ON(!__dlm_lockres_unused(lockres)); 207 /* lockres is not in the hash now. drop the flag and wake up
222 208 * any processes waiting in dlm_get_lock_resource. */
223 if (!list_empty(&lockres->purge)) { 209 if (!master) {
224 list_del_init(&lockres->purge); 210 spin_lock(&res->spinlock);
225 dlm->purge_count--; 211 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
212 spin_unlock(&res->spinlock);
213 wake_up(&res->wq);
226 } 214 }
227 __dlm_unhash_lockres(lockres); 215 return 0;
228} 216}
229 217
230static void dlm_run_purge_list(struct dlm_ctxt *dlm, 218static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -268,13 +256,17 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
268 break; 256 break;
269 } 257 }
270 258
259 mlog(0, "removing lockres %.*s:%p from purgelist\n",
260 lockres->lockname.len, lockres->lockname.name, lockres);
271 list_del_init(&lockres->purge); 261 list_del_init(&lockres->purge);
262 dlm_lockres_put(lockres);
272 dlm->purge_count--; 263 dlm->purge_count--;
273 264
274 /* This may drop and reacquire the dlm spinlock if it 265 /* This may drop and reacquire the dlm spinlock if it
275 * has to do migration. */ 266 * has to do migration. */
276 mlog(0, "calling dlm_purge_lockres!\n"); 267 mlog(0, "calling dlm_purge_lockres!\n");
277 dlm_purge_lockres(dlm, lockres); 268 if (dlm_purge_lockres(dlm, lockres))
269 BUG();
278 mlog(0, "DONE calling dlm_purge_lockres!\n"); 270 mlog(0, "DONE calling dlm_purge_lockres!\n");
279 271
280 /* Avoid adding any scheduling latencies */ 272 /* Avoid adding any scheduling latencies */
@@ -467,12 +459,17 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
467 assert_spin_locked(&res->spinlock); 459 assert_spin_locked(&res->spinlock);
468 460
469 /* don't shuffle secondary queues */ 461 /* don't shuffle secondary queues */
470 if ((res->owner == dlm->node_num) && 462 if ((res->owner == dlm->node_num)) {
471 !(res->state & DLM_LOCK_RES_DIRTY)) { 463 if (res->state & (DLM_LOCK_RES_MIGRATING |
472 /* ref for dirty_list */ 464 DLM_LOCK_RES_BLOCK_DIRTY))
473 dlm_lockres_get(res); 465 return;
474 list_add_tail(&res->dirty, &dlm->dirty_list); 466
475 res->state |= DLM_LOCK_RES_DIRTY; 467 if (list_empty(&res->dirty)) {
468 /* ref for dirty_list */
469 dlm_lockres_get(res);
470 list_add_tail(&res->dirty, &dlm->dirty_list);
471 res->state |= DLM_LOCK_RES_DIRTY;
472 }
476 } 473 }
477} 474}
478 475
@@ -651,7 +648,7 @@ static int dlm_thread(void *data)
651 dlm_lockres_get(res); 648 dlm_lockres_get(res);
652 649
653 spin_lock(&res->spinlock); 650 spin_lock(&res->spinlock);
654 res->state &= ~DLM_LOCK_RES_DIRTY; 651 /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
655 list_del_init(&res->dirty); 652 list_del_init(&res->dirty);
656 spin_unlock(&res->spinlock); 653 spin_unlock(&res->spinlock);
657 spin_unlock(&dlm->spinlock); 654 spin_unlock(&dlm->spinlock);
@@ -675,10 +672,11 @@ static int dlm_thread(void *data)
675 /* it is now ok to move lockreses in these states 672 /* it is now ok to move lockreses in these states
676 * to the dirty list, assuming that they will only be 673 * to the dirty list, assuming that they will only be
677 * dirty for a short while. */ 674 * dirty for a short while. */
675 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
678 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 676 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
679 DLM_LOCK_RES_MIGRATING |
680 DLM_LOCK_RES_RECOVERING)) { 677 DLM_LOCK_RES_RECOVERING)) {
681 /* move it to the tail and keep going */ 678 /* move it to the tail and keep going */
679 res->state &= ~DLM_LOCK_RES_DIRTY;
682 spin_unlock(&res->spinlock); 680 spin_unlock(&res->spinlock);
683 mlog(0, "delaying list shuffling for in-" 681 mlog(0, "delaying list shuffling for in-"
684 "progress lockres %.*s, state=%d\n", 682 "progress lockres %.*s, state=%d\n",
@@ -699,6 +697,7 @@ static int dlm_thread(void *data)
699 697
700 /* called while holding lockres lock */ 698 /* called while holding lockres lock */
701 dlm_shuffle_lists(dlm, res); 699 dlm_shuffle_lists(dlm, res);
700 res->state &= ~DLM_LOCK_RES_DIRTY;
702 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
703 702
704 dlm_lockres_calc_usage(dlm, res); 703 dlm_lockres_calc_usage(dlm, res);
@@ -709,11 +708,8 @@ in_progress:
709 /* if the lock was in-progress, stick 708 /* if the lock was in-progress, stick
710 * it on the back of the list */ 709 * it on the back of the list */
711 if (delay) { 710 if (delay) {
712 /* ref for dirty_list */
713 dlm_lockres_get(res);
714 spin_lock(&res->spinlock); 711 spin_lock(&res->spinlock);
715 list_add_tail(&res->dirty, &dlm->dirty_list); 712 __dlm_dirty_lockres(dlm, res);
716 res->state |= DLM_LOCK_RES_DIRTY;
717 spin_unlock(&res->spinlock); 713 spin_unlock(&res->spinlock);
718 } 714 }
719 dlm_lockres_put(res); 715 dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 37be4b2e0d4a..86ca085ef324 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -147,6 +147,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
147 goto leave; 147 goto leave;
148 } 148 }
149 149
150 if (res->state & DLM_LOCK_RES_MIGRATING) {
151 status = DLM_MIGRATING;
152 goto leave;
153 }
150 154
151 /* see above for what the spec says about 155 /* see above for what the spec says about
152 * LKM_CANCEL and the lock queue state */ 156 * LKM_CANCEL and the lock queue state */
@@ -244,8 +248,8 @@ leave:
244 /* this should always be coupled with list removal */ 248 /* this should always be coupled with list removal */
245 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); 249 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
246 mlog(0, "lock %u:%llu should be gone now! refs=%d\n", 250 mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
247 dlm_get_lock_cookie_node(lock->ml.cookie), 251 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
248 dlm_get_lock_cookie_seq(lock->ml.cookie), 252 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
249 atomic_read(&lock->lock_refs.refcount)-1); 253 atomic_read(&lock->lock_refs.refcount)-1);
250 dlm_lock_put(lock); 254 dlm_lock_put(lock);
251 } 255 }
@@ -379,7 +383,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
379 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, 383 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
380 * return value from dlmunlock_master 384 * return value from dlmunlock_master
381 */ 385 */
382int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) 386int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
387 void **ret_data)
383{ 388{
384 struct dlm_ctxt *dlm = data; 389 struct dlm_ctxt *dlm = data;
385 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; 390 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
@@ -502,8 +507,8 @@ not_found:
502 if (!found) 507 if (!found)
503 mlog(ML_ERROR, "failed to find lock to unlock! " 508 mlog(ML_ERROR, "failed to find lock to unlock! "
504 "cookie=%u:%llu\n", 509 "cookie=%u:%llu\n",
505 dlm_get_lock_cookie_node(unlock->cookie), 510 dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
506 dlm_get_lock_cookie_seq(unlock->cookie)); 511 dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
507 else 512 else
508 dlm_lock_put(lock); 513 dlm_lock_put(lock);
509 514
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index e1216364d191..d026b4f27757 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -306,8 +306,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
306 * for the dinode, one for the new block. */ 306 * for the dinode, one for the new block. */
307#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 307#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
308 308
309/* file update (nlink, etc) + dir entry block */ 309/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
310#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 310#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1)
311 311
312/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 312/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
313 * dir inode link */ 313 * dir inode link */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 0afd8b9af70f..f30e63b9910c 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -887,7 +887,7 @@ static inline int ocfs2_translate_response(int response)
887 887
888static int ocfs2_handle_response_message(struct o2net_msg *msg, 888static int ocfs2_handle_response_message(struct o2net_msg *msg,
889 u32 len, 889 u32 len,
890 void *data) 890 void *data, void **ret_data)
891{ 891{
892 unsigned int response_id, node_num; 892 unsigned int response_id, node_num;
893 int response_status; 893 int response_status;
@@ -943,7 +943,7 @@ bail:
943 943
944static int ocfs2_handle_vote_message(struct o2net_msg *msg, 944static int ocfs2_handle_vote_message(struct o2net_msg *msg,
945 u32 len, 945 u32 len,
946 void *data) 946 void *data, void **ret_data)
947{ 947{
948 int status; 948 int status;
949 struct ocfs2_super *osb = data; 949 struct ocfs2_super *osb = data;
@@ -1007,7 +1007,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1007 osb->net_key, 1007 osb->net_key,
1008 sizeof(struct ocfs2_response_msg), 1008 sizeof(struct ocfs2_response_msg),
1009 ocfs2_handle_response_message, 1009 ocfs2_handle_response_message,
1010 osb, &osb->osb_net_handlers); 1010 osb, NULL, &osb->osb_net_handlers);
1011 if (status) { 1011 if (status) {
1012 mlog_errno(status); 1012 mlog_errno(status);
1013 goto bail; 1013 goto bail;
@@ -1017,7 +1017,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1017 osb->net_key, 1017 osb->net_key,
1018 sizeof(struct ocfs2_vote_msg), 1018 sizeof(struct ocfs2_vote_msg),
1019 ocfs2_handle_vote_message, 1019 ocfs2_handle_vote_message,
1020 osb, &osb->osb_net_handlers); 1020 osb, NULL, &osb->osb_net_handlers);
1021 if (status) { 1021 if (status) {
1022 mlog_errno(status); 1022 mlog_errno(status);
1023 goto bail; 1023 goto bail;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e8f540d38d48..d3b9f5f07db1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19#include <asm/semaphore.h>
19 20
20#include "sysfs.h" 21#include "sysfs.h"
21 22
@@ -146,7 +147,7 @@ static int open(struct inode * inode, struct file * file)
146 Error: 147 Error:
147 module_put(attr->attr.owner); 148 module_put(attr->attr.owner);
148 Done: 149 Done:
149 if (error && kobj) 150 if (error)
150 kobject_put(kobj); 151 kobject_put(kobj);
151 return error; 152 return error;
152} 153}
@@ -157,8 +158,7 @@ static int release(struct inode * inode, struct file * file)
157 struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); 158 struct bin_attribute * attr = to_bin_attr(file->f_path.dentry);
158 u8 * buffer = file->private_data; 159 u8 * buffer = file->private_data;
159 160
160 if (kobj) 161 kobject_put(kobj);
161 kobject_put(kobj);
162 module_put(attr->attr.owner); 162 module_put(attr->attr.owner);
163 kfree(buffer); 163 kfree(buffer);
164 return 0; 164 return 0;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 511edef8b321..9dcdf556c99c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -9,6 +9,7 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/namei.h> 11#include <linux/namei.h>
12#include <asm/semaphore.h>
12#include "sysfs.h" 13#include "sysfs.h"
13 14
14DECLARE_RWSEM(sysfs_rename_sem); 15DECLARE_RWSEM(sysfs_rename_sem);
@@ -32,8 +33,7 @@ static struct dentry_operations sysfs_dentry_ops = {
32/* 33/*
33 * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent 34 * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent
34 */ 35 */
35static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, 36static struct sysfs_dirent * __sysfs_new_dirent(void * element)
36 void * element)
37{ 37{
38 struct sysfs_dirent * sd; 38 struct sysfs_dirent * sd;
39 39
@@ -45,12 +45,28 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
45 atomic_set(&sd->s_count, 1); 45 atomic_set(&sd->s_count, 1);
46 atomic_set(&sd->s_event, 1); 46 atomic_set(&sd->s_event, 1);
47 INIT_LIST_HEAD(&sd->s_children); 47 INIT_LIST_HEAD(&sd->s_children);
48 list_add(&sd->s_sibling, &parent_sd->s_children); 48 INIT_LIST_HEAD(&sd->s_sibling);
49 sd->s_element = element; 49 sd->s_element = element;
50 50
51 return sd; 51 return sd;
52} 52}
53 53
54static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd,
55 struct sysfs_dirent *sd)
56{
57 if (sd)
58 list_add(&sd->s_sibling, &parent_sd->s_children);
59}
60
61static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd,
62 void * element)
63{
64 struct sysfs_dirent *sd;
65 sd = __sysfs_new_dirent(element);
66 __sysfs_list_dirent(parent_sd, sd);
67 return sd;
68}
69
54/* 70/*
55 * 71 *
56 * Return -EEXIST if there is already a sysfs element with the same name for 72 * Return -EEXIST if there is already a sysfs element with the same name for
@@ -77,14 +93,14 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd,
77} 93}
78 94
79 95
80int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, 96static struct sysfs_dirent *
81 void * element, umode_t mode, int type) 97__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type)
82{ 98{
83 struct sysfs_dirent * sd; 99 struct sysfs_dirent * sd;
84 100
85 sd = sysfs_new_dirent(parent_sd, element); 101 sd = __sysfs_new_dirent(element);
86 if (!sd) 102 if (!sd)
87 return -ENOMEM; 103 goto out;
88 104
89 sd->s_mode = mode; 105 sd->s_mode = mode;
90 sd->s_type = type; 106 sd->s_type = type;
@@ -94,7 +110,19 @@ int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry,
94 dentry->d_op = &sysfs_dentry_ops; 110 dentry->d_op = &sysfs_dentry_ops;
95 } 111 }
96 112
97 return 0; 113out:
114 return sd;
115}
116
117int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry,
118 void * element, umode_t mode, int type)
119{
120 struct sysfs_dirent *sd;
121
122 sd = __sysfs_make_dirent(dentry, element, mode, type);
123 __sysfs_list_dirent(parent_sd, sd);
124
125 return sd ? 0 : -ENOMEM;
98} 126}
99 127
100static int init_dir(struct inode * inode) 128static int init_dir(struct inode * inode)
@@ -165,11 +193,11 @@ int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d)
165 193
166/** 194/**
167 * sysfs_create_dir - create a directory for an object. 195 * sysfs_create_dir - create a directory for an object.
168 * @parent: parent parent object.
169 * @kobj: object we're creating directory for. 196 * @kobj: object we're creating directory for.
197 * @shadow_parent: parent parent object.
170 */ 198 */
171 199
172int sysfs_create_dir(struct kobject * kobj) 200int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent)
173{ 201{
174 struct dentry * dentry = NULL; 202 struct dentry * dentry = NULL;
175 struct dentry * parent; 203 struct dentry * parent;
@@ -177,7 +205,9 @@ int sysfs_create_dir(struct kobject * kobj)
177 205
178 BUG_ON(!kobj); 206 BUG_ON(!kobj);
179 207
180 if (kobj->parent) 208 if (shadow_parent)
209 parent = shadow_parent;
210 else if (kobj->parent)
181 parent = kobj->parent->dentry; 211 parent = kobj->parent->dentry;
182 else if (sysfs_mount && sysfs_mount->mnt_sb) 212 else if (sysfs_mount && sysfs_mount->mnt_sb)
183 parent = sysfs_mount->mnt_sb->s_root; 213 parent = sysfs_mount->mnt_sb->s_root;
@@ -298,21 +328,12 @@ void sysfs_remove_subdir(struct dentry * d)
298} 328}
299 329
300 330
301/** 331static void __sysfs_remove_dir(struct dentry *dentry)
302 * sysfs_remove_dir - remove an object's directory.
303 * @kobj: object.
304 *
305 * The only thing special about this is that we remove any files in
306 * the directory before we remove the directory, and we've inlined
307 * what used to be sysfs_rmdir() below, instead of calling separately.
308 */
309
310void sysfs_remove_dir(struct kobject * kobj)
311{ 332{
312 struct dentry * dentry = dget(kobj->dentry);
313 struct sysfs_dirent * parent_sd; 333 struct sysfs_dirent * parent_sd;
314 struct sysfs_dirent * sd, * tmp; 334 struct sysfs_dirent * sd, * tmp;
315 335
336 dget(dentry);
316 if (!dentry) 337 if (!dentry)
317 return; 338 return;
318 339
@@ -333,32 +354,60 @@ void sysfs_remove_dir(struct kobject * kobj)
333 * Drop reference from dget() on entrance. 354 * Drop reference from dget() on entrance.
334 */ 355 */
335 dput(dentry); 356 dput(dentry);
357}
358
359/**
360 * sysfs_remove_dir - remove an object's directory.
361 * @kobj: object.
362 *
363 * The only thing special about this is that we remove any files in
364 * the directory before we remove the directory, and we've inlined
365 * what used to be sysfs_rmdir() below, instead of calling separately.
366 */
367
368void sysfs_remove_dir(struct kobject * kobj)
369{
370 __sysfs_remove_dir(kobj->dentry);
336 kobj->dentry = NULL; 371 kobj->dentry = NULL;
337} 372}
338 373
339int sysfs_rename_dir(struct kobject * kobj, const char *new_name) 374int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
375 const char *new_name)
340{ 376{
341 int error = 0; 377 int error = 0;
342 struct dentry * new_dentry, * parent; 378 struct dentry * new_dentry;
343
344 if (!strcmp(kobject_name(kobj), new_name))
345 return -EINVAL;
346 379
347 if (!kobj->parent) 380 if (!new_parent)
348 return -EINVAL; 381 return -EFAULT;
349 382
350 down_write(&sysfs_rename_sem); 383 down_write(&sysfs_rename_sem);
351 parent = kobj->parent->dentry; 384 mutex_lock(&new_parent->d_inode->i_mutex);
352
353 mutex_lock(&parent->d_inode->i_mutex);
354 385
355 new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); 386 new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name));
356 if (!IS_ERR(new_dentry)) { 387 if (!IS_ERR(new_dentry)) {
357 if (!new_dentry->d_inode) { 388 /* By allowing two different directories with the
389 * same d_parent we allow this routine to move
390 * between different shadows of the same directory
391 */
392 if (kobj->dentry->d_parent->d_inode != new_parent->d_inode)
393 return -EINVAL;
394 else if (new_dentry->d_parent->d_inode != new_parent->d_inode)
395 error = -EINVAL;
396 else if (new_dentry == kobj->dentry)
397 error = -EINVAL;
398 else if (!new_dentry->d_inode) {
358 error = kobject_set_name(kobj, "%s", new_name); 399 error = kobject_set_name(kobj, "%s", new_name);
359 if (!error) { 400 if (!error) {
401 struct sysfs_dirent *sd, *parent_sd;
402
360 d_add(new_dentry, NULL); 403 d_add(new_dentry, NULL);
361 d_move(kobj->dentry, new_dentry); 404 d_move(kobj->dentry, new_dentry);
405
406 sd = kobj->dentry->d_fsdata;
407 parent_sd = new_parent->d_fsdata;
408
409 list_del_init(&sd->s_sibling);
410 list_add(&sd->s_sibling, &parent_sd->s_children);
362 } 411 }
363 else 412 else
364 d_drop(new_dentry); 413 d_drop(new_dentry);
@@ -366,7 +415,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
366 error = -EEXIST; 415 error = -EEXIST;
367 dput(new_dentry); 416 dput(new_dentry);
368 } 417 }
369 mutex_unlock(&parent->d_inode->i_mutex); 418 mutex_unlock(&new_parent->d_inode->i_mutex);
370 up_write(&sysfs_rename_sem); 419 up_write(&sysfs_rename_sem);
371 420
372 return error; 421 return error;
@@ -378,12 +427,10 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent)
378 struct sysfs_dirent *new_parent_sd, *sd; 427 struct sysfs_dirent *new_parent_sd, *sd;
379 int error; 428 int error;
380 429
381 if (!new_parent)
382 return -EINVAL;
383
384 old_parent_dentry = kobj->parent ? 430 old_parent_dentry = kobj->parent ?
385 kobj->parent->dentry : sysfs_mount->mnt_sb->s_root; 431 kobj->parent->dentry : sysfs_mount->mnt_sb->s_root;
386 new_parent_dentry = new_parent->dentry; 432 new_parent_dentry = new_parent ?
433 new_parent->dentry : sysfs_mount->mnt_sb->s_root;
387 434
388again: 435again:
389 mutex_lock(&old_parent_dentry->d_inode->i_mutex); 436 mutex_lock(&old_parent_dentry->d_inode->i_mutex);
@@ -547,6 +594,95 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
547 return offset; 594 return offset;
548} 595}
549 596
597
598/**
599 * sysfs_make_shadowed_dir - Setup so a directory can be shadowed
600 * @kobj: object we're creating shadow of.
601 */
602
603int sysfs_make_shadowed_dir(struct kobject *kobj,
604 void * (*follow_link)(struct dentry *, struct nameidata *))
605{
606 struct inode *inode;
607 struct inode_operations *i_op;
608
609 inode = kobj->dentry->d_inode;
610 if (inode->i_op != &sysfs_dir_inode_operations)
611 return -EINVAL;
612
613 i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
614 if (!i_op)
615 return -ENOMEM;
616
617 memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op));
618 i_op->follow_link = follow_link;
619
620 /* Locking of inode->i_op?
621 * Since setting i_op is a single word write and they
622 * are atomic we should be ok here.
623 */
624 inode->i_op = i_op;
625 return 0;
626}
627
628/**
629 * sysfs_create_shadow_dir - create a shadow directory for an object.
630 * @kobj: object we're creating directory for.
631 *
632 * sysfs_make_shadowed_dir must already have been called on this
633 * directory.
634 */
635
636struct dentry *sysfs_create_shadow_dir(struct kobject *kobj)
637{
638 struct sysfs_dirent *sd;
639 struct dentry *parent, *dir, *shadow;
640 struct inode *inode;
641
642 dir = kobj->dentry;
643 inode = dir->d_inode;
644 parent = dir->d_parent;
645 shadow = ERR_PTR(-EINVAL);
646 if (!sysfs_is_shadowed_inode(inode))
647 goto out;
648
649 shadow = d_alloc(parent, &dir->d_name);
650 if (!shadow)
651 goto nomem;
652
653 sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR);
654 if (!sd)
655 goto nomem;
656
657 d_instantiate(shadow, igrab(inode));
658 inc_nlink(inode);
659 inc_nlink(parent->d_inode);
660 shadow->d_op = &sysfs_dentry_ops;
661
662 dget(shadow); /* Extra count - pin the dentry in core */
663
664out:
665 return shadow;
666nomem:
667 dput(shadow);
668 shadow = ERR_PTR(-ENOMEM);
669 goto out;
670}
671
672/**
673 * sysfs_remove_shadow_dir - remove an object's directory.
674 * @shadow: dentry of shadow directory
675 *
676 * The only thing special about this is that we remove any files in
677 * the directory before we remove the directory, and we've inlined
678 * what used to be sysfs_rmdir() below, instead of calling separately.
679 */
680
681void sysfs_remove_shadow_dir(struct dentry *shadow)
682{
683 __sysfs_remove_dir(shadow);
684}
685
550const struct file_operations sysfs_dir_operations = { 686const struct file_operations sysfs_dir_operations = {
551 .open = sysfs_dir_open, 687 .open = sysfs_dir_open,
552 .release = sysfs_dir_close, 688 .release = sysfs_dir_close,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9cfe53e1e00d..c0e117649a4d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -7,6 +7,7 @@
7#include <linux/kobject.h> 7#include <linux/kobject.h>
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/poll.h> 9#include <linux/poll.h>
10#include <linux/list.h>
10#include <asm/uaccess.h> 11#include <asm/uaccess.h>
11#include <asm/semaphore.h> 12#include <asm/semaphore.h>
12 13
@@ -50,17 +51,29 @@ static struct sysfs_ops subsys_sysfs_ops = {
50 .store = subsys_attr_store, 51 .store = subsys_attr_store,
51}; 52};
52 53
54/**
55 * add_to_collection - add buffer to a collection
56 * @buffer: buffer to be added
57 * @node inode of set to add to
58 */
53 59
54struct sysfs_buffer { 60static inline void
55 size_t count; 61add_to_collection(struct sysfs_buffer *buffer, struct inode *node)
56 loff_t pos; 62{
57 char * page; 63 struct sysfs_buffer_collection *set = node->i_private;
58 struct sysfs_ops * ops;
59 struct semaphore sem;
60 int needs_read_fill;
61 int event;
62};
63 64
65 mutex_lock(&node->i_mutex);
66 list_add(&buffer->associates, &set->associates);
67 mutex_unlock(&node->i_mutex);
68}
69
70static inline void
71remove_from_collection(struct sysfs_buffer *buffer, struct inode *node)
72{
73 mutex_lock(&node->i_mutex);
74 list_del(&buffer->associates);
75 mutex_unlock(&node->i_mutex);
76}
64 77
65/** 78/**
66 * fill_read_buffer - allocate and fill buffer from object. 79 * fill_read_buffer - allocate and fill buffer from object.
@@ -70,7 +83,8 @@ struct sysfs_buffer {
70 * Allocate @buffer->page, if it hasn't been already, then call the 83 * Allocate @buffer->page, if it hasn't been already, then call the
71 * kobject's show() method to fill the buffer with this attribute's 84 * kobject's show() method to fill the buffer with this attribute's
72 * data. 85 * data.
73 * This is called only once, on the file's first read. 86 * This is called only once, on the file's first read unless an error
87 * is returned.
74 */ 88 */
75static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) 89static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
76{ 90{
@@ -88,12 +102,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
88 102
89 buffer->event = atomic_read(&sd->s_event); 103 buffer->event = atomic_read(&sd->s_event);
90 count = ops->show(kobj,attr,buffer->page); 104 count = ops->show(kobj,attr,buffer->page);
91 buffer->needs_read_fill = 0;
92 BUG_ON(count > (ssize_t)PAGE_SIZE); 105 BUG_ON(count > (ssize_t)PAGE_SIZE);
93 if (count >= 0) 106 if (count >= 0) {
107 buffer->needs_read_fill = 0;
94 buffer->count = count; 108 buffer->count = count;
95 else 109 } else {
96 ret = count; 110 ret = count;
111 }
97 return ret; 112 return ret;
98} 113}
99 114
@@ -153,6 +168,10 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
153 ssize_t retval = 0; 168 ssize_t retval = 0;
154 169
155 down(&buffer->sem); 170 down(&buffer->sem);
171 if (buffer->orphaned) {
172 retval = -ENODEV;
173 goto out;
174 }
156 if (buffer->needs_read_fill) { 175 if (buffer->needs_read_fill) {
157 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 176 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
158 goto out; 177 goto out;
@@ -165,7 +184,6 @@ out:
165 return retval; 184 return retval;
166} 185}
167 186
168
169/** 187/**
170 * fill_write_buffer - copy buffer from userspace. 188 * fill_write_buffer - copy buffer from userspace.
171 * @buffer: data buffer for file. 189 * @buffer: data buffer for file.
@@ -243,19 +261,25 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
243 ssize_t len; 261 ssize_t len;
244 262
245 down(&buffer->sem); 263 down(&buffer->sem);
264 if (buffer->orphaned) {
265 len = -ENODEV;
266 goto out;
267 }
246 len = fill_write_buffer(buffer, buf, count); 268 len = fill_write_buffer(buffer, buf, count);
247 if (len > 0) 269 if (len > 0)
248 len = flush_write_buffer(file->f_path.dentry, buffer, len); 270 len = flush_write_buffer(file->f_path.dentry, buffer, len);
249 if (len > 0) 271 if (len > 0)
250 *ppos += len; 272 *ppos += len;
273out:
251 up(&buffer->sem); 274 up(&buffer->sem);
252 return len; 275 return len;
253} 276}
254 277
255static int check_perm(struct inode * inode, struct file * file) 278static int sysfs_open_file(struct inode *inode, struct file *file)
256{ 279{
257 struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); 280 struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent);
258 struct attribute * attr = to_attr(file->f_path.dentry); 281 struct attribute * attr = to_attr(file->f_path.dentry);
282 struct sysfs_buffer_collection *set;
259 struct sysfs_buffer * buffer; 283 struct sysfs_buffer * buffer;
260 struct sysfs_ops * ops = NULL; 284 struct sysfs_ops * ops = NULL;
261 int error = 0; 285 int error = 0;
@@ -285,6 +309,18 @@ static int check_perm(struct inode * inode, struct file * file)
285 if (!ops) 309 if (!ops)
286 goto Eaccess; 310 goto Eaccess;
287 311
312 /* make sure we have a collection to add our buffers to */
313 mutex_lock(&inode->i_mutex);
314 if (!(set = inode->i_private)) {
315 if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) {
316 error = -ENOMEM;
317 goto Done;
318 } else {
319 INIT_LIST_HEAD(&set->associates);
320 }
321 }
322 mutex_unlock(&inode->i_mutex);
323
288 /* File needs write support. 324 /* File needs write support.
289 * The inode's perms must say it's ok, 325 * The inode's perms must say it's ok,
290 * and we must have a store method. 326 * and we must have a store method.
@@ -310,9 +346,11 @@ static int check_perm(struct inode * inode, struct file * file)
310 */ 346 */
311 buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); 347 buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
312 if (buffer) { 348 if (buffer) {
349 INIT_LIST_HEAD(&buffer->associates);
313 init_MUTEX(&buffer->sem); 350 init_MUTEX(&buffer->sem);
314 buffer->needs_read_fill = 1; 351 buffer->needs_read_fill = 1;
315 buffer->ops = ops; 352 buffer->ops = ops;
353 add_to_collection(buffer, inode);
316 file->private_data = buffer; 354 file->private_data = buffer;
317 } else 355 } else
318 error = -ENOMEM; 356 error = -ENOMEM;
@@ -325,16 +363,11 @@ static int check_perm(struct inode * inode, struct file * file)
325 error = -EACCES; 363 error = -EACCES;
326 module_put(attr->owner); 364 module_put(attr->owner);
327 Done: 365 Done:
328 if (error && kobj) 366 if (error)
329 kobject_put(kobj); 367 kobject_put(kobj);
330 return error; 368 return error;
331} 369}
332 370
333static int sysfs_open_file(struct inode * inode, struct file * filp)
334{
335 return check_perm(inode,filp);
336}
337
338static int sysfs_release(struct inode * inode, struct file * filp) 371static int sysfs_release(struct inode * inode, struct file * filp)
339{ 372{
340 struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); 373 struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent);
@@ -342,8 +375,9 @@ static int sysfs_release(struct inode * inode, struct file * filp)
342 struct module * owner = attr->owner; 375 struct module * owner = attr->owner;
343 struct sysfs_buffer * buffer = filp->private_data; 376 struct sysfs_buffer * buffer = filp->private_data;
344 377
345 if (kobj) 378 if (buffer)
346 kobject_put(kobj); 379 remove_from_collection(buffer, inode);
380 kobject_put(kobj);
347 /* After this point, attr should not be accessed. */ 381 /* After this point, attr should not be accessed. */
348 module_put(owner); 382 module_put(owner);
349 383
@@ -548,7 +582,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
548 582
549void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 583void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
550{ 584{
551 sysfs_hash_and_remove(kobj->dentry,attr->name); 585 sysfs_hash_and_remove(kobj->dentry, attr->name);
552} 586}
553 587
554 588
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 122145b0895c..b20951c93761 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,6 +13,8 @@
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/fs.h>
17#include <asm/semaphore.h>
16#include "sysfs.h" 18#include "sysfs.h"
17 19
18 20
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e79e38d52c00..542d2bcc73df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -13,6 +13,7 @@
13#include <linux/backing-dev.h> 13#include <linux/backing-dev.h>
14#include <linux/capability.h> 14#include <linux/capability.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <asm/semaphore.h>
16#include "sysfs.h" 17#include "sysfs.h"
17 18
18extern struct super_block * sysfs_sb; 19extern struct super_block * sysfs_sb;
@@ -32,6 +33,16 @@ static struct inode_operations sysfs_inode_operations ={
32 .setattr = sysfs_setattr, 33 .setattr = sysfs_setattr,
33}; 34};
34 35
36void sysfs_delete_inode(struct inode *inode)
37{
38 /* Free the shadowed directory inode operations */
39 if (sysfs_is_shadowed_inode(inode)) {
40 kfree(inode->i_op);
41 inode->i_op = NULL;
42 }
43 return generic_delete_inode(inode);
44}
45
35int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 46int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
36{ 47{
37 struct inode * inode = dentry->d_inode; 48 struct inode * inode = dentry->d_inode;
@@ -209,6 +220,22 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
209 return NULL; 220 return NULL;
210} 221}
211 222
223static inline void orphan_all_buffers(struct inode *node)
224{
225 struct sysfs_buffer_collection *set = node->i_private;
226 struct sysfs_buffer *buf;
227
228 mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD);
229 if (node->i_private) {
230 list_for_each_entry(buf, &set->associates, associates) {
231 down(&buf->sem);
232 buf->orphaned = 1;
233 up(&buf->sem);
234 }
235 }
236 mutex_unlock(&node->i_mutex);
237}
238
212 239
213/* 240/*
214 * Unhashes the dentry corresponding to given sysfs_dirent 241 * Unhashes the dentry corresponding to given sysfs_dirent
@@ -217,16 +244,23 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
217void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) 244void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent)
218{ 245{
219 struct dentry * dentry = sd->s_dentry; 246 struct dentry * dentry = sd->s_dentry;
247 struct inode *inode;
220 248
221 if (dentry) { 249 if (dentry) {
222 spin_lock(&dcache_lock); 250 spin_lock(&dcache_lock);
223 spin_lock(&dentry->d_lock); 251 spin_lock(&dentry->d_lock);
224 if (!(d_unhashed(dentry) && dentry->d_inode)) { 252 if (!(d_unhashed(dentry) && dentry->d_inode)) {
253 inode = dentry->d_inode;
254 spin_lock(&inode->i_lock);
255 __iget(inode);
256 spin_unlock(&inode->i_lock);
225 dget_locked(dentry); 257 dget_locked(dentry);
226 __d_drop(dentry); 258 __d_drop(dentry);
227 spin_unlock(&dentry->d_lock); 259 spin_unlock(&dentry->d_lock);
228 spin_unlock(&dcache_lock); 260 spin_unlock(&dcache_lock);
229 simple_unlink(parent->d_inode, dentry); 261 simple_unlink(parent->d_inode, dentry);
262 orphan_all_buffers(inode);
263 iput(inode);
230 } else { 264 } else {
231 spin_unlock(&dentry->d_lock); 265 spin_unlock(&dentry->d_lock);
232 spin_unlock(&dcache_lock); 266 spin_unlock(&dcache_lock);
@@ -248,7 +282,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name)
248 return -ENOENT; 282 return -ENOENT;
249 283
250 parent_sd = dir->d_fsdata; 284 parent_sd = dir->d_fsdata;
251 mutex_lock(&dir->d_inode->i_mutex); 285 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
252 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 286 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
253 if (!sd->s_element) 287 if (!sd->s_element)
254 continue; 288 continue;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index e503f858fba8..f6a87a824883 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -8,6 +8,7 @@
8#include <linux/mount.h> 8#include <linux/mount.h>
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <asm/semaphore.h>
11 12
12#include "sysfs.h" 13#include "sysfs.h"
13 14
@@ -18,9 +19,12 @@ struct vfsmount *sysfs_mount;
18struct super_block * sysfs_sb = NULL; 19struct super_block * sysfs_sb = NULL;
19struct kmem_cache *sysfs_dir_cachep; 20struct kmem_cache *sysfs_dir_cachep;
20 21
22static void sysfs_clear_inode(struct inode *inode);
23
21static struct super_operations sysfs_ops = { 24static struct super_operations sysfs_ops = {
22 .statfs = simple_statfs, 25 .statfs = simple_statfs,
23 .drop_inode = generic_delete_inode, 26 .drop_inode = sysfs_delete_inode,
27 .clear_inode = sysfs_clear_inode,
24}; 28};
25 29
26static struct sysfs_dirent sysfs_root = { 30static struct sysfs_dirent sysfs_root = {
@@ -31,6 +35,11 @@ static struct sysfs_dirent sysfs_root = {
31 .s_iattr = NULL, 35 .s_iattr = NULL,
32}; 36};
33 37
38static void sysfs_clear_inode(struct inode *inode)
39{
40 kfree(inode->i_private);
41}
42
34static int sysfs_fill_super(struct super_block *sb, void *data, int silent) 43static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
35{ 44{
36 struct inode *inode; 45 struct inode *inode;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f50e3cc2ded8..4869f611192f 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kobject.h> 8#include <linux/kobject.h>
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <asm/semaphore.h>
10 11
11#include "sysfs.h" 12#include "sysfs.h"
12 13
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index bd7cec295dab..fe1cbfd208ed 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -2,6 +2,7 @@
2extern struct vfsmount * sysfs_mount; 2extern struct vfsmount * sysfs_mount;
3extern struct kmem_cache *sysfs_dir_cachep; 3extern struct kmem_cache *sysfs_dir_cachep;
4 4
5extern void sysfs_delete_inode(struct inode *inode);
5extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); 6extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
6extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); 7extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
7 8
@@ -33,6 +34,22 @@ struct sysfs_symlink {
33 struct kobject * target_kobj; 34 struct kobject * target_kobj;
34}; 35};
35 36
37struct sysfs_buffer {
38 struct list_head associates;
39 size_t count;
40 loff_t pos;
41 char * page;
42 struct sysfs_ops * ops;
43 struct semaphore sem;
44 int orphaned;
45 int needs_read_fill;
46 int event;
47};
48
49struct sysfs_buffer_collection {
50 struct list_head associates;
51};
52
36static inline struct kobject * to_kobj(struct dentry * dentry) 53static inline struct kobject * to_kobj(struct dentry * dentry)
37{ 54{
38 struct sysfs_dirent * sd = dentry->d_fsdata; 55 struct sysfs_dirent * sd = dentry->d_fsdata;
@@ -96,3 +113,7 @@ static inline void sysfs_put(struct sysfs_dirent * sd)
96 release_sysfs_dirent(sd); 113 release_sysfs_dirent(sd);
97} 114}
98 115
116static inline int sysfs_is_shadowed_inode(struct inode *inode)
117{
118 return S_ISDIR(inode->i_mode) && inode->i_op->follow_link;
119}