aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ocfs2.txt16
-rw-r--r--Documentation/ioctl-number.txt1
-rw-r--r--fs/Kconfig14
-rw-r--r--fs/configfs/dir.c5
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/ocfs2/Makefile5
-rw-r--r--fs/ocfs2/alloc.c8
-rw-r--r--fs/ocfs2/aops.c137
-rw-r--r--fs/ocfs2/buffer_head_io.c65
-rw-r--r--fs/ocfs2/buffer_head_io.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h8
-rw-r--r--fs/ocfs2/cluster/ver.c2
-rw-r--r--fs/ocfs2/dcache.c8
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c2
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c19
-rw-r--r--fs/ocfs2/dlm/dlmver.c2
-rw-r--r--fs/ocfs2/dlmglue.c546
-rw-r--r--fs/ocfs2/dlmglue.h31
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/file.c163
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/heartbeat.c80
-rw-r--r--fs/ocfs2/heartbeat.h2
-rw-r--r--fs/ocfs2/inode.c84
-rw-r--r--fs/ocfs2/inode.h10
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/journal.c51
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/localalloc.c50
-rw-r--r--fs/ocfs2/locks.c125
-rw-r--r--fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h)29
-rw-r--r--fs/ocfs2/mmap.c17
-rw-r--r--fs/ocfs2/namei.c66
-rw-r--r--fs/ocfs2/ocfs2.h35
-rw-r--r--fs/ocfs2/ocfs2_fs.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/resize.c634
-rw-r--r--fs/ocfs2/resize.h32
-rw-r--r--fs/ocfs2/slot_map.c19
-rw-r--r--fs/ocfs2/slot_map.h2
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/suballoc.h8
-rw-r--r--fs/ocfs2/super.c140
-rw-r--r--fs/ocfs2/sysfile.c2
-rw-r--r--fs/ocfs2/ver.c2
-rw-r--r--fs/ocfs2/vote.c756
-rw-r--r--include/linux/Kbuild1
-rw-r--r--include/linux/dlm.h140
-rw-r--r--include/linux/dlmconstants.h159
53 files changed, 2005 insertions, 1592 deletions
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index ed55238023a9..c318a8bbb1ef 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -35,7 +35,6 @@ Features which OCFS2 does not support yet:
35 - Directory change notification (F_NOTIFY) 35 - Directory change notification (F_NOTIFY)
36 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) 36 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
37 - POSIX ACLs 37 - POSIX ACLs
38 - readpages / writepages (not user visible)
39 38
40Mount options 39Mount options
41============= 40=============
@@ -62,3 +61,18 @@ data=writeback Data ordering is not preserved, data may be written
62preferred_slot=0(*) During mount, try to use this filesystem slot first. If 61preferred_slot=0(*) During mount, try to use this filesystem slot first. If
63 it is in use by another node, the first empty one found 62 it is in use by another node, the first empty one found
64 will be chosen. Invalid values will be ignored. 63 will be chosen. Invalid values will be ignored.
64commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
65 every 'nrsec' seconds. The default value is 5 seconds.
66 This means that if you lose your power, you will lose
67 as much as the latest 5 seconds of work (your
68 filesystem will not be damaged though, thanks to the
69 journaling). This default value (or any low value)
70 will hurt performance, but it's good for data-safety.
71 Setting it to 0 will have the same effect as leaving
72 it at the default (5 seconds).
73 Setting it to very large values will improve
74 performance.
75localalloc=8(*) Allows custom localalloc size in MB. If the value is too
76 large, the fs will silently revert it to the default.
77 Localalloc is not enabled for local mounts.
78localflocks This disables cluster aware flock.
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 5c7fbf9d96b4..c18363bd8d11 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -138,6 +138,7 @@ Code Seq# Include File Comments
138'm' 00-1F net/irda/irmod.h conflict! 138'm' 00-1F net/irda/irmod.h conflict!
139'n' 00-7F linux/ncp_fs.h 139'n' 00-7F linux/ncp_fs.h
140'n' E0-FF video/matrox.h matroxfb 140'n' E0-FF video/matrox.h matroxfb
141'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
141'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this) 142'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
142'p' 00-3F linux/mc146818rtc.h conflict! 143'p' 00-3F linux/mc146818rtc.h conflict!
143'p' 40-7F linux/nvram.h 144'p' 40-7F linux/nvram.h
diff --git a/fs/Kconfig b/fs/Kconfig
index b4799efaf9e8..b6df18f1f677 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -440,14 +440,8 @@ config OCFS2_FS
440 Tools web page: http://oss.oracle.com/projects/ocfs2-tools 440 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
441 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ 441 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
442 442
443 Note: Features which OCFS2 does not support yet: 443 For more information on OCFS2, see the file
444 - extended attributes 444 <file:Documentation/filesystems/ocfs2.txt>.
445 - quotas
446 - cluster aware flock
447 - Directory change notification (F_NOTIFY)
448 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
449 - POSIX ACLs
450 - readpages / writepages (not user visible)
451 445
452config OCFS2_DEBUG_MASKLOG 446config OCFS2_DEBUG_MASKLOG
453 bool "OCFS2 logging support" 447 bool "OCFS2 logging support"
@@ -1028,8 +1022,8 @@ config HUGETLB_PAGE
1028 def_bool HUGETLBFS 1022 def_bool HUGETLBFS
1029 1023
1030config CONFIGFS_FS 1024config CONFIGFS_FS
1031 tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" 1025 tristate "Userspace-driven configuration filesystem"
1032 depends on SYSFS && EXPERIMENTAL 1026 depends on SYSFS
1033 help 1027 help
1034 configfs is a ram-based filesystem that provides the converse 1028 configfs is a ram-based filesystem that provides the converse
1035 of sysfs's functionality. Where sysfs is a filesystem-based 1029 of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098bc..a48dc7dd8765 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
546 * That said, taking our i_mutex is closer to mkdir 546 * That said, taking our i_mutex is closer to mkdir
547 * emulation, and shouldn't hurt. 547 * emulation, and shouldn't hurt.
548 */ 548 */
549 mutex_lock(&dentry->d_inode->i_mutex); 549 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
550 550
551 for (i = 0; group->default_groups[i]; i++) { 551 for (i = 0; group->default_groups[i]; i++) {
552 new_group = group->default_groups[i]; 552 new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1405 sd = configfs_sb->s_root->d_fsdata; 1405 sd = configfs_sb->s_root->d_fsdata;
1406 link_group(to_config_group(sd->s_element), group); 1406 link_group(to_config_group(sd->s_element), group);
1407 1407
1408 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); 1408 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
1409 I_MUTEX_PARENT);
1409 1410
1410 name.name = group->cg_item.ci_name; 1411 name.name = group->cg_item.ci_name;
1411 name.len = strlen(name.name); 1412 name.len = strlen(name.name);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082c..397cb503a180 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; 320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
321 int error = 0; 321 int error = 0;
322 322
323 mutex_lock(&dir->d_inode->i_mutex); 323 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); 324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
325 mutex_unlock(&dir->d_inode->i_mutex); 325 mutex_unlock(&dir->d_inode->i_mutex);
326 326
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
19 ioctl.o \ 19 ioctl.o \
20 journal.o \ 20 journal.o \
21 localalloc.o \ 21 localalloc.o \
22 locks.o \
22 mmap.o \ 23 mmap.o \
23 namei.o \ 24 namei.o \
25 resize.o \
24 slot_map.o \ 26 slot_map.o \
25 suballoc.o \ 27 suballoc.o \
26 super.o \ 28 super.o \
27 symlink.o \ 29 symlink.o \
28 sysfile.o \ 30 sysfile.o \
29 uptodate.o \ 31 uptodate.o \
30 ver.o \ 32 ver.o
31 vote.o
32 33
33obj-$(CONFIG_OCFS2_FS) += cluster/ 34obj-$(CONFIG_OCFS2_FS) += cluster/
34obj-$(CONFIG_OCFS2_FS) += dlm/ 35obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 23c8cda43f19..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4731 4731
4732 mutex_lock(&data_alloc_inode->i_mutex); 4732 mutex_lock(&data_alloc_inode->i_mutex);
4733 4733
4734 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 4734 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
4735 if (status < 0) { 4735 if (status < 0) {
4736 mlog_errno(status); 4736 mlog_errno(status);
4737 goto out_mutex; 4737 goto out_mutex;
@@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4753 4753
4754out_unlock: 4754out_unlock:
4755 brelse(data_alloc_bh); 4755 brelse(data_alloc_bh);
4756 ocfs2_meta_unlock(data_alloc_inode, 1); 4756 ocfs2_inode_unlock(data_alloc_inode, 1);
4757 4757
4758out_mutex: 4758out_mutex:
4759 mutex_unlock(&data_alloc_inode->i_mutex); 4759 mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5077 5077
5078 mutex_lock(&inode->i_mutex); 5078 mutex_lock(&inode->i_mutex);
5079 5079
5080 ret = ocfs2_meta_lock(inode, &di_bh, 1); 5080 ret = ocfs2_inode_lock(inode, &di_bh, 1);
5081 if (ret) { 5081 if (ret) {
5082 mlog_errno(ret); 5082 mlog_errno(ret);
5083 goto out_mutex; 5083 goto out_mutex;
@@ -5118,7 +5118,7 @@ out_journal:
5118 ocfs2_commit_trans(osb, handle); 5118 ocfs2_commit_trans(osb, handle);
5119 5119
5120out_unlock: 5120out_unlock:
5121 ocfs2_meta_unlock(inode, 1); 5121 ocfs2_inode_unlock(inode, 1);
5122 brelse(di_bh); 5122 brelse(di_bh);
5123out_mutex: 5123out_mutex:
5124 mutex_unlock(&inode->i_mutex); 5124 mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad46..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
29 30
30#define MLOG_MASK_PREFIX ML_FILE_IO 31#define MLOG_MASK_PREFIX ML_FILE_IO
31#include <cluster/masklog.h> 32#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139{ 140{
140 int err = 0; 141 int err = 0;
141 unsigned int ext_flags; 142 unsigned int ext_flags;
142 u64 p_blkno, past_eof; 143 u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144 u64 p_blkno, count, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 146
145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 147 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
155 goto bail; 157 goto bail;
156 } 158 }
157 159
158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 160 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
159 &ext_flags); 161 &ext_flags);
160 if (err) { 162 if (err) {
161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 163 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
164 goto bail; 166 goto bail;
165 } 167 }
166 168
169 if (max_blocks < count)
170 count = max_blocks;
171
167 /* 172 /*
168 * ocfs2 never allocates in this function - the only time we 173 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno); 184 map_bh(bh_result, inode->i_sb, p_blkno);
180 185
186 bh_result->b_size = count << inode->i_blkbits;
187
181 if (!ocfs2_sparse_alloc(osb)) { 188 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) { 189 if (p_blkno == 0) {
183 err = -EIO; 190 err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh) 217 struct buffer_head *di_bh)
211{ 218{
212 void *kaddr; 219 void *kaddr;
213 unsigned int size; 220 loff_t size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 221 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215 222
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 223 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224 if (size > PAGE_CACHE_SIZE || 231 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) { 232 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb, 233 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u", 234 "Inode %llu has with inline data has bad size: %Lu",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size); 235 (unsigned long long)OCFS2_I(inode)->ip_blkno,
236 (unsigned long long)size);
229 return -EROFS; 237 return -EROFS;
230 } 238 }
231 239
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
275 283
276 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 284 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
277 285
278 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 286 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
279 if (ret != 0) { 287 if (ret != 0) {
280 if (ret == AOP_TRUNCATED_PAGE) 288 if (ret == AOP_TRUNCATED_PAGE)
281 unlock = 0; 289 unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
285 293
286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 294 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
287 ret = AOP_TRUNCATED_PAGE; 295 ret = AOP_TRUNCATED_PAGE;
288 goto out_meta_unlock; 296 goto out_inode_unlock;
289 } 297 }
290 298
291 /* 299 /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
305 goto out_alloc; 313 goto out_alloc;
306 } 314 }
307 315
308 ret = ocfs2_data_lock_with_page(inode, 0, page);
309 if (ret != 0) {
310 if (ret == AOP_TRUNCATED_PAGE)
311 unlock = 0;
312 mlog_errno(ret);
313 goto out_alloc;
314 }
315
316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page); 317 ret = ocfs2_readpage_inline(inode, page);
318 else 318 else
319 ret = block_read_full_page(page, ocfs2_get_block); 319 ret = block_read_full_page(page, ocfs2_get_block);
320 unlock = 0; 320 unlock = 0;
321 321
322 ocfs2_data_unlock(inode, 0);
323out_alloc: 322out_alloc:
324 up_read(&OCFS2_I(inode)->ip_alloc_sem); 323 up_read(&OCFS2_I(inode)->ip_alloc_sem);
325out_meta_unlock: 324out_inode_unlock:
326 ocfs2_meta_unlock(inode, 0); 325 ocfs2_inode_unlock(inode, 0);
327out: 326out:
328 if (unlock) 327 if (unlock)
329 unlock_page(page); 328 unlock_page(page);
@@ -331,6 +330,62 @@ out:
331 return ret; 330 return ret;
332} 331}
333 332
333/*
334 * This is used only for read-ahead. Failures or difficult to handle
335 * situations are safe to ignore.
336 *
337 * Right now, we don't bother with BH_Boundary - in-inode extent lists
338 * are quite large (243 extents on 4k blocks), so most inodes don't
339 * grow out to a tree. If need be, detecting boundary extents could
340 * trivially be added in a future version of ocfs2_get_block().
341 */
342static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
343 struct list_head *pages, unsigned nr_pages)
344{
345 int ret, err = -EIO;
346 struct inode *inode = mapping->host;
347 struct ocfs2_inode_info *oi = OCFS2_I(inode);
348 loff_t start;
349 struct page *last;
350
351 /*
352 * Use the nonblocking flag for the dlm code to avoid page
353 * lock inversion, but don't bother with retrying.
354 */
355 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
356 if (ret)
357 return err;
358
359 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
360 ocfs2_inode_unlock(inode, 0);
361 return err;
362 }
363
364 /*
365 * Don't bother with inline-data. There isn't anything
366 * to read-ahead in that case anyway...
367 */
368 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
369 goto out_unlock;
370
371 /*
372 * Check whether a remote node truncated this file - we just
373 * drop out in that case as it's not worth handling here.
374 */
375 last = list_entry(pages->prev, struct page, lru);
376 start = (loff_t)last->index << PAGE_CACHE_SHIFT;
377 if (start >= i_size_read(inode))
378 goto out_unlock;
379
380 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
381
382out_unlock:
383 up_read(&oi->ip_alloc_sem);
384 ocfs2_inode_unlock(inode, 0);
385
386 return err;
387}
388
334/* Note: Because we don't support holes, our allocation has 389/* Note: Because we don't support holes, our allocation has
335 * already happened (allocation writes zeros to the file data) 390 * already happened (allocation writes zeros to the file data)
336 * so we don't have to worry about ordered writes in 391 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
452 * accessed concurrently from multiple nodes. 507 * accessed concurrently from multiple nodes.
453 */ 508 */
454 if (!INODE_JOURNAL(inode)) { 509 if (!INODE_JOURNAL(inode)) {
455 err = ocfs2_meta_lock(inode, NULL, 0); 510 err = ocfs2_inode_lock(inode, NULL, 0);
456 if (err) { 511 if (err) {
457 if (err != -ENOENT) 512 if (err != -ENOENT)
458 mlog_errno(err); 513 mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
467 522
468 if (!INODE_JOURNAL(inode)) { 523 if (!INODE_JOURNAL(inode)) {
469 up_read(&OCFS2_I(inode)->ip_alloc_sem); 524 up_read(&OCFS2_I(inode)->ip_alloc_sem);
470 ocfs2_meta_unlock(inode, 0); 525 ocfs2_inode_unlock(inode, 0);
471 } 526 }
472 527
473 if (err) { 528 if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0; 694 return 0;
640 695
641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
642 /*
643 * We get PR data locks even for O_DIRECT. This
644 * allows concurrent O_DIRECT I/O but doesn't let
645 * O_DIRECT with extending and buffered zeroing writes
646 * race. If they did race then the buffered zeroing
647 * could be written back after the O_DIRECT I/O. It's
648 * one thing to tell people not to mix buffered and
649 * O_DIRECT writes, but expecting them to understand
650 * that file extension is also an implicit buffered
651 * write is too much. By getting the PR we force
652 * writeback of the buffered zeroing before
653 * proceeding.
654 */
655 ret = ocfs2_data_lock(inode, 0);
656 if (ret < 0) {
657 mlog_errno(ret);
658 goto out;
659 }
660 ocfs2_data_unlock(inode, 0);
661 }
662
663 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 696 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
664 inode->i_sb->s_bdev, iov, offset, 697 inode->i_sb->s_bdev, iov, offset,
665 nr_segs, 698 nr_segs,
666 ocfs2_direct_IO_get_blocks, 699 ocfs2_direct_IO_get_blocks,
667 ocfs2_dio_end_io); 700 ocfs2_dio_end_io);
668out: 701
669 mlog_exit(ret); 702 mlog_exit(ret);
670 return ret; 703 return ret;
671} 704}
@@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1754 struct buffer_head *di_bh = NULL; 1787 struct buffer_head *di_bh = NULL;
1755 struct inode *inode = mapping->host; 1788 struct inode *inode = mapping->host;
1756 1789
1757 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1790 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1758 if (ret) { 1791 if (ret) {
1759 mlog_errno(ret); 1792 mlog_errno(ret);
1760 return ret; 1793 return ret;
@@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1769 */ 1802 */
1770 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1803 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1771 1804
1772 ret = ocfs2_data_lock(inode, 1);
1773 if (ret) {
1774 mlog_errno(ret);
1775 goto out_fail;
1776 }
1777
1778 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1805 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1779 fsdata, di_bh, NULL); 1806 fsdata, di_bh, NULL);
1780 if (ret) { 1807 if (ret) {
1781 mlog_errno(ret); 1808 mlog_errno(ret);
1782 goto out_fail_data; 1809 goto out_fail;
1783 } 1810 }
1784 1811
1785 brelse(di_bh); 1812 brelse(di_bh);
1786 1813
1787 return 0; 1814 return 0;
1788 1815
1789out_fail_data:
1790 ocfs2_data_unlock(inode, 1);
1791out_fail: 1816out_fail:
1792 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1817 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1793 1818
1794 brelse(di_bh); 1819 brelse(di_bh);
1795 ocfs2_meta_unlock(inode, 1); 1820 ocfs2_inode_unlock(inode, 1);
1796 1821
1797 return ret; 1822 return ret;
1798} 1823}
@@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1908 1933
1909 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1934 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1910 1935
1911 ocfs2_data_unlock(inode, 1);
1912 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1936 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1913 ocfs2_meta_unlock(inode, 1); 1937 ocfs2_inode_unlock(inode, 1);
1914 1938
1915 return ret; 1939 return ret;
1916} 1940}
1917 1941
1918const struct address_space_operations ocfs2_aops = { 1942const struct address_space_operations ocfs2_aops = {
1919 .readpage = ocfs2_readpage, 1943 .readpage = ocfs2_readpage,
1944 .readpages = ocfs2_readpages,
1920 .writepage = ocfs2_writepage, 1945 .writepage = ocfs2_writepage,
1921 .write_begin = ocfs2_write_begin, 1946 .write_begin = ocfs2_write_begin,
1922 .write_end = ocfs2_write_end, 1947 .write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
79 * information for this bh as it's not marked locally 79 * information for this bh as it's not marked locally
80 * uptodate. */ 80 * uptodate. */
81 ret = -EIO; 81 ret = -EIO;
82 brelse(bh); 82 put_bh(bh);
83 } 83 }
84 84
85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
256 * for this bh as it's not marked locally 256 * for this bh as it's not marked locally
257 * uptodate. */ 257 * uptodate. */
258 status = -EIO; 258 status = -EIO;
259 brelse(bh); 259 put_bh(bh);
260 bhs[i] = NULL; 260 bhs[i] = NULL;
261 continue; 261 continue;
262 } 262 }
@@ -280,3 +280,64 @@ bail:
280 mlog_exit(status); 280 mlog_exit(status);
281 return status; 281 return status;
282} 282}
283
284/* Check whether the blkno is the super block or one of the backups. */
285static void ocfs2_check_super_or_backup(struct super_block *sb,
286 sector_t blkno)
287{
288 int i;
289 u64 backup_blkno;
290
291 if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
292 return;
293
294 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
295 backup_blkno = ocfs2_backup_super_blkno(sb, i);
296 if (backup_blkno == blkno)
297 return;
298 }
299
300 BUG();
301}
302
303/*
304 * Write super block and backups doesn't need to collaborate with journal,
305 * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
306 * into this function.
307 */
308int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
309 struct buffer_head *bh)
310{
311 int ret = 0;
312
313 mlog_entry_void();
314
315 BUG_ON(buffer_jbd(bh));
316 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
317
318 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
319 ret = -EROFS;
320 goto out;
321 }
322
323 lock_buffer(bh);
324 set_buffer_uptodate(bh);
325
326 /* remove from dirty list before I/O. */
327 clear_buffer_dirty(bh);
328
329 get_bh(bh); /* for end_buffer_write_sync() */
330 bh->b_end_io = end_buffer_write_sync;
331 submit_bh(WRITE, bh);
332
333 wait_on_buffer(bh);
334
335 if (!buffer_uptodate(bh)) {
336 ret = -EIO;
337 put_bh(bh);
338 }
339
340out:
341 mlog_exit(ret);
342 return ret;
343}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
47 int flags, 47 int flags,
48 struct inode *inode); 48 struct inode *inode);
49 49
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh);
50 52
51#define OCFS2_BH_CACHED 1 53#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 54#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
35#define O2HB_LIVE_THRESHOLD 2 35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold; 37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7 38#define O2HB_DEFAULT_DEAD_THRESHOLD 31
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ 39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2 40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) 41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
60/* same as hb delay, we're waiting for another node to recognize our hb */ 60/* same as hb delay, we're waiting for another node to recognize our hb */
61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
62 62
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66 66
67/* TODO: figure this out.... */ 67/* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 10:
42 * - Meta/data locks combined
43 *
44 * New in version 9:
45 * - All votes removed
46 *
41 * New in version 8: 47 * New in version 8:
42 * - Replace delete inode votes with a cluster lock 48 * - Replace delete inode votes with a cluster lock
43 * 49 *
@@ -60,7 +66,7 @@
60 * - full 64 bit i_size in the metadata lock lvbs 66 * - full 64 bit i_size in the metadata lock lvbs
61 * - introduction of "rw" lock and pushing meta/data locking down 67 * - introduction of "rw" lock and pushing meta/data locking down
62 */ 68 */
63#define O2NET_PROTOCOL_VERSION 8ULL 69#define O2NET_PROTOCOL_VERSION 10ULL
64struct o2net_handshake { 70struct o2net_handshake {
65 __be64 protocol_version; 71 __be64 protocol_version;
66 __be64 connector_id; 72 __be64 connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
28 28
29#include "ver.h" 29#include "ver.h"
30 30
31#define CLUSTER_BUILD_VERSION "1.3.3" 31#define CLUSTER_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d4..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
128/* 128/*
129 * Walk the inode alias list, and find a dentry which has a given 129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it 130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking 131 * is looking for a dentry_lock reference. The downconvert thread is
132 * to unhash aliases, so we allow it to skip any that already have 132 * looking to unhash aliases, so we allow it to skip any that already
133 * that property. 133 * have that property.
134 */ 134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode, 135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno, 136 u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
266 dl->dl_count = 0; 266 dl->dl_count = 0;
267 /* 267 /*
268 * Does this have to happen below, for all attaches, in case 268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes? 269 * the struct inode gets blown away by the downconvert thread?
270 */ 270 */
271 dl->dl_inode = igrab(inode); 271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno; 272 dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4a..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
846 mlog_entry("dirino=%llu\n", 846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno); 847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848 848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 849 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) { 850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime 851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention 852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */ 853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1); 854 ocfs2_inode_unlock(inode, 1);
855 lock_level = 0; 855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0); 856 error = ocfs2_inode_lock(inode, NULL, 0);
857 } 857 }
858 if (error < 0) { 858 if (error < 0) {
859 if (error != -ENOENT) 859 if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL); 866 dirent, filldir, NULL);
867 867
868 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_inode_unlock(inode, lock_level);
869 869
870bail_nolock: 870bail_nolock:
871 mlog_exit(error); 871 mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmfsver.h" 29#include "dlmfsver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2270 } 2270 }
2271 } 2271 }
2272 2272
2273 /* Clean up join state on node death. */
2274 if (dlm->joining_node == idx) {
2275 mlog(0, "Clearing join state for node %u\n", idx);
2276 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2277 }
2278
2273 /* check to see if the node is already considered dead */ 2279 /* check to see if the node is already considered dead */
2274 if (!test_bit(idx, dlm->live_nodes_map)) { 2280 if (!test_bit(idx, dlm->live_nodes_map)) {
2275 mlog(0, "for domain %s, node %d is already dead. " 2281 mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2288 2294
2289 clear_bit(idx, dlm->live_nodes_map); 2295 clear_bit(idx, dlm->live_nodes_map);
2290 2296
2291 /* Clean up join state on node death. */
2292 if (dlm->joining_node == idx) {
2293 mlog(0, "Clearing join state for node %u\n", idx);
2294 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2295 }
2296
2297 /* make sure local cleanup occurs before the heartbeat events */ 2297 /* make sure local cleanup occurs before the heartbeat events */
2298 if (!test_bit(idx, dlm->recovery_map)) 2298 if (!test_bit(idx, dlm->recovery_map))
2299 dlm_do_local_recovery_cleanup(dlm, idx); 2299 dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2321 if (!dlm_grab(dlm)) 2321 if (!dlm_grab(dlm))
2322 return; 2322 return;
2323 2323
2324 /*
2325 * This will notify any dlm users that a node in our domain
2326 * went away without notifying us first.
2327 */
2328 if (test_bit(idx, dlm->domain_map))
2329 dlm_fire_domain_eviction_callbacks(dlm, idx);
2330
2324 spin_lock(&dlm->spinlock); 2331 spin_lock(&dlm->spinlock);
2325 __dlm_hb_node_down(dlm, idx); 2332 __dlm_hb_node_down(dlm, idx);
2326 spin_unlock(&dlm->spinlock); 2333 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmver.h" 29#include "dlmver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8f..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1089 mlog_entry_void(); 1141 mlog_entry_void();
1090 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1091 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1092 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1093 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1094 mlog_exit_void(); 1146 mlog_exit_void();
1095} 1147}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1147 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1148 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1149 */ 1201 */
1150 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1151 if (ret) {
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1157 if (ret) { 1203 if (ret) {
1158 mlog_errno(ret); 1204 mlog_errno(ret);
1159 goto bail; 1205 goto bail;
@@ -1311,76 +1357,221 @@ out:
1311 mlog_exit_void(); 1357 mlog_exit_void();
1312} 1358}
1313 1359
1314int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1315 int write, 1361 int level)
1316 int arg_flags)
1317{ 1362{
1318 int status = 0, level; 1363 int ret;
1319 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1321 1367
1322 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1323 1369
1324 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1325 1385
1326 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1327 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1328 write ? "EXMODE" : "PRMODE"); 1388 }
1329 1389
1330 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1331 * rodevices. */ 1391 /*
1332 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1333 if (write) { 1393 * point to restarting the syscall.
1334 status = -EROFS; 1394 */
1335 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
1411 * seperate path to the "low-level" dlm calls. In particular:
1412 *
1413 * - No optimization of lock levels is done - we take at exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
1417 * no-lock at unlock time. This also means flock locks never go on
1418 * the blocking list).
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving applications flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1336 } 1462 }
1337 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1338 } 1470 }
1339 1471
1340 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1341 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1342 1476
1343 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1344 1479
1345 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1346 1490
1347 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1348 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1349 if (status < 0 && status != -EAGAIN) 1493 goto out;
1350 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
1511 * to just bubble sucess back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1351 1515
1352out: 1516out:
1353 mlog_exit(status); 1517
1354 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1355} 1521}
1356 1522
1357/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1358int ocfs2_data_lock_with_page(struct inode *inode,
1359 int write,
1360 struct page *page)
1361{ 1524{
1362 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1363 1531
1364 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1365 if (ret == -EAGAIN) { 1533
1366 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1367 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1368 ocfs2_data_unlock(inode, write); 1536
1369 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1370 } 1559 }
1371 1560
1372 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1373} 1564}
1374 1565
1375static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1376 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1377{ 1568{
1378 int kick = 0; 1569 int kick = 0;
1379 1570
1380 mlog_entry_void(); 1571 mlog_entry_void();
1381 1572
1382 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1383 * the vote thread * pre-emptively when we reach a release 1574 * the downconvert thread * pre-emptively when we reach a release
1384 * condition. */ 1575 * condition. */
1385 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1386 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1398 } 1589 }
1399 1590
1400 if (kick) 1591 if (kick)
1401 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1402
1403 mlog_exit_void();
1404}
1405
1406void ocfs2_data_unlock(struct inode *inode,
1407 int write)
1408{
1409 int level = write ? LKM_EXMODE : LKM_PRMODE;
1410 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1411 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1412
1413 mlog_entry_void();
1414
1415 mlog(0, "inode %llu drop %s DATA lock\n",
1416 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1417 write ? "EXMODE" : "PRMODE");
1418
1419 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1420 !ocfs2_mount_local(osb))
1421 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1422 1593
1423 mlog_exit_void(); 1594 mlog_exit_void();
1424} 1595}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1442 1613
1443/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1444 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1445 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1446static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1447{ 1618{
1448 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1449 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1450 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1451 1622
1452 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1496static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1497{ 1668{
1498 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1499 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1500 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1501 1672
1502 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1604} 1775}
1605 1776
1606/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1607static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1608 struct buffer_head **bh) 1779 struct buffer_head **bh)
1609{ 1780{
1610 int status = 0; 1781 int status = 0;
1611 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1612 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1613 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1615 1786
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1721 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1722 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1723 */ 1894 */
1724int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1725 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1726 int ex, 1897 int ex,
1727 int arg_flags) 1898 int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1756 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1757 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1758 1929
1759 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1760 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1761 dlm_flags = 0; 1932 dlm_flags = 0;
1762 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
1795 } 1966 }
1796 1967
1797 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1798 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1799 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1800 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1801 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1802 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1803 if (status < 0) { 1974 if (status < 0) {
1804 if (status != -ENOENT) 1975 if (status != -ENOENT)
1805 mlog_errno(status); 1976 mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
1821 *ret_bh = NULL; 1992 *ret_bh = NULL;
1822 } 1993 }
1823 if (acquired) 1994 if (acquired)
1824 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1825 } 1996 }
1826 1997
1827 if (local_bh) 1998 if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
1832} 2003}
1833 2004
1834/* 2005/*
1835 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1836 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1837 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1838 * 2009 *
1839 * ** These _with_page variantes are only intended to be called from aop 2010 * ** These _with_page variantes are only intended to be called from aop
1840 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1841 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1842 * 2013 *
1843 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1844 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1845 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1846 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1847 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1848 * 2020 *
1849 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1850 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
1852 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1853 * the lock inversion simply. 2025 * the lock inversion simply.
1854 */ 2026 */
1855int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1856 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1857 int ex, 2029 int ex,
1858 struct page *page) 2030 struct page *page)
1859{ 2031{
1860 int ret; 2032 int ret;
1861 2033
1862 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1863 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1864 unlock_page(page); 2036 unlock_page(page);
1865 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1866 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1867 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1868 } 2040 }
1869 2041
1870 return ret; 2042 return ret;
1871} 2043}
1872 2044
1873int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1874 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1875 int *level) 2047 int *level)
1876{ 2048{
1877 int ret; 2049 int ret;
1878 2050
1879 mlog_entry_void(); 2051 mlog_entry_void();
1880 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1881 if (ret < 0) { 2053 if (ret < 0) {
1882 mlog_errno(ret); 2054 mlog_errno(ret);
1883 return ret; 2055 return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1890 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1891 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1892 2064
1893 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1894 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1895 if (ret < 0) { 2067 if (ret < 0) {
1896 mlog_errno(ret); 2068 mlog_errno(ret);
1897 return ret; 2069 return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1908 return ret; 2080 return ret;
1909} 2081}
1910 2082
1911void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1912 int ex) 2084 int ex)
1913{ 2085{
1914 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1915 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1916 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1917 2089
1918 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2320 goto bail; 2492 goto bail;
2321 } 2493 }
2322 2494
2323 /* launch vote thread */ 2495 /* launch downconvert thread */
2324 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2325 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2326 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2327 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2328 mlog_errno(status); 2500 mlog_errno(status);
2329 goto bail; 2501 goto bail;
2330 } 2502 }
@@ -2353,8 +2525,8 @@ local:
2353bail: 2525bail:
2354 if (status < 0) { 2526 if (status < 0) {
2355 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2356 if (osb->vote_task) 2528 if (osb->dc_task)
2357 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2358 } 2530 }
2359 2531
2360 mlog_exit(status); 2532 mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2369 2541
2370 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2371 2543
2372 if (osb->vote_task) { 2544 if (osb->dc_task) {
2373 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2374 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2375 } 2547 }
2376 2548
2377 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
2527 2699
2528/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2529 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2530 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2531 * it safe to drop. 2703 * it safe to drop.
2532 * 2704 *
2533 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2590 status = err; 2762 status = err;
2591 2763
2592 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2593 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2594 if (err < 0)
2595 mlog_errno(err);
2596 if (err < 0 && !status)
2597 status = err;
2598
2599 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2600 &OCFS2_I(inode)->ip_meta_lockres);
2601 if (err < 0) 2766 if (err < 0)
2602 mlog_errno(err); 2767 mlog_errno(err);
2603 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2850 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2851 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2852 3017
3018 if (S_ISREG(inode->i_mode))
3019 goto out;
3020
2853 /* 3021 /*
2854 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2855 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2875 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2876 } 3044 }
2877 3045
3046out:
2878 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2879} 3048}
2880 3049
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2903 3072
2904/* 3073/*
2905 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2906 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2907 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2908 */ 3077 */
2909static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3042 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3043 3212
3044 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3045 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3046 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3047 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3048 * performance. */ 3217 * performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3091 3260
3092 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3093 3262
3094 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3095 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3096 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3097 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3098 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3099 } 3268 }
3100 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3270
3271 mlog_exit_void();
3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3101 3304
3102 mlog_exit_void(); 3305 mlog_exit_void();
3103} 3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
3358 /* make sure the voting thread gets a swipe at whatever changes
3359 * the caller may have made to the voting state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 53/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 54#define OCFS2_META_LOCK_RECOVERY (0x01)
55/* Instruct the dlm not to queue ourselves on the other node. */ 55/* Instruct the dlm not to queue ourselves on the other node. */
56#define OCFS2_META_LOCK_NOQUEUE (0x02) 56#define OCFS2_META_LOCK_NOQUEUE (0x02)
57/* don't block waiting for the vote thread, instead return -EAGAIN */ 57/* don't block waiting for the downconvert thread, instead return -EAGAIN */
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
66 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, 67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode); 68 u64 parent, struct inode *inode);
69struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp);
69void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 72void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
70int ocfs2_create_new_inode_locks(struct inode *inode); 73int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_drop_inode_locks(struct inode *inode); 74int ocfs2_drop_inode_locks(struct inode *inode);
72int ocfs2_data_lock_full(struct inode *inode,
73 int write,
74 int arg_flags);
75#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
76int ocfs2_data_lock_with_page(struct inode *inode,
77 int write,
78 struct page *page);
79void ocfs2_data_unlock(struct inode *inode,
80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 75int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 76void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode); 77int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write); 78int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode); 79void ocfs2_open_unlock(struct inode *inode);
86int ocfs2_meta_lock_atime(struct inode *inode, 80int ocfs2_inode_lock_atime(struct inode *inode,
87 struct vfsmount *vfsmnt, 81 struct vfsmount *vfsmnt,
88 int *level); 82 int *level);
89int ocfs2_meta_lock_full(struct inode *inode, 83int ocfs2_inode_lock_full(struct inode *inode,
90 struct buffer_head **ret_bh, 84 struct buffer_head **ret_bh,
91 int ex, 85 int ex,
92 int arg_flags); 86 int arg_flags);
93int ocfs2_meta_lock_with_page(struct inode *inode, 87int ocfs2_inode_lock_with_page(struct inode *inode,
94 struct buffer_head **ret_bh, 88 struct buffer_head **ret_bh,
95 int ex, 89 int ex,
96 struct page *page); 90 struct page *page);
97/* 99% of the time we don't want to supply any additional flags -- 91/* 99% of the time we don't want to supply any additional flags --
98 * those are for very specific cases only. */ 92 * those are for very specific cases only. */
99#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) 93#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
100void ocfs2_meta_unlock(struct inode *inode, 94void ocfs2_inode_unlock(struct inode *inode,
101 int ex); 95 int ex);
102int ocfs2_super_lock(struct ocfs2_super *osb, 96int ocfs2_super_lock(struct ocfs2_super *osb,
103 int ex); 97 int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
107void ocfs2_rename_unlock(struct ocfs2_super *osb); 101void ocfs2_rename_unlock(struct ocfs2_super *osb);
108int ocfs2_dentry_lock(struct dentry *dentry, int ex); 102int ocfs2_dentry_lock(struct dentry *dentry, int ex);
109void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 103void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file);
110 106
111void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
112void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
113 struct ocfs2_lock_res *lockres); 109 struct ocfs2_lock_res *lockres);
114 110
115/* for the vote thread */ 111/* for the downconvert thread */
116void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 112void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
117 struct ocfs2_lock_res *lockres); 113 struct ocfs2_lock_res *lockres);
114void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
118 115
119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
37 *var = cpu_to_le64(le64_to_cpu(*var) + val); 37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38} 38}
39 39
40static inline void le32_and_cpu(__le32 *var, u32 val)
41{
42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
43}
44
45static inline void be32_add_cpu(__be32 *var, u32 val) 40static inline void be32_add_cpu(__be32 *var, u32 val)
46{ 41{
47 *var = cpu_to_be32(be32_to_cpu(*var) + val); 42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
58 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
59 } 59 }
60 60
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
62 62
63 if (IS_ERR(inode)) 63 if (IS_ERR(inode))
64 return (void *)inode; 64 return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
95 mlog(0, "find parent of directory %llu\n", 95 mlog(0, "find parent of directory %llu\n",
96 (unsigned long long)OCFS2_I(dir)->ip_blkno); 96 (unsigned long long)OCFS2_I(dir)->ip_blkno);
97 97
98 status = ocfs2_meta_lock(dir, NULL, 0); 98 status = ocfs2_inode_lock(dir, NULL, 0);
99 if (status < 0) { 99 if (status < 0) {
100 if (status != -ENOENT) 100 if (status != -ENOENT)
101 mlog_errno(status); 101 mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
109 goto bail_unlock; 109 goto bail_unlock;
110 } 110 }
111 111
112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
113 if (IS_ERR(inode)) { 113 if (IS_ERR(inode)) {
114 mlog(ML_ERROR, "Unable to create inode %llu\n", 114 mlog(ML_ERROR, "Unable to create inode %llu\n",
115 (unsigned long long)blkno); 115 (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
126 parent->d_op = &ocfs2_dentry_ops; 126 parent->d_op = &ocfs2_dentry_ops;
127 127
128bail_unlock: 128bail_unlock:
129 ocfs2_meta_unlock(dir, 0); 129 ocfs2_inode_unlock(dir, 0);
130 130
131bail: 131bail:
132 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e42..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
51#include "inode.h" 51#include "inode.h"
52#include "ioctl.h" 52#include "ioctl.h"
53#include "journal.h" 53#include "journal.h"
54#include "locks.h"
54#include "mmap.h" 55#include "mmap.h"
55#include "suballoc.h" 56#include "suballoc.h"
56#include "super.h" 57#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
63 return sync_mapping_buffers(inode->i_mapping); 64 return sync_mapping_buffers(inode->i_mapping);
64} 65}
65 66
67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68{
69 struct ocfs2_file_private *fp;
70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp)
73 return -ENOMEM;
74
75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp;
79
80 return 0;
81}
82
83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84{
85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87
88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp);
92 file->private_data = NULL;
93 }
94}
95
66static int ocfs2_file_open(struct inode *inode, struct file *file) 96static int ocfs2_file_open(struct inode *inode, struct file *file)
67{ 97{
68 int status; 98 int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
89 119
90 oi->ip_open_count++; 120 oi->ip_open_count++;
91 spin_unlock(&oi->ip_lock); 121 spin_unlock(&oi->ip_lock);
92 status = 0; 122
123 status = ocfs2_init_file_private(inode, file);
124 if (status) {
125 /*
126 * We want to set open count back if we're failing the
127 * open.
128 */
129 spin_lock(&oi->ip_lock);
130 oi->ip_open_count--;
131 spin_unlock(&oi->ip_lock);
132 }
133
93leave: 134leave:
94 mlog_exit(status); 135 mlog_exit(status);
95 return status; 136 return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
109 spin_unlock(&oi->ip_lock); 150 spin_unlock(&oi->ip_lock);
110 151
152 ocfs2_free_file_private(inode, file);
153
111 mlog_exit(0); 154 mlog_exit(0);
112 155
113 return 0; 156 return 0;
114} 157}
115 158
159static int ocfs2_dir_open(struct inode *inode, struct file *file)
160{
161 return ocfs2_init_file_private(inode, file);
162}
163
164static int ocfs2_dir_release(struct inode *inode, struct file *file)
165{
166 ocfs2_free_file_private(inode, file);
167 return 0;
168}
169
116static int ocfs2_sync_file(struct file *file, 170static int ocfs2_sync_file(struct file *file,
117 struct dentry *dentry, 171 struct dentry *dentry,
118 int datasync) 172 int datasync)
@@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
382 436
383 down_write(&OCFS2_I(inode)->ip_alloc_sem); 437 down_write(&OCFS2_I(inode)->ip_alloc_sem);
384 438
385 /* This forces other nodes to sync and drop their pages. Do 439 /*
386 * this even if we have a truncate without allocation change - 440 * The inode lock forced other nodes to sync and drop their
387 * ocfs2 cluster sizes can be much greater than page size, so 441 * pages, which (correctly) happens even if we have a truncate
388 * we have to truncate them anyway. */ 442 * without allocation change - ocfs2 cluster sizes can be much
389 status = ocfs2_data_lock(inode, 1); 443 * greater than page size, so we have to truncate them
390 if (status < 0) { 444 * anyway.
391 up_write(&OCFS2_I(inode)->ip_alloc_sem); 445 */
392
393 mlog_errno(status);
394 goto bail;
395 }
396
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 446 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 447 truncate_inode_pages(inode->i_mapping, new_i_size);
399 448
@@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
403 if (status) 452 if (status)
404 mlog_errno(status); 453 mlog_errno(status);
405 454
406 goto bail_unlock_data; 455 goto bail_unlock_sem;
407 } 456 }
408 457
409 /* alright, we're going to need to do a full blown alloc size 458 /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
413 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 462 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
414 if (status < 0) { 463 if (status < 0) {
415 mlog_errno(status); 464 mlog_errno(status);
416 goto bail_unlock_data; 465 goto bail_unlock_sem;
417 } 466 }
418 467
419 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 468 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
420 if (status < 0) { 469 if (status < 0) {
421 mlog_errno(status); 470 mlog_errno(status);
422 goto bail_unlock_data; 471 goto bail_unlock_sem;
423 } 472 }
424 473
425 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 474 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
426 if (status < 0) { 475 if (status < 0) {
427 mlog_errno(status); 476 mlog_errno(status);
428 goto bail_unlock_data; 477 goto bail_unlock_sem;
429 } 478 }
430 479
431 /* TODO: orphan dir cleanup here. */ 480 /* TODO: orphan dir cleanup here. */
432bail_unlock_data: 481bail_unlock_sem:
433 ocfs2_data_unlock(inode, 1);
434
435 up_write(&OCFS2_I(inode)->ip_alloc_sem); 482 up_write(&OCFS2_I(inode)->ip_alloc_sem);
436 483
437bail: 484bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
579 626
580 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
581 "clusters_to_add = %u, extents_to_split = %u\n", 628 "clusters_to_add = %u, extents_to_split = %u\n",
582 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
583 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
584 631
585 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
760 le32_to_cpu(fe->i_clusters), 807 le32_to_cpu(fe->i_clusters),
761 (unsigned long long)le64_to_cpu(fe->i_size)); 808 (unsigned long long)le64_to_cpu(fe->i_size));
762 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 809 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
763 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 810 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
764 811
765leave: 812leave:
766 if (handle) { 813 if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
917 struct buffer_head *di_bh, 964 struct buffer_head *di_bh,
918 u64 new_i_size) 965 u64 new_i_size)
919{ 966{
920 int ret = 0, data_locked = 0; 967 int ret = 0;
921 struct ocfs2_inode_info *oi = OCFS2_I(inode); 968 struct ocfs2_inode_info *oi = OCFS2_I(inode);
922 969
923 BUG_ON(!di_bh); 970 BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 990 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
944 goto out_update_size; 991 goto out_update_size;
945 992
946 /*
947 * protect the pages that ocfs2_zero_extend is going to be
948 * pulling into the page cache.. we do this before the
949 * metadata extend so that we don't get into the situation
950 * where we've extended the metadata but can't get the data
951 * lock to zero.
952 */
953 ret = ocfs2_data_lock(inode, 1);
954 if (ret < 0) {
955 mlog_errno(ret);
956 goto out;
957 }
958 data_locked = 1;
959
960 /* 993 /*
961 * The alloc sem blocks people in read/write from reading our 994 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on 995 * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
980 up_write(&oi->ip_alloc_sem); 1013 up_write(&oi->ip_alloc_sem);
981 1014
982 mlog_errno(ret); 1015 mlog_errno(ret);
983 goto out_unlock; 1016 goto out;
984 } 1017 }
985 } 1018 }
986 1019
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
991 1024
992 if (ret < 0) { 1025 if (ret < 0) {
993 mlog_errno(ret); 1026 mlog_errno(ret);
994 goto out_unlock; 1027 goto out;
995 } 1028 }
996 1029
997out_update_size: 1030out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
999 if (ret < 0) 1032 if (ret < 0)
1000 mlog_errno(ret); 1033 mlog_errno(ret);
1001 1034
1002out_unlock:
1003 if (data_locked)
1004 ocfs2_data_unlock(inode, 1);
1005
1006out: 1035out:
1007 return ret; 1036 return ret;
1008} 1037}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1050 } 1079 }
1051 } 1080 }
1052 1081
1053 status = ocfs2_meta_lock(inode, &bh, 1); 1082 status = ocfs2_inode_lock(inode, &bh, 1);
1054 if (status < 0) { 1083 if (status < 0) {
1055 if (status != -ENOENT) 1084 if (status != -ENOENT)
1056 mlog_errno(status); 1085 mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1102bail_commit: 1131bail_commit:
1103 ocfs2_commit_trans(osb, handle); 1132 ocfs2_commit_trans(osb, handle);
1104bail_unlock: 1133bail_unlock:
1105 ocfs2_meta_unlock(inode, 1); 1134 ocfs2_inode_unlock(inode, 1);
1106bail_unlock_rw: 1135bail_unlock_rw:
1107 if (size_change) 1136 if (size_change)
1108 ocfs2_rw_unlock(inode, 1); 1137 ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1149 1178
1150 mlog_entry_void(); 1179 mlog_entry_void();
1151 1180
1152 ret = ocfs2_meta_lock(inode, NULL, 0); 1181 ret = ocfs2_inode_lock(inode, NULL, 0);
1153 if (ret) { 1182 if (ret) {
1154 if (ret != -ENOENT) 1183 if (ret != -ENOENT)
1155 mlog_errno(ret); 1184 mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1158 1187
1159 ret = generic_permission(inode, mask, NULL); 1188 ret = generic_permission(inode, mask, NULL);
1160 1189
1161 ocfs2_meta_unlock(inode, 0); 1190 ocfs2_inode_unlock(inode, 0);
1162out: 1191out:
1163 mlog_exit(ret); 1192 mlog_exit(ret);
1164 return ret; 1193 return ret;
@@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1630 goto out; 1659 goto out;
1631 } 1660 }
1632 1661
1633 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1662 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1634 if (ret) { 1663 if (ret) {
1635 mlog_errno(ret); 1664 mlog_errno(ret);
1636 goto out_rw_unlock; 1665 goto out_rw_unlock;
@@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1638 1667
1639 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1668 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1640 ret = -EPERM; 1669 ret = -EPERM;
1641 goto out_meta_unlock; 1670 goto out_inode_unlock;
1642 } 1671 }
1643 1672
1644 switch (sr->l_whence) { 1673 switch (sr->l_whence) {
@@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1652 break; 1681 break;
1653 default: 1682 default:
1654 ret = -EINVAL; 1683 ret = -EINVAL;
1655 goto out_meta_unlock; 1684 goto out_inode_unlock;
1656 } 1685 }
1657 sr->l_whence = 0; 1686 sr->l_whence = 0;
1658 1687
@@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1663 || (sr->l_start + llen) < 0 1692 || (sr->l_start + llen) < 0
1664 || (sr->l_start + llen) > max_off) { 1693 || (sr->l_start + llen) > max_off) {
1665 ret = -EINVAL; 1694 ret = -EINVAL;
1666 goto out_meta_unlock; 1695 goto out_inode_unlock;
1667 } 1696 }
1668 size = sr->l_start + sr->l_len; 1697 size = sr->l_start + sr->l_len;
1669 1698
1670 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1699 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1671 if (sr->l_len <= 0) { 1700 if (sr->l_len <= 0) {
1672 ret = -EINVAL; 1701 ret = -EINVAL;
1673 goto out_meta_unlock; 1702 goto out_inode_unlock;
1674 } 1703 }
1675 } 1704 }
1676 1705
@@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1678 ret = __ocfs2_write_remove_suid(inode, di_bh); 1707 ret = __ocfs2_write_remove_suid(inode, di_bh);
1679 if (ret) { 1708 if (ret) {
1680 mlog_errno(ret); 1709 mlog_errno(ret);
1681 goto out_meta_unlock; 1710 goto out_inode_unlock;
1682 } 1711 }
1683 } 1712 }
1684 1713
@@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1704 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1733 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1705 if (ret) { 1734 if (ret) {
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out_meta_unlock; 1736 goto out_inode_unlock;
1708 } 1737 }
1709 1738
1710 /* 1739 /*
@@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1714 if (IS_ERR(handle)) { 1743 if (IS_ERR(handle)) {
1715 ret = PTR_ERR(handle); 1744 ret = PTR_ERR(handle);
1716 mlog_errno(ret); 1745 mlog_errno(ret);
1717 goto out_meta_unlock; 1746 goto out_inode_unlock;
1718 } 1747 }
1719 1748
1720 if (change_size && i_size_read(inode) < size) 1749 if (change_size && i_size_read(inode) < size)
@@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1727 1756
1728 ocfs2_commit_trans(osb, handle); 1757 ocfs2_commit_trans(osb, handle);
1729 1758
1730out_meta_unlock: 1759out_inode_unlock:
1731 brelse(di_bh); 1760 brelse(di_bh);
1732 ocfs2_meta_unlock(inode, 1); 1761 ocfs2_inode_unlock(inode, 1);
1733out_rw_unlock: 1762out_rw_unlock:
1734 ocfs2_rw_unlock(inode, 1); 1763 ocfs2_rw_unlock(inode, 1);
1735 1764
@@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1799 * if we need to make modifications here. 1828 * if we need to make modifications here.
1800 */ 1829 */
1801 for(;;) { 1830 for(;;) {
1802 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1831 ret = ocfs2_inode_lock(inode, NULL, meta_level);
1803 if (ret < 0) { 1832 if (ret < 0) {
1804 meta_level = -1; 1833 meta_level = -1;
1805 mlog_errno(ret); 1834 mlog_errno(ret);
@@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1817 * set inode->i_size at the end of a write. */ 1846 * set inode->i_size at the end of a write. */
1818 if (should_remove_suid(dentry)) { 1847 if (should_remove_suid(dentry)) {
1819 if (meta_level == 0) { 1848 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level); 1849 ocfs2_inode_unlock(inode, meta_level);
1821 meta_level = 1; 1850 meta_level = 1;
1822 continue; 1851 continue;
1823 } 1852 }
@@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1886 *ppos = saved_pos; 1915 *ppos = saved_pos;
1887 1916
1888out_unlock: 1917out_unlock:
1889 ocfs2_meta_unlock(inode, meta_level); 1918 ocfs2_inode_unlock(inode, meta_level);
1890 1919
1891out: 1920out:
1892 return ret; 1921 return ret;
@@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2099 /* 2128 /*
2100 * See the comment in ocfs2_file_aio_read() 2129 * See the comment in ocfs2_file_aio_read()
2101 */ 2130 */
2102 ret = ocfs2_meta_lock(inode, NULL, 0); 2131 ret = ocfs2_inode_lock(inode, NULL, 0);
2103 if (ret < 0) { 2132 if (ret < 0) {
2104 mlog_errno(ret); 2133 mlog_errno(ret);
2105 goto bail; 2134 goto bail;
2106 } 2135 }
2107 ocfs2_meta_unlock(inode, 0); 2136 ocfs2_inode_unlock(inode, 0);
2108 2137
2109 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2138 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2110 2139
@@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2160 * like i_size. This allows the checks down below 2189 * like i_size. This allows the checks down below
2161 * generic_file_aio_read() a chance of actually working. 2190 * generic_file_aio_read() a chance of actually working.
2162 */ 2191 */
2163 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2192 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2164 if (ret < 0) { 2193 if (ret < 0) {
2165 mlog_errno(ret); 2194 mlog_errno(ret);
2166 goto bail; 2195 goto bail;
2167 } 2196 }
2168 ocfs2_meta_unlock(inode, lock_level); 2197 ocfs2_inode_unlock(inode, lock_level);
2169 2198
2170 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2199 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2171 if (ret == -EINVAL) 2200 if (ret == -EINVAL)
@@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2204}; 2233};
2205 2234
2206const struct file_operations ocfs2_fops = { 2235const struct file_operations ocfs2_fops = {
2236 .llseek = generic_file_llseek,
2207 .read = do_sync_read, 2237 .read = do_sync_read,
2208 .write = do_sync_write, 2238 .write = do_sync_write,
2209 .mmap = ocfs2_mmap, 2239 .mmap = ocfs2_mmap,
@@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
2216#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2217 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2218#endif 2248#endif
2249 .flock = ocfs2_flock,
2219 .splice_read = ocfs2_file_splice_read, 2250 .splice_read = ocfs2_file_splice_read,
2220 .splice_write = ocfs2_file_splice_write, 2251 .splice_write = ocfs2_file_splice_write,
2221}; 2252};
2222 2253
2223const struct file_operations ocfs2_dops = { 2254const struct file_operations ocfs2_dops = {
2255 .llseek = generic_file_llseek,
2224 .read = generic_read_dir, 2256 .read = generic_read_dir,
2225 .readdir = ocfs2_readdir, 2257 .readdir = ocfs2_readdir,
2226 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open,
2227 .ioctl = ocfs2_ioctl, 2261 .ioctl = ocfs2_ioctl,
2228#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2229 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2230#endif 2264#endif
2265 .flock = ocfs2_flock,
2231}; 2266};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 32extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 33struct ocfs2_alloc_context;
34 34
35struct ocfs2_file_private {
36 struct file *fp_file;
37 struct mutex fp_mutex;
38 struct ocfs2_lock_res fp_flock;
39};
40
35enum ocfs2_alloc_restarted { 41enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0, 42 RESTART_NONE = 0,
37 RESTART_TRANS, 43 RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h> 31#include <linux/kmod.h>
32 32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h> 33#include <dlm/dlmapi.h>
37 34
38#define MLOG_MASK_PREFIX ML_SUPER 35#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
44#include "heartbeat.h" 41#include "heartbeat.h"
45#include "inode.h" 42#include "inode.h"
46#include "journal.h" 43#include "journal.h"
47#include "vote.h"
48 44
49#include "buffer_head_io.h" 45#include "buffer_head_io.h"
50 46
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 47static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit); 48 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
64void ocfs2_init_node_maps(struct ocfs2_super *osb) 57void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{ 58{
66 spin_lock_init(&osb->node_map_lock); 59 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 60 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
71} 62}
72 63
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
87 return; 78 return;
88 } 79 }
89 80
90 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
91 /* If a node is in the umount map, then we've been
92 * expecting him to go down and we know ahead of time
93 * that recovery is not necessary. */
94 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
95 return;
96 }
97
98 ocfs2_recovery_thread(osb, node_num); 81 ocfs2_recovery_thread(osb, node_num);
99
100 ocfs2_remove_node_from_vote_queues(osb, node_num);
101}
102
103static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
104 int node_num,
105 void *data)
106{
107 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
108} 82}
109 83
110/* Called from the dlm when it's about to evict a node. We may also 84/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
121 ocfs2_do_node_down(node_num, osb); 95 ocfs2_do_node_down(node_num, osb);
122} 96}
123 97
124static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
125 int node_num,
126 void *data)
127{
128 struct ocfs2_super *osb = data;
129
130 BUG_ON(osb->node_num == node_num);
131
132 mlog(0, "node up event for %d\n", node_num);
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) 98void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
137{ 99{
138 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
139 ocfs2_hb_node_down_cb, osb,
140 OCFS2_HB_NODE_DOWN_PRI);
141
142 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
143 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
144
145 /* Not exactly a heartbeat callback, but leads to essentially 100 /* Not exactly a heartbeat callback, but leads to essentially
146 * the same path so we set it up here. */ 101 * the same path so we set it up here. */
147 dlm_setup_eviction_cb(&osb->osb_eviction_cb, 102 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
149 osb); 104 osb);
150} 105}
151 106
152/* Most functions here are just stubs for now... */
153int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
154{
155 int status;
156
157 if (ocfs2_mount_local(osb))
158 return 0;
159
160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) {
162 mlog_errno(status);
163 goto bail;
164 }
165
166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) {
168 mlog_errno(status);
169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 }
171
172bail:
173 return status;
174}
175
176void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
177{
178 if (ocfs2_mount_local(osb))
179 return;
180
181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183}
184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 107void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
186{ 108{
187 int ret; 109 int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
341 263
342 spin_lock(&osb->node_map_lock); 264 spin_lock(&osb->node_map_lock);
343 265
344 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
345
346 if (!test_bit(num, osb->recovery_map.map)) { 266 if (!test_bit(num, osb->recovery_map.map)) {
347 __ocfs2_node_map_set_bit(&osb->recovery_map, num); 267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
348 set = 1; 268 set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb); 32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35 33
36/* node map functions - used to keep track of mounted and in-recovery 34/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f35..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "vote.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
55 54
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
58 u64 fi_blkno; 57 u64 fi_blkno;
59 unsigned long fi_ino; 58 unsigned long fi_ino;
60 unsigned int fi_flags; 59 unsigned int fi_flags;
60 unsigned int fi_sysfile_type;
61}; 61};
62 62
63static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
64
63static int ocfs2_read_locked_inode(struct inode *inode, 65static int ocfs2_read_locked_inode(struct inode *inode,
64 struct ocfs2_find_inode_args *args); 66 struct ocfs2_find_inode_args *args);
65static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
107 oi->ip_attr |= OCFS2_DIRSYNC_FL; 109 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 110}
109 111
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
113 int sysfile_type)
111{ 114{
112 struct inode *inode = NULL; 115 struct inode *inode = NULL;
113 struct super_block *sb = osb->sb; 116 struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
127 args.fi_blkno = blkno; 130 args.fi_blkno = blkno;
128 args.fi_flags = flags; 131 args.fi_flags = flags;
129 args.fi_ino = ino_from_blkno(sb, blkno); 132 args.fi_ino = ino_from_blkno(sb, blkno);
133 args.fi_sysfile_type = sysfile_type;
130 134
131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 135 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
132 ocfs2_init_locked_inode, &args); 136 ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
201 205
202 inode->i_ino = args->fi_ino; 206 inode->i_ino = args->fi_ino;
203 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 207 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
208 if (args->fi_sysfile_type != 0)
209 lockdep_set_class(&inode->i_mutex,
210 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
204 211
205 mlog_exit(0); 212 mlog_exit(0);
206 return 0; 213 return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
322 */ 329 */
323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 330 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
324 331
325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
326 OCFS2_LOCK_TYPE_META, 0, inode); 333 OCFS2_LOCK_TYPE_META, 0, inode);
327 334
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 335 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
333 OCFS2_LOCK_TYPE_RW, inode->i_generation, 340 OCFS2_LOCK_TYPE_RW, inode->i_generation,
334 inode); 341 inode);
335 342
336 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
337 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
338 inode);
339
340 ocfs2_set_inode_flags(inode); 343 ocfs2_set_inode_flags(inode);
341 344
342 status = 0; 345 status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
414 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 417 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
415 generation = osb->fs_generation; 418 generation = osb->fs_generation;
416 419
417 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 420 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
418 OCFS2_LOCK_TYPE_META, 421 OCFS2_LOCK_TYPE_META,
419 generation, inode); 422 generation, inode);
420 423
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
429 mlog_errno(status); 432 mlog_errno(status);
430 return status; 433 return status;
431 } 434 }
432 status = ocfs2_meta_lock(inode, NULL, 0); 435 status = ocfs2_inode_lock(inode, NULL, 0);
433 if (status) { 436 if (status) {
434 make_bad_inode(inode); 437 make_bad_inode(inode);
435 mlog_errno(status); 438 mlog_errno(status);
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 487
485bail: 488bail:
486 if (can_lock) 489 if (can_lock)
487 ocfs2_meta_unlock(inode, 0); 490 ocfs2_inode_unlock(inode, 0);
488 491
489 if (status < 0) 492 if (status < 0)
490 make_bad_inode(inode); 493 make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
586 } 589 }
587 590
588 mutex_lock(&inode_alloc_inode->i_mutex); 591 mutex_lock(&inode_alloc_inode->i_mutex);
589 status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); 592 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
590 if (status < 0) { 593 if (status < 0) {
591 mutex_unlock(&inode_alloc_inode->i_mutex); 594 mutex_unlock(&inode_alloc_inode->i_mutex);
592 595
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
617 } 620 }
618 621
619 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 622 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
620 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 623 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
621 624
622 status = ocfs2_journal_dirty(handle, di_bh); 625 status = ocfs2_journal_dirty(handle, di_bh);
623 if (status < 0) { 626 if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
635bail_commit: 638bail_commit:
636 ocfs2_commit_trans(osb, handle); 639 ocfs2_commit_trans(osb, handle);
637bail_unlock: 640bail_unlock:
638 ocfs2_meta_unlock(inode_alloc_inode, 1); 641 ocfs2_inode_unlock(inode_alloc_inode, 1);
639 mutex_unlock(&inode_alloc_inode->i_mutex); 642 mutex_unlock(&inode_alloc_inode->i_mutex);
640 brelse(inode_alloc_bh); 643 brelse(inode_alloc_bh);
641bail: 644bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
709 * delete_inode operation. We do this now to avoid races with 712 * delete_inode operation. We do this now to avoid races with
710 * recovery completion on other nodes. */ 713 * recovery completion on other nodes. */
711 mutex_lock(&orphan_dir_inode->i_mutex); 714 mutex_lock(&orphan_dir_inode->i_mutex);
712 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 715 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
713 if (status < 0) { 716 if (status < 0) {
714 mutex_unlock(&orphan_dir_inode->i_mutex); 717 mutex_unlock(&orphan_dir_inode->i_mutex);
715 718
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
718 } 721 }
719 722
720 /* we do this while holding the orphan dir lock because we 723 /* we do this while holding the orphan dir lock because we
721 * don't want recovery being run from another node to vote for 724 * don't want recovery being run from another node to try an
722 * an inode delete on us -- this will result in two nodes 725 * inode delete underneath us -- this will result in two nodes
723 * truncating the same file! */ 726 * truncating the same file! */
724 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 727 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
725 if (status < 0) { 728 if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
733 mlog_errno(status); 736 mlog_errno(status);
734 737
735bail_unlock_dir: 738bail_unlock_dir:
736 ocfs2_meta_unlock(orphan_dir_inode, 1); 739 ocfs2_inode_unlock(orphan_dir_inode, 1);
737 mutex_unlock(&orphan_dir_inode->i_mutex); 740 mutex_unlock(&orphan_dir_inode->i_mutex);
738 brelse(orphan_dir_bh); 741 brelse(orphan_dir_bh);
739bail: 742bail:
@@ -744,7 +747,7 @@ bail:
744} 747}
745 748
746/* There is a series of simple checks that should be done before a 749/* There is a series of simple checks that should be done before a
747 * vote is even considered. Encapsulate those in this function. */ 750 * trylock is even considered. Encapsulate those in this function. */
748static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 751static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
749{ 752{
750 int ret = 0; 753 int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
758 goto bail; 761 goto bail;
759 } 762 }
760 763
761 /* If we're coming from process_vote we can't go into our own 764 /* If we're coming from downconvert_thread we can't go into our own
762 * voting [hello, deadlock city!], so unforuntately we just 765 * voting [hello, deadlock city!], so unforuntately we just
763 * have to skip deleting this guy. That's OK though because 766 * have to skip deleting this guy. That's OK though because
764 * the node who's doing the actual deleting should handle it 767 * the node who's doing the actual deleting should handle it
765 * anyway. */ 768 * anyway. */
766 if (current == osb->vote_task) { 769 if (current == osb->dc_task) {
767 mlog(0, "Skipping delete of %lu because we're currently " 770 mlog(0, "Skipping delete of %lu because we're currently "
768 "in process_vote\n", inode->i_ino); 771 "in downconvert\n", inode->i_ino);
769 goto bail; 772 goto bail;
770 } 773 }
771 774
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
779 goto bail_unlock; 782 goto bail_unlock;
780 } 783 }
781 784
782 /* If we have voted "yes" on the wipe of this inode for 785 /* If we have allowd wipe of this inode for another node, it
783 * another node, it will be marked here so we can safely skip 786 * will be marked here so we can safely skip it. Recovery will
784 * it. Recovery will cleanup any inodes we might inadvertantly 787 * cleanup any inodes we might inadvertantly skip here. */
785 * skip here. */
786 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 788 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
787 mlog(0, "Skipping delete of %lu because another node " 789 mlog(0, "Skipping delete of %lu because another node "
788 "has done this for us.\n", inode->i_ino); 790 "has done this for us.\n", inode->i_ino);
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
929 931
930 /* Lock down the inode. This gives us an up to date view of 932 /* Lock down the inode. This gives us an up to date view of
931 * it's metadata (for verification), and allows us to 933 * it's metadata (for verification), and allows us to
932 * serialize delete_inode votes. 934 * serialize delete_inode on multiple nodes.
933 * 935 *
934 * Even though we might be doing a truncate, we don't take the 936 * Even though we might be doing a truncate, we don't take the
935 * allocation lock here as it won't be needed - nobody will 937 * allocation lock here as it won't be needed - nobody will
936 * have the file open. 938 * have the file open.
937 */ 939 */
938 status = ocfs2_meta_lock(inode, &di_bh, 1); 940 status = ocfs2_inode_lock(inode, &di_bh, 1);
939 if (status < 0) { 941 if (status < 0) {
940 if (status != -ENOENT) 942 if (status != -ENOENT)
941 mlog_errno(status); 943 mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
947 * before we go ahead and wipe the inode. */ 949 * before we go ahead and wipe the inode. */
948 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 950 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
949 if (!wipe || status < 0) { 951 if (!wipe || status < 0) {
950 /* Error and inode busy vote both mean we won't be 952 /* Error and remote inode busy both mean we won't be
951 * removing the inode, so they take almost the same 953 * removing the inode, so they take almost the same
952 * path. */ 954 * path. */
953 if (status < 0) 955 if (status < 0)
954 mlog_errno(status); 956 mlog_errno(status);
955 957
956 /* Someone in the cluster has voted to not wipe this 958 /* Someone in the cluster has disallowed a wipe of
957 * inode, or it was never completely orphaned. Write 959 * this inode, or it was never completely
958 * out the pages and exit now. */ 960 * orphaned. Write out the pages and exit now. */
959 ocfs2_cleanup_delete_inode(inode, 1); 961 ocfs2_cleanup_delete_inode(inode, 1);
960 goto bail_unlock_inode; 962 goto bail_unlock_inode;
961 } 963 }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
981 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 983 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
982 984
983bail_unlock_inode: 985bail_unlock_inode:
984 ocfs2_meta_unlock(inode, 1); 986 ocfs2_inode_unlock(inode, 1);
985 brelse(di_bh); 987 brelse(di_bh);
986bail_unblock: 988bail_unblock:
987 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 989 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
1008 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1010 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1009 "Inode=%lu\n", inode->i_ino); 1011 "Inode=%lu\n", inode->i_ino);
1010 1012
1011 /* For remove delete_inode vote, we hold open lock before, 1013 /* To preven remote deletes we hold open lock before, now it
1012 * now it is time to unlock PR and EX open locks. */ 1014 * is time to unlock PR and EX open locks. */
1013 ocfs2_open_unlock(inode); 1015 ocfs2_open_unlock(inode);
1014 1016
1015 /* Do these before all the other work so that we don't bounce 1017 /* Do these before all the other work so that we don't bounce
1016 * the vote thread while waiting to destroy the locks. */ 1018 * the downconvert thread while waiting to destroy the locks. */
1017 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1019 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1018 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1020 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1019 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1020 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1021 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1021 1022
1022 /* We very well may get a clear_inode before all an inodes 1023 /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
1039 mlog_errno(status); 1040 mlog_errno(status);
1040 1041
1041 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1042 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1042 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1043 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1043 ocfs2_lock_res_free(&oi->ip_data_lockres);
1044 ocfs2_lock_res_free(&oi->ip_open_lockres); 1044 ocfs2_lock_res_free(&oi->ip_open_lockres);
1045 1045
1046 ocfs2_metadata_cache_purge(inode); 1046 ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1184 } 1184 }
1185 spin_unlock(&OCFS2_I(inode)->ip_lock); 1185 spin_unlock(&OCFS2_I(inode)->ip_lock);
1186 1186
1187 /* Let ocfs2_meta_lock do the work of updating our struct 1187 /* Let ocfs2_inode_lock do the work of updating our struct
1188 * inode for us. */ 1188 * inode for us. */
1189 status = ocfs2_meta_lock(inode, NULL, 0); 1189 status = ocfs2_inode_lock(inode, NULL, 0);
1190 if (status < 0) { 1190 if (status < 0) {
1191 if (status != -ENOENT) 1191 if (status != -ENOENT)
1192 mlog_errno(status); 1192 mlog_errno(status);
1193 goto bail; 1193 goto bail;
1194 } 1194 }
1195 ocfs2_meta_unlock(inode, 0); 1195 ocfs2_inode_unlock(inode, 0);
1196bail: 1196bail:
1197 mlog_exit(status); 1197 mlog_exit(status);
1198 1198
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
34 u64 ip_blkno; 34 u64 ip_blkno;
35 35
36 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
37 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_inode_lockres;
38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres; 38 struct ocfs2_lock_res ip_open_lockres;
40 39
41 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
121void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
122 121
123/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
124#define OCFS2_FI_FLAG_SYSFILE 0x4 123#define OCFS2_FI_FLAG_SYSFILE 0x1
125#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
126struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
126 int sysfile_type);
127int ocfs2_inode_init_private(struct inode *inode); 127int ocfs2_inode_init_private(struct inode *inode);
128int ocfs2_inode_revalidate(struct dentry *dentry); 128int ocfs2_inode_revalidate(struct dentry *dentry);
129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
20 20
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h"
23 24
24#include <linux/ext2_fs.h> 25#include <linux/ext2_fs.h>
25 26
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
27{ 28{
28 int status; 29 int status;
29 30
30 status = ocfs2_meta_lock(inode, NULL, 0); 31 status = ocfs2_inode_lock(inode, NULL, 0);
31 if (status < 0) { 32 if (status < 0) {
32 mlog_errno(status); 33 mlog_errno(status);
33 return status; 34 return status;
34 } 35 }
35 ocfs2_get_inode_flags(OCFS2_I(inode)); 36 ocfs2_get_inode_flags(OCFS2_I(inode));
36 *flags = OCFS2_I(inode)->ip_attr; 37 *flags = OCFS2_I(inode)->ip_attr;
37 ocfs2_meta_unlock(inode, 0); 38 ocfs2_inode_unlock(inode, 0);
38 39
39 mlog_exit(status); 40 mlog_exit(status);
40 return status; 41 return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
52 53
53 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
54 55
55 status = ocfs2_meta_lock(inode, &bh, 1); 56 status = ocfs2_inode_lock(inode, &bh, 1);
56 if (status < 0) { 57 if (status < 0) {
57 mlog_errno(status); 58 mlog_errno(status);
58 goto bail; 59 goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
100 101
101 ocfs2_commit_trans(osb, handle); 102 ocfs2_commit_trans(osb, handle);
102bail_unlock: 103bail_unlock:
103 ocfs2_meta_unlock(inode, 1); 104 ocfs2_inode_unlock(inode, 1);
104bail: 105bail:
105 mutex_unlock(&inode->i_mutex); 106 mutex_unlock(&inode->i_mutex);
106 107
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115 unsigned int cmd, unsigned long arg) 116 unsigned int cmd, unsigned long arg)
116{ 117{
117 unsigned int flags; 118 unsigned int flags;
119 int new_clusters;
118 int status; 120 int status;
119 struct ocfs2_space_resv sr; 121 struct ocfs2_space_resv sr;
122 struct ocfs2_new_group_input input;
120 123
121 switch (cmd) { 124 switch (cmd) {
122 case OCFS2_IOC_GETFLAGS: 125 case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
140 return -EFAULT; 143 return -EFAULT;
141 144
142 return ocfs2_change_file_space(filp, cmd, &sr); 145 return ocfs2_change_file_space(filp, cmd, &sr);
146 case OCFS2_IOC_GROUP_EXTEND:
147 if (!capable(CAP_SYS_RESOURCE))
148 return -EPERM;
149
150 if (get_user(new_clusters, (int __user *)arg))
151 return -EFAULT;
152
153 return ocfs2_group_extend(inode, new_clusters);
154 case OCFS2_IOC_GROUP_ADD:
155 case OCFS2_IOC_GROUP_ADD64:
156 if (!capable(CAP_SYS_RESOURCE))
157 return -EPERM;
158
159 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
160 return -EFAULT;
161
162 return ocfs2_group_add(inode, &input);
143 default: 163 default:
144 return -ENOTTY; 164 return -ENOTTY;
145 } 165 }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
162 case OCFS2_IOC_RESVSP64: 182 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP: 183 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64: 184 case OCFS2_IOC_UNRESVSP64:
185 case OCFS2_IOC_GROUP_EXTEND:
186 case OCFS2_IOC_GROUP_ADD:
187 case OCFS2_IOC_GROUP_ADD64:
165 break; 188 break;
166 default: 189 default:
167 return -ENOIOCTLCMD; 190 return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8d81f6c1b877..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
44#include "localalloc.h" 44#include "localalloc.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h"
48#include "sysfile.h" 47#include "sysfile.h"
49 48
50#include "buffer_head_io.h" 49#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 102 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 103 journal->j_trans_id, flushed);
105 104
106 ocfs2_kick_vote_thread(osb); 105 ocfs2_wake_downconvert_thread(osb);
107 wake_up(&journal->j_checkpointed); 106 wake_up(&journal->j_checkpointed);
108finally: 107finally:
109 mlog_exit(status); 108 mlog_exit(status);
@@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
314 return err; 313 return err;
315} 314}
316 315
317#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 316#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
318 317
319void ocfs2_set_journal_params(struct ocfs2_super *osb) 318void ocfs2_set_journal_params(struct ocfs2_super *osb)
320{ 319{
321 journal_t *journal = osb->journal->j_journal; 320 journal_t *journal = osb->journal->j_journal;
321 unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
322
323 if (osb->osb_commit_interval)
324 commit_interval = osb->osb_commit_interval;
322 325
323 spin_lock(&journal->j_state_lock); 326 spin_lock(&journal->j_state_lock);
324 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 327 journal->j_commit_interval = commit_interval;
325 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 328 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
326 journal->j_flags |= JFS_BARRIER; 329 journal->j_flags |= JFS_BARRIER;
327 else 330 else
@@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
337 struct ocfs2_dinode *di = NULL; 340 struct ocfs2_dinode *di = NULL;
338 struct buffer_head *bh = NULL; 341 struct buffer_head *bh = NULL;
339 struct ocfs2_super *osb; 342 struct ocfs2_super *osb;
340 int meta_lock = 0; 343 int inode_lock = 0;
341 344
342 mlog_entry_void(); 345 mlog_entry_void();
343 346
@@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
367 /* Skip recovery waits here - journal inode metadata never 370 /* Skip recovery waits here - journal inode metadata never
368 * changes in a live cluster so it can be considered an 371 * changes in a live cluster so it can be considered an
369 * exception to the rule. */ 372 * exception to the rule. */
370 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 373 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
371 if (status < 0) { 374 if (status < 0) {
372 if (status != -ERESTARTSYS) 375 if (status != -ERESTARTSYS)
373 mlog(ML_ERROR, "Could not get lock on journal!\n"); 376 mlog(ML_ERROR, "Could not get lock on journal!\n");
374 goto done; 377 goto done;
375 } 378 }
376 379
377 meta_lock = 1; 380 inode_lock = 1;
378 di = (struct ocfs2_dinode *)bh->b_data; 381 di = (struct ocfs2_dinode *)bh->b_data;
379 382
380 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 383 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
414 status = 0; 417 status = 0;
415done: 418done:
416 if (status < 0) { 419 if (status < 0) {
417 if (meta_lock) 420 if (inode_lock)
418 ocfs2_meta_unlock(inode, 1); 421 ocfs2_inode_unlock(inode, 1);
419 if (bh != NULL) 422 if (bh != NULL)
420 brelse(bh); 423 brelse(bh);
421 if (inode) { 424 if (inode) {
@@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
544 OCFS2_I(inode)->ip_open_count--; 547 OCFS2_I(inode)->ip_open_count--;
545 548
546 /* unlock our journal */ 549 /* unlock our journal */
547 ocfs2_meta_unlock(inode, 1); 550 ocfs2_inode_unlock(inode, 1);
548 551
549 brelse(journal->j_bh); 552 brelse(journal->j_bh);
550 journal->j_bh = NULL; 553 journal->j_bh = NULL;
@@ -883,8 +886,8 @@ restart:
883 ocfs2_super_unlock(osb, 1); 886 ocfs2_super_unlock(osb, 1);
884 887
885 /* We always run recovery on our own orphan dir - the dead 888 /* We always run recovery on our own orphan dir - the dead
886 * node(s) may have voted "no" on an inode delete earlier. A 889 * node(s) may have disallowd a previos inode delete. Re-processing
887 * revote is therefore required. */ 890 * is therefore required. */
888 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 891 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
889 NULL); 892 NULL);
890 893
@@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
973 } 976 }
974 SET_INODE_JOURNAL(inode); 977 SET_INODE_JOURNAL(inode);
975 978
976 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 979 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
977 if (status < 0) { 980 if (status < 0) {
978 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 981 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
979 if (status != -ERESTARTSYS) 982 if (status != -ERESTARTSYS)
980 mlog(ML_ERROR, "Could not lock journal!\n"); 983 mlog(ML_ERROR, "Could not lock journal!\n");
981 goto done; 984 goto done;
@@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1047done: 1050done:
1048 /* drop the lock on this nodes journal */ 1051 /* drop the lock on this nodes journal */
1049 if (got_lock) 1052 if (got_lock)
1050 ocfs2_meta_unlock(inode, 1); 1053 ocfs2_inode_unlock(inode, 1);
1051 1054
1052 if (inode) 1055 if (inode)
1053 iput(inode); 1056 iput(inode);
@@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1162 SET_INODE_JOURNAL(inode); 1165 SET_INODE_JOURNAL(inode);
1163 1166
1164 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1167 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1165 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1168 status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
1166 if (status < 0) { 1169 if (status < 0) {
1167 if (status != -EAGAIN) 1170 if (status != -EAGAIN)
1168 mlog_errno(status); 1171 mlog_errno(status);
1169 goto bail; 1172 goto bail;
1170 } 1173 }
1171 1174
1172 ocfs2_meta_unlock(inode, 1); 1175 ocfs2_inode_unlock(inode, 1);
1173bail: 1176bail:
1174 if (inode) 1177 if (inode)
1175 iput(inode); 1178 iput(inode);
@@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1241 1244
1242 /* Skip bad inodes so that recovery can continue */ 1245 /* Skip bad inodes so that recovery can continue */
1243 iter = ocfs2_iget(p->osb, ino, 1246 iter = ocfs2_iget(p->osb, ino,
1244 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1247 OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
1245 if (IS_ERR(iter)) 1248 if (IS_ERR(iter))
1246 return 0; 1249 return 0;
1247 1250
@@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1277 } 1280 }
1278 1281
1279 mutex_lock(&orphan_dir_inode->i_mutex); 1282 mutex_lock(&orphan_dir_inode->i_mutex);
1280 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1283 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
1281 if (status < 0) { 1284 if (status < 0) {
1282 mlog_errno(status); 1285 mlog_errno(status);
1283 goto out; 1286 goto out;
@@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1293 *head = priv.head; 1296 *head = priv.head;
1294 1297
1295out_cluster: 1298out_cluster:
1296 ocfs2_meta_unlock(orphan_dir_inode, 0); 1299 ocfs2_inode_unlock(orphan_dir_inode, 0);
1297out: 1300out:
1298 mutex_unlock(&orphan_dir_inode->i_mutex); 1301 mutex_unlock(&orphan_dir_inode->i_mutex);
1299 iput(orphan_dir_inode); 1302 iput(orphan_dir_inode);
@@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1380 iter = oi->ip_next_orphan; 1383 iter = oi->ip_next_orphan;
1381 1384
1382 spin_lock(&oi->ip_lock); 1385 spin_lock(&oi->ip_lock);
1383 /* Delete voting may have set these on the assumption 1386 /* The remote delete code may have set these on the
1384 * that the other node would wipe them successfully. 1387 * assumption that the other node would wipe them
1385 * If they are still in the node's orphan dir, we need 1388 * successfully. If they are still in the node's
1386 * to reset that state. */ 1389 * orphan dir, we need to reset that state. */
1387 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1390 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1388 1391
1389 /* Set the proper information to get us going into 1392 /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
278/* simple file updates like chmod, etc. */ 278/* simple file updates like chmod, etc. */
279#define OCFS2_INODE_UPDATE_CREDITS 1 279#define OCFS2_INODE_UPDATE_CREDITS 1
280 280
281/* group extend. inode update and last group update. */
282#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
283
284/* group add. inode update and the new group update. */
285#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
286
281/* get one bit out of a suballocator: dinode + group descriptor + 287/* get one bit out of a suballocator: dinode + group descriptor +
282 * prev. group desc. if we relink. */ 288 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 289#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af36..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
77 77
78/*
79 * Determine how large our local alloc window should be, in bits.
80 *
81 * These values (and the behavior in ocfs2_alloc_should_use_local) have
82 * been chosen so that most allocations, including new block groups go
83 * through local alloc.
84 */
85static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
86{ 79{
87 BUG_ON(osb->s_clustersize_bits < 12); 80 BUG_ON(osb->s_clustersize_bits > 20);
88 81
89 return 2048 >> (osb->s_clustersize_bits - 12); 82 /* Size local alloc windows by the megabyte */
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
90} 84}
91 85
92/* 86/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
96int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
97{ 91{
98 int la_bits = ocfs2_local_alloc_window_bits(osb); 92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0;
99 94
100 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 95 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
101 return 0; 96 goto bail;
102 97
103 /* la_bits should be at least twice the size (in clusters) of 98 /* la_bits should be at least twice the size (in clusters) of
104 * a new block group. We want to be sure block group 99 * a new block group. We want to be sure block group
105 * allocations go through the local alloc, so allow an 100 * allocations go through the local alloc, so allow an
106 * allocation to take up to half the bitmap. */ 101 * allocation to take up to half the bitmap. */
107 if (bits > (la_bits / 2)) 102 if (bits > (la_bits / 2))
108 return 0; 103 goto bail;
109 104
110 return 1; 105 ret = 1;
106bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
109 return ret;
111} 110}
112 111
113int ocfs2_load_local_alloc(struct ocfs2_super *osb) 112int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
121 120
122 mlog_entry_void(); 121 mlog_entry_void();
123 122
123 if (ocfs2_mount_local(osb))
124 goto bail;
125
126 if (osb->local_alloc_size == 0)
127 goto bail;
128
129 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
130 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
131 "than max possible %u. Using defaults.\n",
132 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
133 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
134 }
135
124 /* read the alloc off disk */ 136 /* read the alloc off disk */
125 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 137 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
126 osb->slot_num); 138 osb->slot_num);
@@ -181,6 +193,9 @@ bail:
181 if (inode) 193 if (inode)
182 iput(inode); 194 iput(inode);
183 195
196 mlog(0, "Local alloc window bits = %d\n",
197 ocfs2_local_alloc_window_bits(osb));
198
184 mlog_exit(status); 199 mlog_exit(status);
185 return status; 200 return status;
186} 201}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
231 246
232 mutex_lock(&main_bm_inode->i_mutex); 247 mutex_lock(&main_bm_inode->i_mutex);
233 248
234 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 249 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
235 if (status < 0) { 250 if (status < 0) {
236 mlog_errno(status); 251 mlog_errno(status);
237 goto out_mutex; 252 goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
286 if (main_bm_bh) 301 if (main_bm_bh)
287 brelse(main_bm_bh); 302 brelse(main_bm_bh);
288 303
289 ocfs2_meta_unlock(main_bm_inode, 1); 304 ocfs2_inode_unlock(main_bm_inode, 1);
290 305
291out_mutex: 306out_mutex:
292 mutex_unlock(&main_bm_inode->i_mutex); 307 mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
399 414
400 mutex_lock(&main_bm_inode->i_mutex); 415 mutex_lock(&main_bm_inode->i_mutex);
401 416
402 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 417 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
403 if (status < 0) { 418 if (status < 0) {
404 mlog_errno(status); 419 mlog_errno(status);
405 goto out_mutex; 420 goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
424 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
425 440
426out_unlock: 441out_unlock:
427 ocfs2_meta_unlock(main_bm_inode, 1); 442 ocfs2_inode_unlock(main_bm_inode, 1);
428 443
429out_mutex: 444out_mutex:
430 mutex_unlock(&main_bm_inode->i_mutex); 445 mutex_unlock(&main_bm_inode->i_mutex);
@@ -521,6 +536,9 @@ bail:
521 iput(local_alloc_inode); 536 iput(local_alloc_inode);
522 } 537 }
523 538
539 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
540 status);
541
524 mlog_exit(status); 542 mlog_exit(status);
525 return status; 543 return status;
526} 544}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * locks.c
5 *
6 * Userspace file locking support
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27
28#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "dlmglue.h"
34#include "file.h"
35#include "locks.h"
36
37static int ocfs2_do_flock(struct file *file, struct inode *inode,
38 int cmd, struct file_lock *fl)
39{
40 int ret = 0, level = 0, trylock = 0;
41 struct ocfs2_file_private *fp = file->private_data;
42 struct ocfs2_lock_res *lockres = &fp->fp_flock;
43
44 if (fl->fl_type == F_WRLCK)
45 level = 1;
46 if (!IS_SETLKW(cmd))
47 trylock = 1;
48
49 mutex_lock(&fp->fp_mutex);
50
51 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
52 lockres->l_level > LKM_NLMODE) {
53 int old_level = 0;
54
55 if (lockres->l_level == LKM_EXMODE)
56 old_level = 1;
57
58 if (level == old_level)
59 goto out;
60
61 /*
62 * Converting an existing lock is not guaranteed to be
63 * atomic, so we can get away with simply unlocking
64 * here and allowing the lock code to try at the new
65 * level.
66 */
67
68 flock_lock_file_wait(file,
69 &(struct file_lock){.fl_type = F_UNLCK});
70
71 ocfs2_file_unlock(file);
72 }
73
74 ret = ocfs2_file_lock(file, level, trylock);
75 if (ret) {
76 if (ret == -EAGAIN && trylock)
77 ret = -EWOULDBLOCK;
78 else
79 mlog_errno(ret);
80 goto out;
81 }
82
83 ret = flock_lock_file_wait(file, fl);
84
85out:
86 mutex_unlock(&fp->fp_mutex);
87
88 return ret;
89}
90
91static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
92{
93 int ret;
94 struct ocfs2_file_private *fp = file->private_data;
95
96 mutex_lock(&fp->fp_mutex);
97 ocfs2_file_unlock(file);
98 ret = flock_lock_file_wait(file, fl);
99 mutex_unlock(&fp->fp_mutex);
100
101 return ret;
102}
103
104/*
105 * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
106 */
107int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
108{
109 struct inode *inode = file->f_mapping->host;
110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
111
112 if (!(fl->fl_flags & FL_FLOCK))
113 return -ENOLCK;
114 if (__mandatory_lock(inode))
115 return -ENOLCK;
116
117 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
118 ocfs2_mount_local(osb))
119 return flock_lock_file_wait(file, fl);
120
121 if (fl->fl_type == F_UNLCK)
122 return ocfs2_do_funlock(file, cmd, fl);
123 else
124 return ocfs2_do_flock(file, inode, cmd, fl);
125}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
1/* -*- mode: c; c-basic-offset: 8; -*- 1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * vote.h 4 * locks.h
5 * 5 *
6 * description here 6 * Function prototypes for Userspace file locking support
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
@@ -23,26 +23,9 @@
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26#ifndef OCFS2_LOCKS_H
27#define OCFS2_LOCKS_H
26 28
27#ifndef VOTE_H 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
28#define VOTE_H
29 30
30int ocfs2_vote_thread(void *arg); 31#endif /* OCFS2_LOCKS_H */
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
45
46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
47 int node_num);
48#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
168 * node. Taking the data lock will also ensure that we don't 168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert. 169 * attempt page truncation as part of a downconvert.
170 */ 170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1); 171 ret = ocfs2_inode_lock(inode, &di_bh, 1);
172 if (ret < 0) { 172 if (ret < 0) {
173 mlog_errno(ret); 173 mlog_errno(ret);
174 goto out; 174 goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
181 */ 181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem); 182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183 183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 184 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191 185
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem); 186 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196 187
197 brelse(di_bh); 188 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1); 189 ocfs2_inode_unlock(inode, 1);
199 190
200out: 191out:
201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 192 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
214{ 205{
215 int ret = 0, lock_level = 0; 206 int ret = 0, lock_level = 0;
216 207
217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 208 ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
218 file->f_vfsmnt, &lock_level); 209 file->f_vfsmnt, &lock_level);
219 if (ret < 0) { 210 if (ret < 0) {
220 mlog_errno(ret); 211 mlog_errno(ret);
221 goto out; 212 goto out;
222 } 213 }
223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 214 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
224out: 215out:
225 vma->vm_ops = &ocfs2_file_vm_ops; 216 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR; 217 vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac2718587..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "vote.h"
64 63
65#include "buffer_head_io.h" 64#include "buffer_head_io.h"
66 65
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
116 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 115 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
117 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 116 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
118 117
119 status = ocfs2_meta_lock(dir, NULL, 0); 118 status = ocfs2_inode_lock(dir, NULL, 0);
120 if (status < 0) { 119 if (status < 0) {
121 if (status != -ENOENT) 120 if (status != -ENOENT)
122 mlog_errno(status); 121 mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
129 if (status < 0) 128 if (status < 0)
130 goto bail_add; 129 goto bail_add;
131 130
132 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 131 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
133 if (IS_ERR(inode)) { 132 if (IS_ERR(inode)) {
134 ret = ERR_PTR(-EACCES); 133 ret = ERR_PTR(-EACCES);
135 goto bail_unlock; 134 goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
176 /* Don't drop the cluster lock until *after* the d_add -- 175 /* Don't drop the cluster lock until *after* the d_add --
177 * unlink on another node will message us to remove that 176 * unlink on another node will message us to remove that
178 * dentry under this lock so otherwise we can race this with 177 * dentry under this lock so otherwise we can race this with
179 * the vote thread and have a stale dentry. */ 178 * the downconvert thread and have a stale dentry. */
180 ocfs2_meta_unlock(dir, 0); 179 ocfs2_inode_unlock(dir, 0);
181 180
182bail: 181bail:
183 182
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
209 /* get our super block */ 208 /* get our super block */
210 osb = OCFS2_SB(dir->i_sb); 209 osb = OCFS2_SB(dir->i_sb);
211 210
212 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 211 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
213 if (status < 0) { 212 if (status < 0) {
214 if (status != -ENOENT) 213 if (status != -ENOENT)
215 mlog_errno(status); 214 mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
323 if (handle) 322 if (handle)
324 ocfs2_commit_trans(osb, handle); 323 ocfs2_commit_trans(osb, handle);
325 324
326 ocfs2_meta_unlock(dir, 1); 325 ocfs2_inode_unlock(dir, 1);
327 326
328 if (status == -ENOSPC) 327 if (status == -ENOSPC)
329 mlog(0, "Disk is full\n"); 328 mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
553 if (S_ISDIR(inode->i_mode)) 552 if (S_ISDIR(inode->i_mode))
554 return -EPERM; 553 return -EPERM;
555 554
556 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 555 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
557 if (err < 0) { 556 if (err < 0) {
558 if (err != -ENOENT) 557 if (err != -ENOENT)
559 mlog_errno(err); 558 mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
578 goto out; 577 goto out;
579 } 578 }
580 579
581 err = ocfs2_meta_lock(inode, &fe_bh, 1); 580 err = ocfs2_inode_lock(inode, &fe_bh, 1);
582 if (err < 0) { 581 if (err < 0) {
583 if (err != -ENOENT) 582 if (err != -ENOENT)
584 mlog_errno(err); 583 mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
643out_commit: 642out_commit:
644 ocfs2_commit_trans(osb, handle); 643 ocfs2_commit_trans(osb, handle);
645out_unlock_inode: 644out_unlock_inode:
646 ocfs2_meta_unlock(inode, 1); 645 ocfs2_inode_unlock(inode, 1);
647 646
648out: 647out:
649 ocfs2_meta_unlock(dir, 1); 648 ocfs2_inode_unlock(dir, 1);
650 649
651 if (de_bh) 650 if (de_bh)
652 brelse(de_bh); 651 brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
720 return -EPERM; 719 return -EPERM;
721 } 720 }
722 721
723 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 722 status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
724 if (status < 0) { 723 if (status < 0) {
725 if (status != -ENOENT) 724 if (status != -ENOENT)
726 mlog_errno(status); 725 mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
745 goto leave; 744 goto leave;
746 } 745 }
747 746
748 status = ocfs2_meta_lock(inode, &fe_bh, 1); 747 status = ocfs2_inode_lock(inode, &fe_bh, 1);
749 if (status < 0) { 748 if (status < 0) {
750 if (status != -ENOENT) 749 if (status != -ENOENT)
751 mlog_errno(status); 750 mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
765 764
766 status = ocfs2_remote_dentry_delete(dentry); 765 status = ocfs2_remote_dentry_delete(dentry);
767 if (status < 0) { 766 if (status < 0) {
768 /* This vote should succeed under all normal 767 /* This remote delete should succeed under all normal
769 * circumstances. */ 768 * circumstances. */
770 mlog_errno(status); 769 mlog_errno(status);
771 goto leave; 770 goto leave;
@@ -841,13 +840,13 @@ leave:
841 ocfs2_commit_trans(osb, handle); 840 ocfs2_commit_trans(osb, handle);
842 841
843 if (child_locked) 842 if (child_locked)
844 ocfs2_meta_unlock(inode, 1); 843 ocfs2_inode_unlock(inode, 1);
845 844
846 ocfs2_meta_unlock(dir, 1); 845 ocfs2_inode_unlock(dir, 1);
847 846
848 if (orphan_dir) { 847 if (orphan_dir) {
849 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 848 /* This was locked for us in ocfs2_prepare_orphan_dir() */
850 ocfs2_meta_unlock(orphan_dir, 1); 849 ocfs2_inode_unlock(orphan_dir, 1);
851 mutex_unlock(&orphan_dir->i_mutex); 850 mutex_unlock(&orphan_dir->i_mutex);
852 iput(orphan_dir); 851 iput(orphan_dir);
853 } 852 }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
908 inode1 = tmpinode; 907 inode1 = tmpinode;
909 } 908 }
910 /* lock id2 */ 909 /* lock id2 */
911 status = ocfs2_meta_lock(inode2, bh2, 1); 910 status = ocfs2_inode_lock(inode2, bh2, 1);
912 if (status < 0) { 911 if (status < 0) {
913 if (status != -ENOENT) 912 if (status != -ENOENT)
914 mlog_errno(status); 913 mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
917 } 916 }
918 917
919 /* lock id1 */ 918 /* lock id1 */
920 status = ocfs2_meta_lock(inode1, bh1, 1); 919 status = ocfs2_inode_lock(inode1, bh1, 1);
921 if (status < 0) { 920 if (status < 0) {
922 /* 921 /*
923 * An error return must mean that no cluster locks 922 * An error return must mean that no cluster locks
924 * were held on function exit. 923 * were held on function exit.
925 */ 924 */
926 if (oi1->ip_blkno != oi2->ip_blkno) 925 if (oi1->ip_blkno != oi2->ip_blkno)
927 ocfs2_meta_unlock(inode2, 1); 926 ocfs2_inode_unlock(inode2, 1);
928 927
929 if (status != -ENOENT) 928 if (status != -ENOENT)
930 mlog_errno(status); 929 mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
937 936
938static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) 937static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
939{ 938{
940 ocfs2_meta_unlock(inode1, 1); 939 ocfs2_inode_unlock(inode1, 1);
941 940
942 if (inode1 != inode2) 941 if (inode1 != inode2)
943 ocfs2_meta_unlock(inode2, 1); 942 ocfs2_inode_unlock(inode2, 1);
944} 943}
945 944
946static int ocfs2_rename(struct inode *old_dir, 945static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
1031 1030
1032 /* 1031 /*
1033 * Aside from allowing a meta data update, the locking here 1032 * Aside from allowing a meta data update, the locking here
1034 * also ensures that the vote thread on other nodes won't have 1033 * also ensures that the downconvert thread on other nodes
1035 * to concurrently downconvert the inode and the dentry locks. 1034 * won't have to concurrently downconvert the inode and the
1035 * dentry locks.
1036 */ 1036 */
1037 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1037 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
1038 if (status < 0) { 1038 if (status < 0) {
1039 if (status != -ENOENT) 1039 if (status != -ENOENT)
1040 mlog_errno(status); 1040 mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
1143 goto bail; 1143 goto bail;
1144 } 1144 }
1145 1145
1146 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1146 status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
1147 if (status < 0) { 1147 if (status < 0) {
1148 if (status != -ENOENT) 1148 if (status != -ENOENT)
1149 mlog_errno(status); 1149 mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
1355 ocfs2_double_unlock(old_dir, new_dir); 1355 ocfs2_double_unlock(old_dir, new_dir);
1356 1356
1357 if (old_child_locked) 1357 if (old_child_locked)
1358 ocfs2_meta_unlock(old_inode, 1); 1358 ocfs2_inode_unlock(old_inode, 1);
1359 1359
1360 if (new_child_locked) 1360 if (new_child_locked)
1361 ocfs2_meta_unlock(new_inode, 1); 1361 ocfs2_inode_unlock(new_inode, 1);
1362 1362
1363 if (orphan_dir) { 1363 if (orphan_dir) {
1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1365 ocfs2_meta_unlock(orphan_dir, 1); 1365 ocfs2_inode_unlock(orphan_dir, 1);
1366 mutex_unlock(&orphan_dir->i_mutex); 1366 mutex_unlock(&orphan_dir->i_mutex);
1367 iput(orphan_dir); 1367 iput(orphan_dir);
1368 } 1368 }
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
1530 credits = ocfs2_calc_symlink_credits(sb); 1530 credits = ocfs2_calc_symlink_credits(sb);
1531 1531
1532 /* lock the parent directory */ 1532 /* lock the parent directory */
1533 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1533 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
1534 if (status < 0) { 1534 if (status < 0) {
1535 if (status != -ENOENT) 1535 if (status != -ENOENT)
1536 mlog_errno(status); 1536 mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
1657 if (handle) 1657 if (handle)
1658 ocfs2_commit_trans(osb, handle); 1658 ocfs2_commit_trans(osb, handle);
1659 1659
1660 ocfs2_meta_unlock(dir, 1); 1660 ocfs2_inode_unlock(dir, 1);
1661 1661
1662 if (new_fe_bh) 1662 if (new_fe_bh)
1663 brelse(new_fe_bh); 1663 brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1735 1735
1736 mutex_lock(&orphan_dir_inode->i_mutex); 1736 mutex_lock(&orphan_dir_inode->i_mutex);
1737 1737
1738 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1738 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1739 if (status < 0) { 1739 if (status < 0) {
1740 mlog_errno(status); 1740 mlog_errno(status);
1741 goto leave; 1741 goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1745 orphan_dir_bh, name, 1745 orphan_dir_bh, name,
1746 OCFS2_ORPHAN_NAMELEN, de_bh); 1746 OCFS2_ORPHAN_NAMELEN, de_bh);
1747 if (status < 0) { 1747 if (status < 0) {
1748 ocfs2_meta_unlock(orphan_dir_inode, 1); 1748 ocfs2_inode_unlock(orphan_dir_inode, 1);
1749 1749
1750 mlog_errno(status); 1750 mlog_errno(status);
1751 goto leave; 1751 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
101 * about to be 101 * about to be
102 * dropped. */ 102 * dropped. */
103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
104#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
104 105
105struct ocfs2_lock_res_ops; 106struct ocfs2_lock_res_ops;
106 107
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
170 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 171 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
171 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 172 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
172 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 173 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
174 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
173}; 175};
174 176
175#define OCFS2_OSB_SOFT_RO 0x0001 177#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
189 struct ocfs2_slot_info *slot_info; 191 struct ocfs2_slot_info *slot_info;
190 192
191 spinlock_t node_map_lock; 193 spinlock_t node_map_lock;
192 struct ocfs2_node_map mounted_map;
193 struct ocfs2_node_map recovery_map; 194 struct ocfs2_node_map recovery_map;
194 struct ocfs2_node_map umount_map;
195 195
196 u64 root_blkno; 196 u64 root_blkno;
197 u64 system_dir_blkno; 197 u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
231 wait_queue_head_t checkpoint_event; 231 wait_queue_head_t checkpoint_event;
232 atomic_t needs_checkpoint; 232 atomic_t needs_checkpoint;
233 struct ocfs2_journal *journal; 233 struct ocfs2_journal *journal;
234 unsigned long osb_commit_interval;
234 235
236 int local_alloc_size;
235 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
236 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
237 u64 la_last_gd; 239 u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
254 256
255 wait_queue_head_t recovery_event; 257 wait_queue_head_t recovery_event;
256 258
257 spinlock_t vote_task_lock; 259 spinlock_t dc_task_lock;
258 struct task_struct *vote_task; 260 struct task_struct *dc_task;
259 wait_queue_head_t vote_event; 261 wait_queue_head_t dc_event;
260 unsigned long vote_wake_sequence; 262 unsigned long dc_wake_sequence;
261 unsigned long vote_work_sequence; 263 unsigned long dc_work_sequence;
262 264
265 /*
266 * Any thread can add locks to the list, but the downconvert
267 * thread is the only one allowed to remove locks. Any change
268 * to this rule requires updating
269 * ocfs2_downconvert_thread_do_work().
270 */
263 struct list_head blocked_lock_list; 271 struct list_head blocked_lock_list;
264 unsigned long blocked_lock_count; 272 unsigned long blocked_lock_count;
265 273
266 struct list_head vote_list;
267 int vote_count;
268
269 u32 net_key;
270 spinlock_t net_response_lock;
271 unsigned int net_response_ids;
272 struct list_head net_response_list;
273
274 struct o2hb_callback_func osb_hb_up;
275 struct o2hb_callback_func osb_hb_down;
276
277 struct list_head osb_net_handlers;
278
279 wait_queue_head_t osb_mount_event; 274 wait_queue_head_t osb_mount_event;
280 275
281 /* Truncate log info */ 276 /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) 231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) 232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
233 233
234/* Used to pass group descriptor data when online resize is done */
235struct ocfs2_new_group_input {
236 __u64 group; /* Group descriptor's blkno. */
237 __u32 clusters; /* Total number of clusters in this group */
238 __u32 frees; /* Total free clusters in this group */
239 __u16 chain; /* Chain for this group */
240 __u16 reserved1;
241 __u32 reserved2;
242};
243
244#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
245#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
246#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
247
234/* 248/*
235 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 249 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
236 */ 250 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
256/* Journal limits (in bytes) */ 270/* Journal limits (in bytes) */
257#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
258 272
273/*
274 * Default local alloc size (in megabytes)
275 *
276 * The value chosen should be such that most allocations, including new
277 * block groups, use local alloc.
278 */
279#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
280
259struct ocfs2_system_inode_info { 281struct ocfs2_system_inode_info {
260 char *si_name; 282 char *si_name;
261 int si_iflags; 283 int si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK,
48 OCFS2_NUM_LOCK_TYPES 49 OCFS2_NUM_LOCK_TYPES
49}; 50};
50 51
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
73 case OCFS2_LOCK_TYPE_OPEN: 74 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O'; 75 c = 'O';
75 break; 76 break;
77 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F';
79 break;
76 default: 80 default:
77 c = '\0'; 81 c = '\0';
78 } 82 }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
90 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 94 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open", 96 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
93}; 98};
94 99
95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.c
5 *
6 * volume resize.
7 * Inspired by ext3/resize.c.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "dlmglue.h"
37#include "inode.h"
38#include "journal.h"
39#include "super.h"
40#include "sysfile.h"
41#include "uptodate.h"
42
43#include "buffer_head_io.h"
44#include "suballoc.h"
45#include "resize.h"
46
47/*
48 * Check whether there are new backup superblocks exist
49 * in the last group. If there are some, mark them or clear
50 * them in the bitmap.
51 *
52 * Return how many backups we find in the last group.
53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg,
59 int set)
60{
61 int i;
62 u16 backups = 0;
63 u32 cluster;
64 u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
65
66 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
67 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
68 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
69
70 gd_blkno = ocfs2_which_cluster_group(inode, cluster);
71 if (gd_blkno < lgd_blkno)
72 continue;
73 else if (gd_blkno > lgd_blkno)
74 break;
75
76 if (set)
77 ocfs2_set_bit(cluster % cl_cpg,
78 (unsigned long *)gd->bg_bitmap);
79 else
80 ocfs2_clear_bit(cluster % cl_cpg,
81 (unsigned long *)gd->bg_bitmap);
82 backups++;
83 }
84
85 mlog_exit_void();
86 return backups;
87}
88
89static int ocfs2_update_last_group_and_inode(handle_t *handle,
90 struct inode *bm_inode,
91 struct buffer_head *bm_bh,
92 struct buffer_head *group_bh,
93 u32 first_new_cluster,
94 int new_clusters)
95{
96 int ret = 0;
97 struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
98 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
99 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
100 struct ocfs2_chain_rec *cr;
101 struct ocfs2_group_desc *group;
102 u16 chain, num_bits, backups = 0;
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster);
108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) {
112 mlog_errno(ret);
113 goto out;
114 }
115
116 group = (struct ocfs2_group_desc *)group_bh->b_data;
117
118 /* update the group first. */
119 num_bits = new_clusters * cl_bpc;
120 le16_add_cpu(&group->bg_bits, num_bits);
121 le16_add_cpu(&group->bg_free_bits_count, num_bits);
122
123 /*
124 * check whether there are some new backup superblocks exist in
125 * this group and update the group bitmap accordingly.
126 */
127 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
128 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
129 backups = ocfs2_calc_new_backup_super(bm_inode,
130 group,
131 new_clusters,
132 first_new_cluster,
133 cl_cpg, 1);
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 }
136
137 ret = ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142
143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out_rollback;
149 }
150
151 chain = le16_to_cpu(group->bg_chain);
152 cr = (&cl->cl_recs[chain]);
153 le32_add_cpu(&cr->c_total, num_bits);
154 le32_add_cpu(&cr->c_free, num_bits);
155 le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
156 le32_add_cpu(&fe->i_clusters, new_clusters);
157
158 if (backups) {
159 le32_add_cpu(&cr->c_free, -1 * backups);
160 le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
161 }
162
163 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
164 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
165 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
166 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
167 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
168
169 ocfs2_journal_dirty(handle, bm_bh);
170
171out_rollback:
172 if (ret < 0) {
173 ocfs2_calc_new_backup_super(bm_inode,
174 group,
175 new_clusters,
176 first_new_cluster,
177 cl_cpg, 0);
178 le16_add_cpu(&group->bg_free_bits_count, backups);
179 le16_add_cpu(&group->bg_bits, -1 * num_bits);
180 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
181 }
182out:
183 mlog_exit(ret);
184 return ret;
185}
186
187static int update_backups(struct inode * inode, u32 clusters, char *data)
188{
189 int i, ret = 0;
190 u32 cluster;
191 u64 blkno;
192 struct buffer_head *backup = NULL;
193 struct ocfs2_dinode *backup_di = NULL;
194 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
195
196 /* calculate the real backups we need to update. */
197 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
198 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
199 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
200 if (cluster > clusters)
201 break;
202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
204 if (ret < 0) {
205 mlog_errno(ret);
206 break;
207 }
208
209 memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
210
211 backup_di = (struct ocfs2_dinode *)backup->b_data;
212 backup_di->i_blkno = cpu_to_le64(blkno);
213
214 ret = ocfs2_write_super_or_backup(osb, backup);
215 brelse(backup);
216 backup = NULL;
217 if (ret < 0) {
218 mlog_errno(ret);
219 break;
220 }
221 }
222
223 return ret;
224}
225
226static void ocfs2_update_super_and_backups(struct inode *inode,
227 int new_clusters)
228{
229 int ret;
230 u32 clusters = 0;
231 struct buffer_head *super_bh = NULL;
232 struct ocfs2_dinode *super_di = NULL;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234
235 /*
236 * update the superblock last.
237 * It doesn't matter if the write failed.
238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
240 &super_bh, 0, NULL);
241 if (ret < 0) {
242 mlog_errno(ret);
243 goto out;
244 }
245
246 super_di = (struct ocfs2_dinode *)super_bh->b_data;
247 le32_add_cpu(&super_di->i_clusters, new_clusters);
248 clusters = le32_to_cpu(super_di->i_clusters);
249
250 ret = ocfs2_write_super_or_backup(osb, super_bh);
251 if (ret < 0) {
252 mlog_errno(ret);
253 goto out;
254 }
255
256 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
257 ret = update_backups(inode, clusters, super_bh->b_data);
258
259out:
260 brelse(super_bh);
261 if (ret)
262 printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
263 " during fs resize. This condition is not fatal,"
264 " but fsck.ocfs2 should be run to fix it\n",
265 osb->dev_str);
266 return;
267}
268
269/*
270 * Extend the filesystem to the new number of clusters specified. This entry
271 * point is only used to extend the current filesystem to the end of the last
272 * existing group.
273 */
274int ocfs2_group_extend(struct inode * inode, int new_clusters)
275{
276 int ret;
277 handle_t *handle;
278 struct buffer_head *main_bm_bh = NULL;
279 struct buffer_head *group_bh = NULL;
280 struct inode *main_bm_inode = NULL;
281 struct ocfs2_dinode *fe = NULL;
282 struct ocfs2_group_desc *group = NULL;
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 u16 cl_bpc;
285 u32 first_new_cluster;
286 u64 lgd_blkno;
287
288 mlog_entry_void();
289
290 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
291 return -EROFS;
292
293 if (new_clusters < 0)
294 return -EINVAL;
295 else if (new_clusters == 0)
296 return 0;
297
298 main_bm_inode = ocfs2_get_system_file_inode(osb,
299 GLOBAL_BITMAP_SYSTEM_INODE,
300 OCFS2_INVALID_SLOT);
301 if (!main_bm_inode) {
302 ret = -EINVAL;
303 mlog_errno(ret);
304 goto out;
305 }
306
307 mutex_lock(&main_bm_inode->i_mutex);
308
309 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
310 if (ret < 0) {
311 mlog_errno(ret);
312 goto out_mutex;
313 }
314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. "
320 "Force to do offline resize.");
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1);
334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
336 main_bm_inode);
337 if (ret < 0) {
338 mlog_errno(ret);
339 goto out_unlock;
340 }
341
342 group = (struct ocfs2_group_desc *)group_bh->b_data;
343
344 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
345 if (ret) {
346 mlog_errno(ret);
347 goto out_unlock;
348 }
349
350 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
351 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
352 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
353 ret = -EINVAL;
354 goto out_unlock;
355 }
356
357 mlog(0, "extend the last group at %llu, new clusters = %d\n",
358 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
359
360 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
361 if (IS_ERR(handle)) {
362 mlog_errno(PTR_ERR(handle));
363 ret = -EINVAL;
364 goto out_unlock;
365 }
366
367 /* update the last group descriptor and inode. */
368 ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
369 main_bm_bh, group_bh,
370 first_new_cluster,
371 new_clusters);
372 if (ret) {
373 mlog_errno(ret);
374 goto out_commit;
375 }
376
377 ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
378
379out_commit:
380 ocfs2_commit_trans(osb, handle);
381out_unlock:
382 brelse(group_bh);
383 brelse(main_bm_bh);
384
385 ocfs2_inode_unlock(main_bm_inode, 1);
386
387out_mutex:
388 mutex_unlock(&main_bm_inode->i_mutex);
389 iput(main_bm_inode);
390
391out:
392 mlog_exit_void();
393 return ret;
394}
395
396static int ocfs2_check_new_group(struct inode *inode,
397 struct ocfs2_dinode *di,
398 struct ocfs2_new_group_input *input,
399 struct buffer_head *group_bh)
400{
401 int ret;
402 struct ocfs2_group_desc *gd;
403 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
404 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
405 le16_to_cpu(di->id2.i_chain.cl_bpc);
406
407
408 gd = (struct ocfs2_group_desc *)group_bh->b_data;
409
410 ret = -EIO;
411 if (!OCFS2_IS_VALID_GROUP_DESC(gd))
412 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
413 (unsigned long long)le64_to_cpu(gd->bg_blkno));
414 else if (di->i_blkno != gd->bg_parent_dinode)
415 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
416 "pointer (%llu, expected %llu)\n",
417 (unsigned long long)le64_to_cpu(gd->bg_blkno),
418 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
419 (unsigned long long)le64_to_cpu(di->i_blkno));
420 else if (le16_to_cpu(gd->bg_bits) > max_bits)
421 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
422 (unsigned long long)le64_to_cpu(gd->bg_blkno),
423 le16_to_cpu(gd->bg_bits));
424 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
425 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
426 "claims that %u are free\n",
427 (unsigned long long)le64_to_cpu(gd->bg_blkno),
428 le16_to_cpu(gd->bg_bits),
429 le16_to_cpu(gd->bg_free_bits_count));
430 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
431 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
432 "max bitmap bits of %u\n",
433 (unsigned long long)le64_to_cpu(gd->bg_blkno),
434 le16_to_cpu(gd->bg_bits),
435 8 * le16_to_cpu(gd->bg_size));
436 else if (le16_to_cpu(gd->bg_chain) != input->chain)
437 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
438 "while input has %u set.\n",
439 (unsigned long long)le64_to_cpu(gd->bg_blkno),
440 le16_to_cpu(gd->bg_chain), input->chain);
441 else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
442 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
443 "input has %u clusters set\n",
444 (unsigned long long)le64_to_cpu(gd->bg_blkno),
445 le16_to_cpu(gd->bg_bits), input->clusters);
446 else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
447 mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
448 "but it should have %u set\n",
449 (unsigned long long)le64_to_cpu(gd->bg_blkno),
450 le16_to_cpu(gd->bg_bits),
451 input->frees * cl_bpc);
452 else
453 ret = 0;
454
455 return ret;
456}
457
458static int ocfs2_verify_group_and_input(struct inode *inode,
459 struct ocfs2_dinode *di,
460 struct ocfs2_new_group_input *input,
461 struct buffer_head *group_bh)
462{
463 u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
464 u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
465 u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
466 u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
467 u32 total_clusters = le32_to_cpu(di->i_clusters);
468 int ret = -EINVAL;
469
470 if (cluster < total_clusters)
471 mlog(ML_ERROR, "add a group which is in the current volume.\n");
472 else if (input->chain >= cl_count)
473 mlog(ML_ERROR, "input chain exceeds the limit.\n");
474 else if (next_free != cl_count && next_free != input->chain)
475 mlog(ML_ERROR,
476 "the add group should be in chain %u\n", next_free);
477 else if (total_clusters + input->clusters < total_clusters)
478 mlog(ML_ERROR, "add group's clusters overflow.\n");
479 else if (input->clusters > cl_cpg)
480 mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
481 else if (input->frees > input->clusters)
482 mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
483 else if (total_clusters % cl_cpg != 0)
484 mlog(ML_ERROR,
485 "the last group isn't full. Use group extend first.\n");
486 else if (input->group != ocfs2_which_cluster_group(inode, cluster))
487 mlog(ML_ERROR, "group blkno is invalid\n");
488 else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
489 mlog(ML_ERROR, "group descriptor check failed.\n");
490 else
491 ret = 0;
492
493 return ret;
494}
495
496/* Add a new group descriptor to global_bitmap. */
497int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
498{
499 int ret;
500 handle_t *handle;
501 struct buffer_head *main_bm_bh = NULL;
502 struct inode *main_bm_inode = NULL;
503 struct ocfs2_dinode *fe = NULL;
504 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
505 struct buffer_head *group_bh = NULL;
506 struct ocfs2_group_desc *group = NULL;
507 struct ocfs2_chain_list *cl;
508 struct ocfs2_chain_rec *cr;
509 u16 cl_bpc;
510
511 mlog_entry_void();
512
513 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
514 return -EROFS;
515
516 main_bm_inode = ocfs2_get_system_file_inode(osb,
517 GLOBAL_BITMAP_SYSTEM_INODE,
518 OCFS2_INVALID_SLOT);
519 if (!main_bm_inode) {
520 ret = -EINVAL;
521 mlog_errno(ret);
522 goto out;
523 }
524
525 mutex_lock(&main_bm_inode->i_mutex);
526
527 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
528 if (ret < 0) {
529 mlog_errno(ret);
530 goto out_mutex;
531 }
532
533 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
534
535 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
536 ocfs2_group_bitmap_size(osb->sb) * 8) {
537 mlog(ML_ERROR, "The disk is too old and small."
538 " Force to do offline resize.");
539 ret = -EINVAL;
540 goto out_unlock;
541 }
542
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
544 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group);
547 goto out_unlock;
548 }
549
550 ocfs2_set_new_buffer_uptodate(inode, group_bh);
551
552 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
553 if (ret) {
554 mlog_errno(ret);
555 goto out_unlock;
556 }
557
558 mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
559 (unsigned long long)input->group, input->chain, input->clusters);
560
561 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
562 if (IS_ERR(handle)) {
563 mlog_errno(PTR_ERR(handle));
564 ret = -EINVAL;
565 goto out_unlock;
566 }
567
568 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
569 cl = &fe->id2.i_chain;
570 cr = &cl->cl_recs[input->chain];
571
572 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
573 OCFS2_JOURNAL_ACCESS_WRITE);
574 if (ret < 0) {
575 mlog_errno(ret);
576 goto out_commit;
577 }
578
579 group = (struct ocfs2_group_desc *)group_bh->b_data;
580 group->bg_next_group = cr->c_blkno;
581
582 ret = ocfs2_journal_dirty(handle, group_bh);
583 if (ret < 0) {
584 mlog_errno(ret);
585 goto out_commit;
586 }
587
588 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
589 OCFS2_JOURNAL_ACCESS_WRITE);
590 if (ret < 0) {
591 mlog_errno(ret);
592 goto out_commit;
593 }
594
595 if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
596 le16_add_cpu(&cl->cl_next_free_rec, 1);
597 memset(cr, 0, sizeof(struct ocfs2_chain_rec));
598 }
599
600 cr->c_blkno = le64_to_cpu(input->group);
601 le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
602 le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
603
604 le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
605 le32_add_cpu(&fe->id1.bitmap1.i_used,
606 (input->clusters - input->frees) * cl_bpc);
607 le32_add_cpu(&fe->i_clusters, input->clusters);
608
609 ocfs2_journal_dirty(handle, main_bm_bh);
610
611 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
612 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
613 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
614 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
615 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
616
617 ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
618
619out_commit:
620 ocfs2_commit_trans(osb, handle);
621out_unlock:
622 brelse(group_bh);
623 brelse(main_bm_bh);
624
625 ocfs2_inode_unlock(main_bm_inode, 1);
626
627out_mutex:
628 mutex_unlock(&main_bm_inode->i_mutex);
629 iput(main_bm_inode);
630
631out:
632 mlog_exit_void();
633 return ret;
634}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_RESIZE_H
27#define OCFS2_RESIZE_H
28
29int ocfs2_group_extend(struct inode * inode, int new_clusters);
30int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
31
32#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
48 s16 slot_num, 48 s16 slot_num,
49 s16 node_num); 49 s16 node_num);
50 50
51/* Use the slot information we've collected to create a map of mounted
52 * nodes. Should be holding an EX on super block. assumes slot info is
53 * up to date. Note that we call this *after* we find a slot, so our
54 * own node should be set in the map too... */
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
56{
57 int i;
58 struct ocfs2_slot_info *si = osb->slot_info;
59
60 spin_lock(&si->si_lock);
61
62 for (i = 0; i < si->si_size; i++)
63 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
64 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
65 si->si_global_node_nums[i]);
66
67 spin_unlock(&si->si_lock);
68}
69
70/* post the slot information on disk into our slot_info struct. */ 51/* post the slot information on disk into our slot_info struct. */
71void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
72{ 53{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
52void ocfs2_clear_slot(struct ocfs2_slot_info *si, 52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num); 53 s16 slot_num);
54 54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num) 56 int slot_num)
59{ 57{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 u64 bg_blkno, 102 u64 bg_blkno,
103 u16 bg_bit_off); 103 u16 bg_bit_off);
104static inline u64 ocfs2_which_cluster_group(struct inode *inode,
105 u32 cluster);
106static inline void ocfs2_block_to_cluster_group(struct inode *inode, 104static inline void ocfs2_block_to_cluster_group(struct inode *inode,
107 u64 data_blkno, 105 u64 data_blkno,
108 u64 *bg_blkno, 106 u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
114 112
115 if (inode) { 113 if (inode) {
116 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 114 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
117 ocfs2_meta_unlock(inode, 1); 115 ocfs2_inode_unlock(inode, 1);
118 116
119 mutex_unlock(&inode->i_mutex); 117 mutex_unlock(&inode->i_mutex);
120 118
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131} 129}
132 130
133/* somewhat more expensive than our other checks, so use sparingly. */ 131/* somewhat more expensive than our other checks, so use sparingly. */
134static int ocfs2_check_group_descriptor(struct super_block *sb, 132int ocfs2_check_group_descriptor(struct super_block *sb,
135 struct ocfs2_dinode *di, 133 struct ocfs2_dinode *di,
136 struct ocfs2_group_desc *gd) 134 struct ocfs2_group_desc *gd)
137{ 135{
138 unsigned int max_bits; 136 unsigned int max_bits;
139 137
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
412 410
413 mutex_lock(&alloc_inode->i_mutex); 411 mutex_lock(&alloc_inode->i_mutex);
414 412
415 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 413 status = ocfs2_inode_lock(alloc_inode, &bh, 1);
416 if (status < 0) { 414 if (status < 0) {
417 mutex_unlock(&alloc_inode->i_mutex); 415 mutex_unlock(&alloc_inode->i_mutex);
418 iput(alloc_inode); 416 iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1443 1441
1444/* given a cluster offset, calculate which block group it belongs to 1442/* given a cluster offset, calculate which block group it belongs to
1445 * and return that block offset. */ 1443 * and return that block offset. */
1446static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1444u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1447 u32 cluster)
1448{ 1445{
1449 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1450 u32 group_no; 1447 u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1519 if (min_clusters > (osb->bitmap_cpg - 1)) { 1516 if (min_clusters > (osb->bitmap_cpg - 1)) {
1520 /* The only paths asking for contiguousness 1517 /* The only paths asking for contiguousness
1521 * should know about this already. */ 1518 * should know about this already. */
1522 mlog(ML_ERROR, "minimum allocation requested exceeds " 1519 mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1523 "group bitmap size!"); 1520 "group bitmap size %u!\n", min_clusters,
1521 osb->bitmap_cpg);
1524 status = -ENOSPC; 1522 status = -ENOSPC;
1525 goto bail; 1523 goto bail;
1526 } 1524 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
148 struct ocfs2_alloc_context *ac); 148 struct ocfs2_alloc_context *ac);
149 149
150/* given a cluster offset, calculate which block group it belongs to
151 * and return that block offset. */
152u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
153
154/* somewhat more expensive than our other checks, so use sparingly. */
155int ocfs2_check_group_descriptor(struct super_block *sb,
156 struct ocfs2_dinode *di,
157 struct ocfs2_group_desc *gd);
150#endif /* _CHAINALLOC_H_ */ 158#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee775420665..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
65#include "sysfile.h" 65#include "sysfile.h"
66#include "uptodate.h" 66#include "uptodate.h"
67#include "ver.h" 67#include "ver.h"
68#include "vote.h"
69 68
70#include "buffer_head_io.h" 69#include "buffer_head_io.h"
71 70
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
84 83
85struct mount_options 84struct mount_options
86{ 85{
86 unsigned long commit_interval;
87 unsigned long mount_opt; 87 unsigned long mount_opt;
88 unsigned int atime_quantum; 88 unsigned int atime_quantum;
89 signed short slot; 89 signed short slot;
90 unsigned int localalloc_opt;
90}; 91};
91 92
92static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
150 Opt_data_writeback, 151 Opt_data_writeback,
151 Opt_atime_quantum, 152 Opt_atime_quantum,
152 Opt_slot, 153 Opt_slot,
154 Opt_commit,
155 Opt_localalloc,
156 Opt_localflocks,
153 Opt_err, 157 Opt_err,
154}; 158};
155 159
@@ -165,6 +169,9 @@ static match_table_t tokens = {
165 {Opt_data_writeback, "data=writeback"}, 169 {Opt_data_writeback, "data=writeback"},
166 {Opt_atime_quantum, "atime_quantum=%u"}, 170 {Opt_atime_quantum, "atime_quantum=%u"},
167 {Opt_slot, "preferred_slot=%u"}, 171 {Opt_slot, "preferred_slot=%u"},
172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"},
168 {Opt_err, NULL} 175 {Opt_err, NULL}
169}; 176};
170 177
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
213 220
214 mlog_entry_void(); 221 mlog_entry_void();
215 222
216 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 223 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
217 if (IS_ERR(new)) { 224 if (IS_ERR(new)) {
218 status = PTR_ERR(new); 225 status = PTR_ERR(new);
219 mlog_errno(status); 226 mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
221 } 228 }
222 osb->root_inode = new; 229 osb->root_inode = new;
223 230
224 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 231 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
225 if (IS_ERR(new)) { 232 if (IS_ERR(new)) {
226 status = PTR_ERR(new); 233 status = PTR_ERR(new);
227 mlog_errno(status); 234 mlog_errno(status);
@@ -443,6 +450,8 @@ unlock_osb:
443 osb->s_mount_opt = parsed_options.mount_opt; 450 osb->s_mount_opt = parsed_options.mount_opt;
444 osb->s_atime_quantum = parsed_options.atime_quantum; 451 osb->s_atime_quantum = parsed_options.atime_quantum;
445 osb->preferred_slot = parsed_options.slot; 452 osb->preferred_slot = parsed_options.slot;
453 if (parsed_options.commit_interval)
454 osb->osb_commit_interval = parsed_options.commit_interval;
446 455
447 if (!ocfs2_is_hard_readonly(osb)) 456 if (!ocfs2_is_hard_readonly(osb))
448 ocfs2_set_journal_params(osb); 457 ocfs2_set_journal_params(osb);
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
597 osb->s_mount_opt = parsed_options.mount_opt; 606 osb->s_mount_opt = parsed_options.mount_opt;
598 osb->s_atime_quantum = parsed_options.atime_quantum; 607 osb->s_atime_quantum = parsed_options.atime_quantum;
599 osb->preferred_slot = parsed_options.slot; 608 osb->preferred_slot = parsed_options.slot;
609 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt;
600 611
601 sb->s_magic = OCFS2_SUPER_MAGIC; 612 sb->s_magic = OCFS2_SUPER_MAGIC;
602 613
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
747 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 758 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
748 options ? options : "(none)"); 759 options ? options : "(none)");
749 760
761 mopt->commit_interval = 0;
750 mopt->mount_opt = 0; 762 mopt->mount_opt = 0;
751 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
752 mopt->slot = OCFS2_INVALID_SLOT; 764 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
753 766
754 if (!options) { 767 if (!options) {
755 status = 1; 768 status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
816 if (option) 829 if (option)
817 mopt->slot = (s16)option; 830 mopt->slot = (s16)option;
818 break; 831 break;
832 case Opt_commit:
833 option = 0;
834 if (match_int(&args[0], &option)) {
835 status = 0;
836 goto bail;
837 }
838 if (option < 0)
839 return 0;
840 if (option == 0)
841 option = JBD_DEFAULT_MAX_COMMIT_AGE;
842 mopt->commit_interval = HZ * option;
843 break;
844 case Opt_localalloc:
845 option = 0;
846 if (match_int(&args[0], &option)) {
847 status = 0;
848 goto bail;
849 }
850 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
851 mopt->localalloc_opt = option;
852 break;
853 case Opt_localflocks:
854 /*
855 * Changing this during remount could race
856 * flock() requests, or "unbalance" existing
857 * ones (e.g., a lock is taken in one mode but
858 * dropped in the other). If users care enough
859 * to flip locking modes during remount, we
860 * could add a "local" flag to individual
861 * flock structures for proper tracking of
862 * state.
863 */
864 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break;
819 default: 867 default:
820 mlog(ML_ERROR, 868 mlog(ML_ERROR,
821 "Unrecognized mount option \"%s\" " 869 "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 912 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 913 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866 914
915 if (osb->osb_commit_interval)
916 seq_printf(s, ",commit=%u",
917 (unsigned) (osb->osb_commit_interval / HZ));
918
919 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
920 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
921
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,");
924
867 return 0; 925 return 0;
868} 926}
869 927
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
965 goto bail; 1023 goto bail;
966 } 1024 }
967 1025
968 status = ocfs2_meta_lock(inode, &bh, 0); 1026 status = ocfs2_inode_lock(inode, &bh, 0);
969 if (status < 0) { 1027 if (status < 0) {
970 mlog_errno(status); 1028 mlog_errno(status);
971 goto bail; 1029 goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
989 1047
990 brelse(bh); 1048 brelse(bh);
991 1049
992 ocfs2_meta_unlock(inode, 0); 1050 ocfs2_inode_unlock(inode, 0);
993 status = 0; 1051 status = 0;
994bail: 1052bail:
995 if (inode) 1053 if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
1020 oi->ip_clusters = 0; 1078 oi->ip_clusters = 0;
1021 1079
1022 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1080 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1023 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 1081 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1024 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
1025 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1082 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1026 1083
1027 ocfs2_metadata_cache_init(&oi->vfs_inode); 1084 ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
1117 goto leave; 1174 goto leave;
1118 } 1175 }
1119 1176
1120 status = ocfs2_register_hb_callbacks(osb);
1121 if (status < 0) {
1122 mlog_errno(status);
1123 goto leave;
1124 }
1125
1126 status = ocfs2_dlm_init(osb); 1177 status = ocfs2_dlm_init(osb);
1127 if (status < 0) { 1178 if (status < 0) {
1128 mlog_errno(status); 1179 mlog_errno(status);
1129 goto leave; 1180 goto leave;
1130 } 1181 }
1131 1182
1132 /* requires vote_thread to be running. */
1133 status = ocfs2_register_net_handlers(osb);
1134 if (status < 0) {
1135 mlog_errno(status);
1136 goto leave;
1137 }
1138
1139 status = ocfs2_super_lock(osb, 1); 1183 status = ocfs2_super_lock(osb, 1);
1140 if (status < 0) { 1184 if (status < 0) {
1141 mlog_errno(status); 1185 mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1150 goto leave; 1194 goto leave;
1151 } 1195 }
1152 1196
1153 ocfs2_populate_mounted_map(osb);
1154
1155 /* load all node-local system inodes */ 1197 /* load all node-local system inodes */
1156 status = ocfs2_init_local_system_inodes(osb); 1198 status = ocfs2_init_local_system_inodes(osb);
1157 if (status < 0) { 1199 if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1174 if (ocfs2_mount_local(osb)) 1216 if (ocfs2_mount_local(osb))
1175 goto leave; 1217 goto leave;
1176 1218
1177 /* This should be sent *after* we recovered our journal as it
1178 * will cause other nodes to unmark us as needing
1179 * recovery. However, we need to send it *before* dropping the
1180 * super block lock as otherwise their recovery threads might
1181 * try to clean us up while we're live! */
1182 status = ocfs2_request_mount_vote(osb);
1183 if (status < 0)
1184 mlog_errno(status);
1185
1186leave: 1219leave:
1187 if (unlock_super) 1220 if (unlock_super)
1188 ocfs2_super_unlock(osb, 1); 1221 ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1240 mlog_errno(tmp); 1273 mlog_errno(tmp);
1241 return; 1274 return;
1242 } 1275 }
1243
1244 tmp = ocfs2_request_umount_vote(osb);
1245 if (tmp < 0)
1246 mlog_errno(tmp);
1247 } 1276 }
1248 1277
1249 if (osb->slot_num != OCFS2_INVALID_SLOT) 1278 if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1254 1283
1255 ocfs2_release_system_inodes(osb); 1284 ocfs2_release_system_inodes(osb);
1256 1285
1257 if (osb->dlm) { 1286 if (osb->dlm)
1258 ocfs2_unregister_net_handlers(osb);
1259
1260 ocfs2_dlm_shutdown(osb); 1287 ocfs2_dlm_shutdown(osb);
1261 }
1262
1263 ocfs2_clear_hb_callbacks(osb);
1264 1288
1265 debugfs_remove(osb->osb_debug_root); 1289 debugfs_remove(osb->osb_debug_root);
1266 1290
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1315 int i, cbits, bbits; 1339 int i, cbits, bbits;
1316 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1317 struct inode *inode = NULL; 1341 struct inode *inode = NULL;
1318 struct buffer_head *bitmap_bh = NULL;
1319 struct ocfs2_journal *journal; 1342 struct ocfs2_journal *journal;
1320 __le32 uuid_net_key; 1343 __le32 uuid_net_key;
1321 struct ocfs2_super *osb; 1344 struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1344 osb->s_sectsize_bits = blksize_bits(sector_size); 1367 osb->s_sectsize_bits = blksize_bits(sector_size);
1345 BUG_ON(!osb->s_sectsize_bits); 1368 BUG_ON(!osb->s_sectsize_bits);
1346 1369
1347 osb->net_response_ids = 0;
1348 spin_lock_init(&osb->net_response_lock);
1349 INIT_LIST_HEAD(&osb->net_response_list);
1350
1351 INIT_LIST_HEAD(&osb->osb_net_handlers);
1352 init_waitqueue_head(&osb->recovery_event); 1370 init_waitqueue_head(&osb->recovery_event);
1353 spin_lock_init(&osb->vote_task_lock); 1371 spin_lock_init(&osb->dc_task_lock);
1354 init_waitqueue_head(&osb->vote_event); 1372 init_waitqueue_head(&osb->dc_event);
1355 osb->vote_work_sequence = 0; 1373 osb->dc_work_sequence = 0;
1356 osb->vote_wake_sequence = 0; 1374 osb->dc_wake_sequence = 0;
1357 INIT_LIST_HEAD(&osb->blocked_lock_list); 1375 INIT_LIST_HEAD(&osb->blocked_lock_list);
1358 osb->blocked_lock_count = 0; 1376 osb->blocked_lock_count = 0;
1359 INIT_LIST_HEAD(&osb->vote_list);
1360 spin_lock_init(&osb->osb_lock); 1377 spin_lock_init(&osb->osb_lock);
1361 1378
1362 atomic_set(&osb->alloc_stats.moves, 0); 1379 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1496 } 1513 }
1497 1514
1498 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1515 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
1499 osb->net_key = le32_to_cpu(uuid_net_key);
1500 1516
1501 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 1517 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1502 osb->vol_label[63] = '\0'; 1518 osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1539 } 1555 }
1540 1556
1541 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1557 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1542
1543 /* We don't have a cluster lock on the bitmap here because
1544 * we're only interested in static information and the extra
1545 * complexity at mount time isn't worht it. Don't pass the
1546 * inode in to the read function though as we don't want it to
1547 * be put in the cache. */
1548 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1549 NULL);
1550 iput(inode); 1558 iput(inode);
1551 if (status < 0) {
1552 mlog_errno(status);
1553 goto bail;
1554 }
1555 1559
1556 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1560 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
1557 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1558 brelse(bitmap_bh);
1559 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1560 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
1561 1561
1562 status = ocfs2_init_slot_info(osb); 1562 status = ocfs2_init_slot_info(osb);
1563 if (status < 0) { 1563 if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
112 goto bail; 112 goto bail;
113 } 113 }
114 114
115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); 115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
116 if (IS_ERR(inode)) { 116 if (IS_ERR(inode)) {
117 mlog_errno(PTR_ERR(inode)); 117 mlog_errno(PTR_ERR(inode));
118 inode = NULL; 118 inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
29 29
30#include "ver.h" 30#include "ver.h"
31 31
32#define OCFS2_BUILD_VERSION "1.3.3" 32#define OCFS2_BUILD_VERSION "1.5.0"
33 33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35 35
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * description here
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/kthread.h>
30
31#include <cluster/heartbeat.h>
32#include <cluster/nodemanager.h>
33#include <cluster/tcp.h>
34
35#include <dlm/dlmapi.h>
36
37#define MLOG_MASK_PREFIX ML_VOTE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
41
42#include "alloc.h"
43#include "dlmglue.h"
44#include "extent_map.h"
45#include "heartbeat.h"
46#include "inode.h"
47#include "journal.h"
48#include "slot_map.h"
49#include "vote.h"
50
51#include "buffer_head_io.h"
52
53#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
54#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
55struct ocfs2_msg_hdr
56{
57 __be32 h_response_id; /* used to lookup message handle on sending
58 * node. */
59 __be32 h_request;
60 __be64 h_blkno;
61 __be32 h_generation;
62 __be32 h_node_num; /* node sending this particular message. */
63};
64
65struct ocfs2_vote_msg
66{
67 struct ocfs2_msg_hdr v_hdr;
68 __be32 v_reserved1;
69} __attribute__ ((packed));
70
71/* Responses are given these values to maintain backwards
72 * compatibility with older ocfs2 versions */
73#define OCFS2_RESPONSE_OK (0)
74#define OCFS2_RESPONSE_BUSY (-16)
75#define OCFS2_RESPONSE_BAD_MSG (-22)
76
77struct ocfs2_response_msg
78{
79 struct ocfs2_msg_hdr r_hdr;
80 __be32 r_response;
81} __attribute__ ((packed));
82
83struct ocfs2_vote_work {
84 struct list_head w_list;
85 struct ocfs2_vote_msg w_msg;
86};
87
88enum ocfs2_vote_request {
89 OCFS2_VOTE_REQ_INVALID = 0,
90 OCFS2_VOTE_REQ_MOUNT,
91 OCFS2_VOTE_REQ_UMOUNT,
92 OCFS2_VOTE_REQ_LAST
93};
94
95static inline int ocfs2_is_valid_vote_request(int request)
96{
97 return OCFS2_VOTE_REQ_INVALID < request &&
98 request < OCFS2_VOTE_REQ_LAST;
99}
100
101typedef void (*ocfs2_net_response_callback)(void *priv,
102 struct ocfs2_response_msg *resp);
103struct ocfs2_net_response_cb {
104 ocfs2_net_response_callback rc_cb;
105 void *rc_priv;
106};
107
108struct ocfs2_net_wait_ctxt {
109 struct list_head n_list;
110 u32 n_response_id;
111 wait_queue_head_t n_event;
112 struct ocfs2_node_map n_node_map;
113 int n_response; /* an agreggate response. 0 if
114 * all nodes are go, < 0 on any
115 * negative response from any
116 * node or network error. */
117 struct ocfs2_net_response_cb *n_callback;
118};
119
120static void ocfs2_process_mount_request(struct ocfs2_super *osb,
121 unsigned int node_num)
122{
123 mlog(0, "MOUNT vote from node %u\n", node_num);
124 /* The other node only sends us this message when he has an EX
125 * on the superblock, so our recovery threads (if having been
126 * launched) are waiting on it.*/
127 ocfs2_recovery_map_clear(osb, node_num);
128 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
129
130 /* We clear the umount map here because a node may have been
131 * previously mounted, safely unmounted but never stopped
132 * heartbeating - in which case we'd have a stale entry. */
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136static void ocfs2_process_umount_request(struct ocfs2_super *osb,
137 unsigned int node_num)
138{
139 mlog(0, "UMOUNT vote from node %u\n", node_num);
140 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
141 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
142}
143
144static void ocfs2_process_vote(struct ocfs2_super *osb,
145 struct ocfs2_vote_msg *msg)
146{
147 int net_status, vote_response;
148 unsigned int node_num;
149 u64 blkno;
150 enum ocfs2_vote_request request;
151 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
152 struct ocfs2_response_msg response;
153
154 /* decode the network mumbo jumbo into local variables. */
155 request = be32_to_cpu(hdr->h_request);
156 blkno = be64_to_cpu(hdr->h_blkno);
157 node_num = be32_to_cpu(hdr->h_node_num);
158
159 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
160 request, (unsigned long long)blkno, node_num);
161
162 if (!ocfs2_is_valid_vote_request(request)) {
163 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
164 request, node_num);
165 vote_response = OCFS2_RESPONSE_BAD_MSG;
166 goto respond;
167 }
168
169 vote_response = OCFS2_RESPONSE_OK;
170
171 switch (request) {
172 case OCFS2_VOTE_REQ_UMOUNT:
173 ocfs2_process_umount_request(osb, node_num);
174 goto respond;
175 case OCFS2_VOTE_REQ_MOUNT:
176 ocfs2_process_mount_request(osb, node_num);
177 goto respond;
178 default:
179 /* avoids a gcc warning */
180 break;
181 }
182
183respond:
184 /* Response struture is small so we just put it on the stack
185 * and stuff it inline. */
186 memset(&response, 0, sizeof(struct ocfs2_response_msg));
187 response.r_hdr.h_response_id = hdr->h_response_id;
188 response.r_hdr.h_blkno = hdr->h_blkno;
189 response.r_hdr.h_generation = hdr->h_generation;
190 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
191 response.r_response = cpu_to_be32(vote_response);
192
193 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
194 osb->net_key,
195 &response,
196 sizeof(struct ocfs2_response_msg),
197 node_num,
198 NULL);
199 /* We still want to error print for ENOPROTOOPT here. The
200 * sending node shouldn't have unregistered his net handler
201 * without sending an unmount vote 1st */
202 if (net_status < 0
203 && net_status != -ETIMEDOUT
204 && net_status != -ENOTCONN)
205 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
206 node_num, net_status);
207}
208
209static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
210{
211 unsigned long processed;
212 struct ocfs2_lock_res *lockres;
213 struct ocfs2_vote_work *work;
214
215 mlog_entry_void();
216
217 spin_lock(&osb->vote_task_lock);
218 /* grab this early so we know to try again if a state change and
219 * wake happens part-way through our work */
220 osb->vote_work_sequence = osb->vote_wake_sequence;
221
222 processed = osb->blocked_lock_count;
223 while (processed) {
224 BUG_ON(list_empty(&osb->blocked_lock_list));
225
226 lockres = list_entry(osb->blocked_lock_list.next,
227 struct ocfs2_lock_res, l_blocked_list);
228 list_del_init(&lockres->l_blocked_list);
229 osb->blocked_lock_count--;
230 spin_unlock(&osb->vote_task_lock);
231
232 BUG_ON(!processed);
233 processed--;
234
235 ocfs2_process_blocked_lock(osb, lockres);
236
237 spin_lock(&osb->vote_task_lock);
238 }
239
240 while (osb->vote_count) {
241 BUG_ON(list_empty(&osb->vote_list));
242 work = list_entry(osb->vote_list.next,
243 struct ocfs2_vote_work, w_list);
244 list_del(&work->w_list);
245 osb->vote_count--;
246 spin_unlock(&osb->vote_task_lock);
247
248 ocfs2_process_vote(osb, &work->w_msg);
249 kfree(work);
250
251 spin_lock(&osb->vote_task_lock);
252 }
253 spin_unlock(&osb->vote_task_lock);
254
255 mlog_exit_void();
256}
257
258static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
259{
260 int empty = 0;
261
262 spin_lock(&osb->vote_task_lock);
263 if (list_empty(&osb->blocked_lock_list) &&
264 list_empty(&osb->vote_list))
265 empty = 1;
266
267 spin_unlock(&osb->vote_task_lock);
268 return empty;
269}
270
271static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
272{
273 int should_wake = 0;
274
275 spin_lock(&osb->vote_task_lock);
276 if (osb->vote_work_sequence != osb->vote_wake_sequence)
277 should_wake = 1;
278 spin_unlock(&osb->vote_task_lock);
279
280 return should_wake;
281}
282
283int ocfs2_vote_thread(void *arg)
284{
285 int status = 0;
286 struct ocfs2_super *osb = arg;
287
288 /* only quit once we've been asked to stop and there is no more
289 * work available */
290 while (!(kthread_should_stop() &&
291 ocfs2_vote_thread_lists_empty(osb))) {
292
293 wait_event_interruptible(osb->vote_event,
294 ocfs2_vote_thread_should_wake(osb) ||
295 kthread_should_stop());
296
297 mlog(0, "vote_thread: awoken\n");
298
299 ocfs2_vote_thread_do_work(osb);
300 }
301
302 osb->vote_task = NULL;
303 return status;
304}
305
306static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
307{
308 struct ocfs2_net_wait_ctxt *w;
309
310 w = kzalloc(sizeof(*w), GFP_NOFS);
311 if (!w) {
312 mlog_errno(-ENOMEM);
313 goto bail;
314 }
315
316 INIT_LIST_HEAD(&w->n_list);
317 init_waitqueue_head(&w->n_event);
318 ocfs2_node_map_init(&w->n_node_map);
319 w->n_response_id = response_id;
320 w->n_callback = NULL;
321bail:
322 return w;
323}
324
325static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
326{
327 unsigned int ret;
328
329 spin_lock(&osb->net_response_lock);
330 ret = ++osb->net_response_ids;
331 spin_unlock(&osb->net_response_lock);
332
333 return ret;
334}
335
336static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
337 struct ocfs2_net_wait_ctxt *w)
338{
339 spin_lock(&osb->net_response_lock);
340 list_del(&w->n_list);
341 spin_unlock(&osb->net_response_lock);
342}
343
344static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
345 struct ocfs2_net_wait_ctxt *w)
346{
347 spin_lock(&osb->net_response_lock);
348 list_add_tail(&w->n_list,
349 &osb->net_response_list);
350 spin_unlock(&osb->net_response_lock);
351}
352
353static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
354 struct ocfs2_net_wait_ctxt *w,
355 int node_num)
356{
357 assert_spin_locked(&osb->net_response_lock);
358
359 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
360 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
361 wake_up(&w->n_event);
362}
363
364/* Intended to be called from the node down callback, we fake remove
365 * the node from all our response contexts */
366void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
367 int node_num)
368{
369 struct list_head *p;
370 struct ocfs2_net_wait_ctxt *w = NULL;
371
372 spin_lock(&osb->net_response_lock);
373
374 list_for_each(p, &osb->net_response_list) {
375 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
376
377 __ocfs2_mark_node_responded(osb, w, node_num);
378 }
379
380 spin_unlock(&osb->net_response_lock);
381}
382
383static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
384 struct ocfs2_vote_msg *request,
385 unsigned int response_id,
386 int *response,
387 struct ocfs2_net_response_cb *callback)
388{
389 int status, i, remote_err;
390 struct ocfs2_net_wait_ctxt *w = NULL;
391 int dequeued = 0;
392
393 mlog_entry_void();
394
395 w = ocfs2_new_net_wait_ctxt(response_id);
396 if (!w) {
397 status = -ENOMEM;
398 mlog_errno(status);
399 goto bail;
400 }
401 w->n_callback = callback;
402
403 /* we're pretty much ready to go at this point, and this fills
404 * in n_response which we need anyway... */
405 ocfs2_queue_net_wait_ctxt(osb, w);
406
407 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
408
409 while (i != O2NM_INVALID_NODE_NUM) {
410 if (i != osb->node_num) {
411 mlog(0, "trying to send request to node %i\n", i);
412 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
413
414 remote_err = 0;
415 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
416 osb->net_key,
417 request,
418 sizeof(*request),
419 i,
420 &remote_err);
421 if (status == -ETIMEDOUT) {
422 mlog(0, "remote node %d timed out!\n", i);
423 status = -EAGAIN;
424 goto bail;
425 }
426 if (remote_err < 0) {
427 status = remote_err;
428 mlog(0, "remote error %d on node %d!\n",
429 remote_err, i);
430 mlog_errno(status);
431 goto bail;
432 }
433 if (status < 0) {
434 mlog_errno(status);
435 goto bail;
436 }
437 }
438 i++;
439 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
440 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
441 }
442 mlog(0, "done sending, now waiting on responses...\n");
443
444 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
445
446 ocfs2_dequeue_net_wait_ctxt(osb, w);
447 dequeued = 1;
448
449 *response = w->n_response;
450 status = 0;
451bail:
452 if (w) {
453 if (!dequeued)
454 ocfs2_dequeue_net_wait_ctxt(osb, w);
455 kfree(w);
456 }
457
458 mlog_exit(status);
459 return status;
460}
461
462static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
463 u64 blkno,
464 unsigned int generation,
465 enum ocfs2_vote_request type)
466{
467 struct ocfs2_vote_msg *request;
468 struct ocfs2_msg_hdr *hdr;
469
470 BUG_ON(!ocfs2_is_valid_vote_request(type));
471
472 request = kzalloc(sizeof(*request), GFP_NOFS);
473 if (!request) {
474 mlog_errno(-ENOMEM);
475 } else {
476 hdr = &request->v_hdr;
477 hdr->h_node_num = cpu_to_be32(osb->node_num);
478 hdr->h_request = cpu_to_be32(type);
479 hdr->h_blkno = cpu_to_be64(blkno);
480 hdr->h_generation = cpu_to_be32(generation);
481 }
482
483 return request;
484}
485
486/* Complete the buildup of a new vote request and process the
487 * broadcast return value. */
488static int ocfs2_do_request_vote(struct ocfs2_super *osb,
489 struct ocfs2_vote_msg *request,
490 struct ocfs2_net_response_cb *callback)
491{
492 int status, response = -EBUSY;
493 unsigned int response_id;
494 struct ocfs2_msg_hdr *hdr;
495
496 response_id = ocfs2_new_response_id(osb);
497
498 hdr = &request->v_hdr;
499 hdr->h_response_id = cpu_to_be32(response_id);
500
501 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
502 callback);
503 if (status < 0) {
504 mlog_errno(status);
505 goto bail;
506 }
507
508 status = response;
509bail:
510
511 return status;
512}
513
514int ocfs2_request_mount_vote(struct ocfs2_super *osb)
515{
516 int status;
517 struct ocfs2_vote_msg *request = NULL;
518
519 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
520 if (!request) {
521 status = -ENOMEM;
522 goto bail;
523 }
524
525 status = -EAGAIN;
526 while (status == -EAGAIN) {
527 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
528 signal_pending(current)) {
529 status = -ERESTARTSYS;
530 goto bail;
531 }
532
533 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
534 osb->node_num)) {
535 status = 0;
536 goto bail;
537 }
538
539 status = ocfs2_do_request_vote(osb, request, NULL);
540 }
541
542bail:
543 kfree(request);
544 return status;
545}
546
547int ocfs2_request_umount_vote(struct ocfs2_super *osb)
548{
549 int status;
550 struct ocfs2_vote_msg *request = NULL;
551
552 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
553 if (!request) {
554 status = -ENOMEM;
555 goto bail;
556 }
557
558 status = -EAGAIN;
559 while (status == -EAGAIN) {
560 /* Do not check signals on this vote... We really want
561 * this one to go all the way through. */
562
563 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
564 osb->node_num)) {
565 status = 0;
566 goto bail;
567 }
568
569 status = ocfs2_do_request_vote(osb, request, NULL);
570 }
571
572bail:
573 kfree(request);
574 return status;
575}
576
577/* TODO: This should eventually be a hash table! */
578static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
579 u32 response_id)
580{
581 struct list_head *p;
582 struct ocfs2_net_wait_ctxt *w = NULL;
583
584 list_for_each(p, &osb->net_response_list) {
585 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
586 if (response_id == w->n_response_id)
587 break;
588 w = NULL;
589 }
590
591 return w;
592}
593
594/* Translate response codes into local node errno values */
595static inline int ocfs2_translate_response(int response)
596{
597 int ret;
598
599 switch (response) {
600 case OCFS2_RESPONSE_OK:
601 ret = 0;
602 break;
603
604 case OCFS2_RESPONSE_BUSY:
605 ret = -EBUSY;
606 break;
607
608 default:
609 ret = -EINVAL;
610 }
611
612 return ret;
613}
614
615static int ocfs2_handle_response_message(struct o2net_msg *msg,
616 u32 len,
617 void *data, void **ret_data)
618{
619 unsigned int response_id, node_num;
620 int response_status;
621 struct ocfs2_super *osb = data;
622 struct ocfs2_response_msg *resp;
623 struct ocfs2_net_wait_ctxt * w;
624 struct ocfs2_net_response_cb *resp_cb;
625
626 resp = (struct ocfs2_response_msg *) msg->buf;
627
628 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
629 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
630 response_status =
631 ocfs2_translate_response(be32_to_cpu(resp->r_response));
632
633 mlog(0, "received response message:\n");
634 mlog(0, "h_response_id = %u\n", response_id);
635 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
636 mlog(0, "h_blkno = %llu\n",
637 (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
638 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
639 mlog(0, "h_node_num = %u\n", node_num);
640 mlog(0, "r_response = %d\n", response_status);
641
642 spin_lock(&osb->net_response_lock);
643 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
644 if (!w) {
645 mlog(0, "request not found!\n");
646 goto bail;
647 }
648 resp_cb = w->n_callback;
649
650 if (response_status && (!w->n_response)) {
651 /* we only really need one negative response so don't
652 * set it twice. */
653 w->n_response = response_status;
654 }
655
656 if (resp_cb) {
657 spin_unlock(&osb->net_response_lock);
658
659 resp_cb->rc_cb(resp_cb->rc_priv, resp);
660
661 spin_lock(&osb->net_response_lock);
662 }
663
664 __ocfs2_mark_node_responded(osb, w, node_num);
665bail:
666 spin_unlock(&osb->net_response_lock);
667
668 return 0;
669}
670
671static int ocfs2_handle_vote_message(struct o2net_msg *msg,
672 u32 len,
673 void *data, void **ret_data)
674{
675 int status;
676 struct ocfs2_super *osb = data;
677 struct ocfs2_vote_work *work;
678
679 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
680 if (!work) {
681 status = -ENOMEM;
682 mlog_errno(status);
683 goto bail;
684 }
685
686 INIT_LIST_HEAD(&work->w_list);
687 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
688
689 mlog(0, "scheduling vote request:\n");
690 mlog(0, "h_response_id = %u\n",
691 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
692 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
693 mlog(0, "h_blkno = %llu\n",
694 (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
695 mlog(0, "h_generation = %u\n",
696 be32_to_cpu(work->w_msg.v_hdr.h_generation));
697 mlog(0, "h_node_num = %u\n",
698 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
699
700 spin_lock(&osb->vote_task_lock);
701 list_add_tail(&work->w_list, &osb->vote_list);
702 osb->vote_count++;
703 spin_unlock(&osb->vote_task_lock);
704
705 ocfs2_kick_vote_thread(osb);
706
707 status = 0;
708bail:
709 return status;
710}
711
712void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
713{
714 if (!osb->net_key)
715 return;
716
717 o2net_unregister_handler_list(&osb->osb_net_handlers);
718
719 if (!list_empty(&osb->net_response_list))
720 mlog(ML_ERROR, "net response list not empty!\n");
721
722 osb->net_key = 0;
723}
724
725int ocfs2_register_net_handlers(struct ocfs2_super *osb)
726{
727 int status = 0;
728
729 if (ocfs2_mount_local(osb))
730 return 0;
731
732 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
733 osb->net_key,
734 sizeof(struct ocfs2_response_msg),
735 ocfs2_handle_response_message,
736 osb, NULL, &osb->osb_net_handlers);
737 if (status) {
738 mlog_errno(status);
739 goto bail;
740 }
741
742 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
743 osb->net_key,
744 sizeof(struct ocfs2_vote_msg),
745 ocfs2_handle_vote_message,
746 osb, NULL, &osb->osb_net_handlers);
747 if (status) {
748 mlog_errno(status);
749 goto bail;
750 }
751bail:
752 if (status < 0)
753 ocfs2_unregister_net_handlers(osb);
754
755 return status;
756}
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index f30fa92a44a1..bd694f779346 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -49,6 +49,7 @@ header-y += comstats.h
49header-y += const.h 49header-y += const.h
50header-y += cgroupstats.h 50header-y += cgroupstats.h
51header-y += cycx_cfm.h 51header-y += cycx_cfm.h
52header-y += dlmconstants.h
52header-y += dlm_device.h 53header-y += dlm_device.h
53header-y += dlm_netlink.h 54header-y += dlm_netlink.h
54header-y += dm-ioctl.h 55header-y += dm-ioctl.h
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index be9d278761e0..c743fbc769db 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -19,148 +19,12 @@
19 * routines and structures to use DLM lockspaces 19 * routines and structures to use DLM lockspaces
20 */ 20 */
21 21
22/* 22/* Lock levels and flags are here */
23 * Lock Modes 23#include <linux/dlmconstants.h>
24 */
25 24
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37 25
38#define DLM_RESNAME_MAXLEN 64 26#define DLM_RESNAME_MAXLEN 64
39 27
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 *
90 * Do not cancel the lock if it gets into conversion deadlock.
91 * Exclude this lock from being monitored due to DLM_LSFL_TIMEWARN.
92 *
93 * DLM_LKF_NODLCKBLK
94 *
95 * net yet implemented
96 *
97 * DLM_LKF_EXPEDITE
98 *
99 * Used only with new requests for NL mode locks. Tells the lock manager
100 * to grant the lock, ignoring other locks in convert and wait queues.
101 *
102 * DLM_LKF_NOQUEUEBAST
103 *
104 * Send blocking AST's before returning -EAGAIN to the caller. It is only
105 * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
106 * NOQUEUE requests otherwise.
107 *
108 * DLM_LKF_HEADQUE
109 *
110 * Add a lock to the head of the convert or wait queue rather than the tail.
111 *
112 * DLM_LKF_NOORDER
113 *
114 * Disregard the standard grant order rules and grant a lock as soon as it
115 * is compatible with other granted locks.
116 *
117 * DLM_LKF_ORPHAN
118 *
119 * not yet implemented
120 *
121 * DLM_LKF_ALTPR
122 *
123 * If the requested mode cannot be granted immediately, try to grant the lock
124 * in PR mode instead. If this alternate mode is granted instead of the
125 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
126 *
127 * DLM_LKF_ALTCW
128 *
129 * The same as ALTPR, but the alternate mode is CW.
130 *
131 * DLM_LKF_FORCEUNLOCK
132 *
133 * Unlock the lock even if it is converting or waiting or has sublocks.
134 * Only really for use by the userland device.c code.
135 *
136 */
137
138#define DLM_LKF_NOQUEUE 0x00000001
139#define DLM_LKF_CANCEL 0x00000002
140#define DLM_LKF_CONVERT 0x00000004
141#define DLM_LKF_VALBLK 0x00000008
142#define DLM_LKF_QUECVT 0x00000010
143#define DLM_LKF_IVVALBLK 0x00000020
144#define DLM_LKF_CONVDEADLK 0x00000040
145#define DLM_LKF_PERSISTENT 0x00000080
146#define DLM_LKF_NODLCKWT 0x00000100
147#define DLM_LKF_NODLCKBLK 0x00000200
148#define DLM_LKF_EXPEDITE 0x00000400
149#define DLM_LKF_NOQUEUEBAST 0x00000800
150#define DLM_LKF_HEADQUE 0x00001000
151#define DLM_LKF_NOORDER 0x00002000
152#define DLM_LKF_ORPHAN 0x00004000
153#define DLM_LKF_ALTPR 0x00008000
154#define DLM_LKF_ALTCW 0x00010000
155#define DLM_LKF_FORCEUNLOCK 0x00020000
156#define DLM_LKF_TIMEOUT 0x00040000
157
158/*
159 * Some return codes that are not in errno.h
160 */
161
162#define DLM_ECANCEL 0x10001
163#define DLM_EUNLOCK 0x10002
164 28
165typedef void dlm_lockspace_t; 29typedef void dlm_lockspace_t;
166 30
diff --git a/include/linux/dlmconstants.h b/include/linux/dlmconstants.h
new file mode 100644
index 000000000000..fddb3d3ff321
--- /dev/null
+++ b/include/linux/dlmconstants.h
@@ -0,0 +1,159 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLMCONSTANTS_DOT_H__
15#define __DLMCONSTANTS_DOT_H__
16
17/*
18 * Constants used by DLM interface.
19 */
20
21/*
22 * Lock Modes
23 */
24
25#define DLM_LOCK_IV (-1) /* invalid */
26#define DLM_LOCK_NL 0 /* null */
27#define DLM_LOCK_CR 1 /* concurrent read */
28#define DLM_LOCK_CW 2 /* concurrent write */
29#define DLM_LOCK_PR 3 /* protected read */
30#define DLM_LOCK_PW 4 /* protected write */
31#define DLM_LOCK_EX 5 /* exclusive */
32
33
34/*
35 * Flags to dlm_lock
36 *
37 * DLM_LKF_NOQUEUE
38 *
39 * Do not queue the lock request on the wait queue if it cannot be granted
40 * immediately. If the lock cannot be granted because of this flag, DLM will
41 * either return -EAGAIN from the dlm_lock call or will return 0 from
42 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
43 *
44 * DLM_LKF_CANCEL
45 *
46 * Used to cancel a pending lock request or conversion. A converting lock is
47 * returned to its previously granted mode.
48 *
49 * DLM_LKF_CONVERT
50 *
51 * Indicates a lock conversion request. For conversions the name and namelen
52 * are ignored and the lock ID in the LKSB is used to identify the lock.
53 *
54 * DLM_LKF_VALBLK
55 *
56 * Requests DLM to return the current contents of the lock value block in the
57 * lock status block. When this flag is set in a lock conversion from PW or EX
58 * modes, DLM assigns the value specified in the lock status block to the lock
59 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
60 * containing application-specific information.
61 *
62 * DLM_LKF_QUECVT
63 *
64 * Force a conversion request to be queued, even if it is compatible with
65 * the granted modes of other locks on the same resource.
66 *
67 * DLM_LKF_IVVALBLK
68 *
69 * Invalidate the lock value block.
70 *
71 * DLM_LKF_CONVDEADLK
72 *
73 * Allows the dlm to resolve conversion deadlocks internally by demoting the
74 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
75 * returned for a conversion that's been effected by this.
76 *
77 * DLM_LKF_PERSISTENT
78 *
79 * Only relevant to locks originating in userspace. A persistent lock will not
80 * be removed if the process holding the lock exits.
81 *
82 * DLM_LKF_NODLCKWT
83 *
84 * Do not cancel the lock if it gets into conversion deadlock.
85 * Exclude this lock from being monitored due to DLM_LSFL_TIMEWARN.
86 *
87 * DLM_LKF_NODLCKBLK
88 *
89 * net yet implemented
90 *
91 * DLM_LKF_EXPEDITE
92 *
93 * Used only with new requests for NL mode locks. Tells the lock manager
94 * to grant the lock, ignoring other locks in convert and wait queues.
95 *
96 * DLM_LKF_NOQUEUEBAST
97 *
98 * Send blocking AST's before returning -EAGAIN to the caller. It is only
99 * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
100 * NOQUEUE requests otherwise.
101 *
102 * DLM_LKF_HEADQUE
103 *
104 * Add a lock to the head of the convert or wait queue rather than the tail.
105 *
106 * DLM_LKF_NOORDER
107 *
108 * Disregard the standard grant order rules and grant a lock as soon as it
109 * is compatible with other granted locks.
110 *
111 * DLM_LKF_ORPHAN
112 *
113 * not yet implemented
114 *
115 * DLM_LKF_ALTPR
116 *
117 * If the requested mode cannot be granted immediately, try to grant the lock
118 * in PR mode instead. If this alternate mode is granted instead of the
119 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
120 *
121 * DLM_LKF_ALTCW
122 *
123 * The same as ALTPR, but the alternate mode is CW.
124 *
125 * DLM_LKF_FORCEUNLOCK
126 *
127 * Unlock the lock even if it is converting or waiting or has sublocks.
128 * Only really for use by the userland device.c code.
129 *
130 */
131
132#define DLM_LKF_NOQUEUE 0x00000001
133#define DLM_LKF_CANCEL 0x00000002
134#define DLM_LKF_CONVERT 0x00000004
135#define DLM_LKF_VALBLK 0x00000008
136#define DLM_LKF_QUECVT 0x00000010
137#define DLM_LKF_IVVALBLK 0x00000020
138#define DLM_LKF_CONVDEADLK 0x00000040
139#define DLM_LKF_PERSISTENT 0x00000080
140#define DLM_LKF_NODLCKWT 0x00000100
141#define DLM_LKF_NODLCKBLK 0x00000200
142#define DLM_LKF_EXPEDITE 0x00000400
143#define DLM_LKF_NOQUEUEBAST 0x00000800
144#define DLM_LKF_HEADQUE 0x00001000
145#define DLM_LKF_NOORDER 0x00002000
146#define DLM_LKF_ORPHAN 0x00004000
147#define DLM_LKF_ALTPR 0x00008000
148#define DLM_LKF_ALTCW 0x00010000
149#define DLM_LKF_FORCEUNLOCK 0x00020000
150#define DLM_LKF_TIMEOUT 0x00040000
151
152/*
153 * Some return codes that are not in errno.h
154 */
155
156#define DLM_ECANCEL 0x10001
157#define DLM_EUNLOCK 0x10002
158
159#endif /* __DLMCONSTANTS_DOT_H__ */