aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Makefile5
-rw-r--r--fs/ocfs2/alloc.c8
-rw-r--r--fs/ocfs2/aops.c137
-rw-r--r--fs/ocfs2/buffer_head_io.c65
-rw-r--r--fs/ocfs2/buffer_head_io.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h8
-rw-r--r--fs/ocfs2/cluster/ver.c2
-rw-r--r--fs/ocfs2/dcache.c8
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c2
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c19
-rw-r--r--fs/ocfs2/dlm/dlmver.c2
-rw-r--r--fs/ocfs2/dlmglue.c546
-rw-r--r--fs/ocfs2/dlmglue.h31
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/file.c163
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/heartbeat.c80
-rw-r--r--fs/ocfs2/heartbeat.h2
-rw-r--r--fs/ocfs2/inode.c84
-rw-r--r--fs/ocfs2/inode.h10
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/journal.c51
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/localalloc.c50
-rw-r--r--fs/ocfs2/locks.c125
-rw-r--r--fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h)29
-rw-r--r--fs/ocfs2/mmap.c17
-rw-r--r--fs/ocfs2/namei.c66
-rw-r--r--fs/ocfs2/ocfs2.h35
-rw-r--r--fs/ocfs2/ocfs2_fs.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/resize.c634
-rw-r--r--fs/ocfs2/resize.h32
-rw-r--r--fs/ocfs2/slot_map.c19
-rw-r--r--fs/ocfs2/slot_map.h2
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/suballoc.h8
-rw-r--r--fs/ocfs2/super.c140
-rw-r--r--fs/ocfs2/sysfile.c2
-rw-r--r--fs/ocfs2/ver.c2
-rw-r--r--fs/ocfs2/vote.c756
45 files changed, 1819 insertions, 1440 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
19 ioctl.o \ 19 ioctl.o \
20 journal.o \ 20 journal.o \
21 localalloc.o \ 21 localalloc.o \
22 locks.o \
22 mmap.o \ 23 mmap.o \
23 namei.o \ 24 namei.o \
25 resize.o \
24 slot_map.o \ 26 slot_map.o \
25 suballoc.o \ 27 suballoc.o \
26 super.o \ 28 super.o \
27 symlink.o \ 29 symlink.o \
28 sysfile.o \ 30 sysfile.o \
29 uptodate.o \ 31 uptodate.o \
30 ver.o \ 32 ver.o
31 vote.o
32 33
33obj-$(CONFIG_OCFS2_FS) += cluster/ 34obj-$(CONFIG_OCFS2_FS) += cluster/
34obj-$(CONFIG_OCFS2_FS) += dlm/ 35obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 23c8cda43f19..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4731 4731
4732 mutex_lock(&data_alloc_inode->i_mutex); 4732 mutex_lock(&data_alloc_inode->i_mutex);
4733 4733
4734 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 4734 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
4735 if (status < 0) { 4735 if (status < 0) {
4736 mlog_errno(status); 4736 mlog_errno(status);
4737 goto out_mutex; 4737 goto out_mutex;
@@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4753 4753
4754out_unlock: 4754out_unlock:
4755 brelse(data_alloc_bh); 4755 brelse(data_alloc_bh);
4756 ocfs2_meta_unlock(data_alloc_inode, 1); 4756 ocfs2_inode_unlock(data_alloc_inode, 1);
4757 4757
4758out_mutex: 4758out_mutex:
4759 mutex_unlock(&data_alloc_inode->i_mutex); 4759 mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5077 5077
5078 mutex_lock(&inode->i_mutex); 5078 mutex_lock(&inode->i_mutex);
5079 5079
5080 ret = ocfs2_meta_lock(inode, &di_bh, 1); 5080 ret = ocfs2_inode_lock(inode, &di_bh, 1);
5081 if (ret) { 5081 if (ret) {
5082 mlog_errno(ret); 5082 mlog_errno(ret);
5083 goto out_mutex; 5083 goto out_mutex;
@@ -5118,7 +5118,7 @@ out_journal:
5118 ocfs2_commit_trans(osb, handle); 5118 ocfs2_commit_trans(osb, handle);
5119 5119
5120out_unlock: 5120out_unlock:
5121 ocfs2_meta_unlock(inode, 1); 5121 ocfs2_inode_unlock(inode, 1);
5122 brelse(di_bh); 5122 brelse(di_bh);
5123out_mutex: 5123out_mutex:
5124 mutex_unlock(&inode->i_mutex); 5124 mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad46..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
29 30
30#define MLOG_MASK_PREFIX ML_FILE_IO 31#define MLOG_MASK_PREFIX ML_FILE_IO
31#include <cluster/masklog.h> 32#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139{ 140{
140 int err = 0; 141 int err = 0;
141 unsigned int ext_flags; 142 unsigned int ext_flags;
142 u64 p_blkno, past_eof; 143 u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144 u64 p_blkno, count, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 146
145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 147 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
155 goto bail; 157 goto bail;
156 } 158 }
157 159
158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 160 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
159 &ext_flags); 161 &ext_flags);
160 if (err) { 162 if (err) {
161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 163 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
164 goto bail; 166 goto bail;
165 } 167 }
166 168
169 if (max_blocks < count)
170 count = max_blocks;
171
167 /* 172 /*
168 * ocfs2 never allocates in this function - the only time we 173 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno); 184 map_bh(bh_result, inode->i_sb, p_blkno);
180 185
186 bh_result->b_size = count << inode->i_blkbits;
187
181 if (!ocfs2_sparse_alloc(osb)) { 188 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) { 189 if (p_blkno == 0) {
183 err = -EIO; 190 err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh) 217 struct buffer_head *di_bh)
211{ 218{
212 void *kaddr; 219 void *kaddr;
213 unsigned int size; 220 loff_t size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 221 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215 222
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 223 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224 if (size > PAGE_CACHE_SIZE || 231 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) { 232 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb, 233 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u", 234 "Inode %llu has with inline data has bad size: %Lu",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size); 235 (unsigned long long)OCFS2_I(inode)->ip_blkno,
236 (unsigned long long)size);
229 return -EROFS; 237 return -EROFS;
230 } 238 }
231 239
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
275 283
276 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 284 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
277 285
278 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 286 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
279 if (ret != 0) { 287 if (ret != 0) {
280 if (ret == AOP_TRUNCATED_PAGE) 288 if (ret == AOP_TRUNCATED_PAGE)
281 unlock = 0; 289 unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
285 293
286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 294 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
287 ret = AOP_TRUNCATED_PAGE; 295 ret = AOP_TRUNCATED_PAGE;
288 goto out_meta_unlock; 296 goto out_inode_unlock;
289 } 297 }
290 298
291 /* 299 /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
305 goto out_alloc; 313 goto out_alloc;
306 } 314 }
307 315
308 ret = ocfs2_data_lock_with_page(inode, 0, page);
309 if (ret != 0) {
310 if (ret == AOP_TRUNCATED_PAGE)
311 unlock = 0;
312 mlog_errno(ret);
313 goto out_alloc;
314 }
315
316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page); 317 ret = ocfs2_readpage_inline(inode, page);
318 else 318 else
319 ret = block_read_full_page(page, ocfs2_get_block); 319 ret = block_read_full_page(page, ocfs2_get_block);
320 unlock = 0; 320 unlock = 0;
321 321
322 ocfs2_data_unlock(inode, 0);
323out_alloc: 322out_alloc:
324 up_read(&OCFS2_I(inode)->ip_alloc_sem); 323 up_read(&OCFS2_I(inode)->ip_alloc_sem);
325out_meta_unlock: 324out_inode_unlock:
326 ocfs2_meta_unlock(inode, 0); 325 ocfs2_inode_unlock(inode, 0);
327out: 326out:
328 if (unlock) 327 if (unlock)
329 unlock_page(page); 328 unlock_page(page);
@@ -331,6 +330,62 @@ out:
331 return ret; 330 return ret;
332} 331}
333 332
333/*
334 * This is used only for read-ahead. Failures or difficult to handle
335 * situations are safe to ignore.
336 *
337 * Right now, we don't bother with BH_Boundary - in-inode extent lists
338 * are quite large (243 extents on 4k blocks), so most inodes don't
339 * grow out to a tree. If need be, detecting boundary extents could
340 * trivially be added in a future version of ocfs2_get_block().
341 */
342static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
343 struct list_head *pages, unsigned nr_pages)
344{
345 int ret, err = -EIO;
346 struct inode *inode = mapping->host;
347 struct ocfs2_inode_info *oi = OCFS2_I(inode);
348 loff_t start;
349 struct page *last;
350
351 /*
352 * Use the nonblocking flag for the dlm code to avoid page
353 * lock inversion, but don't bother with retrying.
354 */
355 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
356 if (ret)
357 return err;
358
359 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
360 ocfs2_inode_unlock(inode, 0);
361 return err;
362 }
363
364 /*
365 * Don't bother with inline-data. There isn't anything
366 * to read-ahead in that case anyway...
367 */
368 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
369 goto out_unlock;
370
371 /*
372 * Check whether a remote node truncated this file - we just
373 * drop out in that case as it's not worth handling here.
374 */
375 last = list_entry(pages->prev, struct page, lru);
376 start = (loff_t)last->index << PAGE_CACHE_SHIFT;
377 if (start >= i_size_read(inode))
378 goto out_unlock;
379
380 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
381
382out_unlock:
383 up_read(&oi->ip_alloc_sem);
384 ocfs2_inode_unlock(inode, 0);
385
386 return err;
387}
388
334/* Note: Because we don't support holes, our allocation has 389/* Note: Because we don't support holes, our allocation has
335 * already happened (allocation writes zeros to the file data) 390 * already happened (allocation writes zeros to the file data)
336 * so we don't have to worry about ordered writes in 391 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
452 * accessed concurrently from multiple nodes. 507 * accessed concurrently from multiple nodes.
453 */ 508 */
454 if (!INODE_JOURNAL(inode)) { 509 if (!INODE_JOURNAL(inode)) {
455 err = ocfs2_meta_lock(inode, NULL, 0); 510 err = ocfs2_inode_lock(inode, NULL, 0);
456 if (err) { 511 if (err) {
457 if (err != -ENOENT) 512 if (err != -ENOENT)
458 mlog_errno(err); 513 mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
467 522
468 if (!INODE_JOURNAL(inode)) { 523 if (!INODE_JOURNAL(inode)) {
469 up_read(&OCFS2_I(inode)->ip_alloc_sem); 524 up_read(&OCFS2_I(inode)->ip_alloc_sem);
470 ocfs2_meta_unlock(inode, 0); 525 ocfs2_inode_unlock(inode, 0);
471 } 526 }
472 527
473 if (err) { 528 if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0; 694 return 0;
640 695
641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
642 /*
643 * We get PR data locks even for O_DIRECT. This
644 * allows concurrent O_DIRECT I/O but doesn't let
645 * O_DIRECT with extending and buffered zeroing writes
646 * race. If they did race then the buffered zeroing
647 * could be written back after the O_DIRECT I/O. It's
648 * one thing to tell people not to mix buffered and
649 * O_DIRECT writes, but expecting them to understand
650 * that file extension is also an implicit buffered
651 * write is too much. By getting the PR we force
652 * writeback of the buffered zeroing before
653 * proceeding.
654 */
655 ret = ocfs2_data_lock(inode, 0);
656 if (ret < 0) {
657 mlog_errno(ret);
658 goto out;
659 }
660 ocfs2_data_unlock(inode, 0);
661 }
662
663 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 696 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
664 inode->i_sb->s_bdev, iov, offset, 697 inode->i_sb->s_bdev, iov, offset,
665 nr_segs, 698 nr_segs,
666 ocfs2_direct_IO_get_blocks, 699 ocfs2_direct_IO_get_blocks,
667 ocfs2_dio_end_io); 700 ocfs2_dio_end_io);
668out: 701
669 mlog_exit(ret); 702 mlog_exit(ret);
670 return ret; 703 return ret;
671} 704}
@@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1754 struct buffer_head *di_bh = NULL; 1787 struct buffer_head *di_bh = NULL;
1755 struct inode *inode = mapping->host; 1788 struct inode *inode = mapping->host;
1756 1789
1757 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1790 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1758 if (ret) { 1791 if (ret) {
1759 mlog_errno(ret); 1792 mlog_errno(ret);
1760 return ret; 1793 return ret;
@@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1769 */ 1802 */
1770 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1803 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1771 1804
1772 ret = ocfs2_data_lock(inode, 1);
1773 if (ret) {
1774 mlog_errno(ret);
1775 goto out_fail;
1776 }
1777
1778 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1805 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1779 fsdata, di_bh, NULL); 1806 fsdata, di_bh, NULL);
1780 if (ret) { 1807 if (ret) {
1781 mlog_errno(ret); 1808 mlog_errno(ret);
1782 goto out_fail_data; 1809 goto out_fail;
1783 } 1810 }
1784 1811
1785 brelse(di_bh); 1812 brelse(di_bh);
1786 1813
1787 return 0; 1814 return 0;
1788 1815
1789out_fail_data:
1790 ocfs2_data_unlock(inode, 1);
1791out_fail: 1816out_fail:
1792 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1817 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1793 1818
1794 brelse(di_bh); 1819 brelse(di_bh);
1795 ocfs2_meta_unlock(inode, 1); 1820 ocfs2_inode_unlock(inode, 1);
1796 1821
1797 return ret; 1822 return ret;
1798} 1823}
@@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1908 1933
1909 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1934 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1910 1935
1911 ocfs2_data_unlock(inode, 1);
1912 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1936 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1913 ocfs2_meta_unlock(inode, 1); 1937 ocfs2_inode_unlock(inode, 1);
1914 1938
1915 return ret; 1939 return ret;
1916} 1940}
1917 1941
1918const struct address_space_operations ocfs2_aops = { 1942const struct address_space_operations ocfs2_aops = {
1919 .readpage = ocfs2_readpage, 1943 .readpage = ocfs2_readpage,
1944 .readpages = ocfs2_readpages,
1920 .writepage = ocfs2_writepage, 1945 .writepage = ocfs2_writepage,
1921 .write_begin = ocfs2_write_begin, 1946 .write_begin = ocfs2_write_begin,
1922 .write_end = ocfs2_write_end, 1947 .write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
79 * information for this bh as it's not marked locally 79 * information for this bh as it's not marked locally
80 * uptodate. */ 80 * uptodate. */
81 ret = -EIO; 81 ret = -EIO;
82 brelse(bh); 82 put_bh(bh);
83 } 83 }
84 84
85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
256 * for this bh as it's not marked locally 256 * for this bh as it's not marked locally
257 * uptodate. */ 257 * uptodate. */
258 status = -EIO; 258 status = -EIO;
259 brelse(bh); 259 put_bh(bh);
260 bhs[i] = NULL; 260 bhs[i] = NULL;
261 continue; 261 continue;
262 } 262 }
@@ -280,3 +280,64 @@ bail:
280 mlog_exit(status); 280 mlog_exit(status);
281 return status; 281 return status;
282} 282}
283
284/* Check whether the blkno is the super block or one of the backups. */
285static void ocfs2_check_super_or_backup(struct super_block *sb,
286 sector_t blkno)
287{
288 int i;
289 u64 backup_blkno;
290
291 if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
292 return;
293
294 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
295 backup_blkno = ocfs2_backup_super_blkno(sb, i);
296 if (backup_blkno == blkno)
297 return;
298 }
299
300 BUG();
301}
302
303/*
304 * Write super block and backups doesn't need to collaborate with journal,
305 * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
306 * into this function.
307 */
308int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
309 struct buffer_head *bh)
310{
311 int ret = 0;
312
313 mlog_entry_void();
314
315 BUG_ON(buffer_jbd(bh));
316 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
317
318 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
319 ret = -EROFS;
320 goto out;
321 }
322
323 lock_buffer(bh);
324 set_buffer_uptodate(bh);
325
326 /* remove from dirty list before I/O. */
327 clear_buffer_dirty(bh);
328
329 get_bh(bh); /* for end_buffer_write_sync() */
330 bh->b_end_io = end_buffer_write_sync;
331 submit_bh(WRITE, bh);
332
333 wait_on_buffer(bh);
334
335 if (!buffer_uptodate(bh)) {
336 ret = -EIO;
337 put_bh(bh);
338 }
339
340out:
341 mlog_exit(ret);
342 return ret;
343}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
47 int flags, 47 int flags,
48 struct inode *inode); 48 struct inode *inode);
49 49
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh);
50 52
51#define OCFS2_BH_CACHED 1 53#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 54#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
35#define O2HB_LIVE_THRESHOLD 2 35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold; 37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7 38#define O2HB_DEFAULT_DEAD_THRESHOLD 31
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ 39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2 40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) 41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
60/* same as hb delay, we're waiting for another node to recognize our hb */ 60/* same as hb delay, we're waiting for another node to recognize our hb */
61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
62 62
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66 66
67/* TODO: figure this out.... */ 67/* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 10:
42 * - Meta/data locks combined
43 *
44 * New in version 9:
45 * - All votes removed
46 *
41 * New in version 8: 47 * New in version 8:
42 * - Replace delete inode votes with a cluster lock 48 * - Replace delete inode votes with a cluster lock
43 * 49 *
@@ -60,7 +66,7 @@
60 * - full 64 bit i_size in the metadata lock lvbs 66 * - full 64 bit i_size in the metadata lock lvbs
61 * - introduction of "rw" lock and pushing meta/data locking down 67 * - introduction of "rw" lock and pushing meta/data locking down
62 */ 68 */
63#define O2NET_PROTOCOL_VERSION 8ULL 69#define O2NET_PROTOCOL_VERSION 10ULL
64struct o2net_handshake { 70struct o2net_handshake {
65 __be64 protocol_version; 71 __be64 protocol_version;
66 __be64 connector_id; 72 __be64 connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
28 28
29#include "ver.h" 29#include "ver.h"
30 30
31#define CLUSTER_BUILD_VERSION "1.3.3" 31#define CLUSTER_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d4..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
128/* 128/*
129 * Walk the inode alias list, and find a dentry which has a given 129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it 130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking 131 * is looking for a dentry_lock reference. The downconvert thread is
132 * to unhash aliases, so we allow it to skip any that already have 132 * looking to unhash aliases, so we allow it to skip any that already
133 * that property. 133 * have that property.
134 */ 134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode, 135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno, 136 u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
266 dl->dl_count = 0; 266 dl->dl_count = 0;
267 /* 267 /*
268 * Does this have to happen below, for all attaches, in case 268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes? 269 * the struct inode gets blown away by the downconvert thread?
270 */ 270 */
271 dl->dl_inode = igrab(inode); 271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno; 272 dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4a..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
846 mlog_entry("dirino=%llu\n", 846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno); 847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848 848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 849 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) { 850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime 851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention 852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */ 853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1); 854 ocfs2_inode_unlock(inode, 1);
855 lock_level = 0; 855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0); 856 error = ocfs2_inode_lock(inode, NULL, 0);
857 } 857 }
858 if (error < 0) { 858 if (error < 0) {
859 if (error != -ENOENT) 859 if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL); 866 dirent, filldir, NULL);
867 867
868 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_inode_unlock(inode, lock_level);
869 869
870bail_nolock: 870bail_nolock:
871 mlog_exit(error); 871 mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmfsver.h" 29#include "dlmfsver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2270 } 2270 }
2271 } 2271 }
2272 2272
2273 /* Clean up join state on node death. */
2274 if (dlm->joining_node == idx) {
2275 mlog(0, "Clearing join state for node %u\n", idx);
2276 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2277 }
2278
2273 /* check to see if the node is already considered dead */ 2279 /* check to see if the node is already considered dead */
2274 if (!test_bit(idx, dlm->live_nodes_map)) { 2280 if (!test_bit(idx, dlm->live_nodes_map)) {
2275 mlog(0, "for domain %s, node %d is already dead. " 2281 mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2288 2294
2289 clear_bit(idx, dlm->live_nodes_map); 2295 clear_bit(idx, dlm->live_nodes_map);
2290 2296
2291 /* Clean up join state on node death. */
2292 if (dlm->joining_node == idx) {
2293 mlog(0, "Clearing join state for node %u\n", idx);
2294 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2295 }
2296
2297 /* make sure local cleanup occurs before the heartbeat events */ 2297 /* make sure local cleanup occurs before the heartbeat events */
2298 if (!test_bit(idx, dlm->recovery_map)) 2298 if (!test_bit(idx, dlm->recovery_map))
2299 dlm_do_local_recovery_cleanup(dlm, idx); 2299 dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2321 if (!dlm_grab(dlm)) 2321 if (!dlm_grab(dlm))
2322 return; 2322 return;
2323 2323
2324 /*
2325 * This will notify any dlm users that a node in our domain
2326 * went away without notifying us first.
2327 */
2328 if (test_bit(idx, dlm->domain_map))
2329 dlm_fire_domain_eviction_callbacks(dlm, idx);
2330
2324 spin_lock(&dlm->spinlock); 2331 spin_lock(&dlm->spinlock);
2325 __dlm_hb_node_down(dlm, idx); 2332 __dlm_hb_node_down(dlm, idx);
2326 spin_unlock(&dlm->spinlock); 2333 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmver.h" 29#include "dlmver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8f..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1089 mlog_entry_void(); 1141 mlog_entry_void();
1090 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1091 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1092 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1093 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1094 mlog_exit_void(); 1146 mlog_exit_void();
1095} 1147}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1147 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1148 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1149 */ 1201 */
1150 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1151 if (ret) {
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1157 if (ret) { 1203 if (ret) {
1158 mlog_errno(ret); 1204 mlog_errno(ret);
1159 goto bail; 1205 goto bail;
@@ -1311,76 +1357,221 @@ out:
1311 mlog_exit_void(); 1357 mlog_exit_void();
1312} 1358}
1313 1359
1314int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1315 int write, 1361 int level)
1316 int arg_flags)
1317{ 1362{
1318 int status = 0, level; 1363 int ret;
1319 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1321 1367
1322 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1323 1369
1324 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1325 1385
1326 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1327 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1328 write ? "EXMODE" : "PRMODE"); 1388 }
1329 1389
1330 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1331 * rodevices. */ 1391 /*
1332 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1333 if (write) { 1393 * point to restarting the syscall.
1334 status = -EROFS; 1394 */
1335 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
1411 * seperate path to the "low-level" dlm calls. In particular:
1412 *
1413 * - No optimization of lock levels is done - we take at exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
1417 * no-lock at unlock time. This also means flock locks never go on
1418 * the blocking list).
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving applications flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1336 } 1462 }
1337 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1338 } 1470 }
1339 1471
1340 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1341 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1342 1476
1343 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1344 1479
1345 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1346 1490
1347 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1348 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1349 if (status < 0 && status != -EAGAIN) 1493 goto out;
1350 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
1511 * to just bubble sucess back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1351 1515
1352out: 1516out:
1353 mlog_exit(status); 1517
1354 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1355} 1521}
1356 1522
1357/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1358int ocfs2_data_lock_with_page(struct inode *inode,
1359 int write,
1360 struct page *page)
1361{ 1524{
1362 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1363 1531
1364 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1365 if (ret == -EAGAIN) { 1533
1366 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1367 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1368 ocfs2_data_unlock(inode, write); 1536
1369 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1370 } 1559 }
1371 1560
1372 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1373} 1564}
1374 1565
1375static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1376 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1377{ 1568{
1378 int kick = 0; 1569 int kick = 0;
1379 1570
1380 mlog_entry_void(); 1571 mlog_entry_void();
1381 1572
1382 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1383 * the vote thread * pre-emptively when we reach a release 1574 * the downconvert thread * pre-emptively when we reach a release
1384 * condition. */ 1575 * condition. */
1385 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1386 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1398 } 1589 }
1399 1590
1400 if (kick) 1591 if (kick)
1401 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1402
1403 mlog_exit_void();
1404}
1405
1406void ocfs2_data_unlock(struct inode *inode,
1407 int write)
1408{
1409 int level = write ? LKM_EXMODE : LKM_PRMODE;
1410 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1411 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1412
1413 mlog_entry_void();
1414
1415 mlog(0, "inode %llu drop %s DATA lock\n",
1416 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1417 write ? "EXMODE" : "PRMODE");
1418
1419 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1420 !ocfs2_mount_local(osb))
1421 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1422 1593
1423 mlog_exit_void(); 1594 mlog_exit_void();
1424} 1595}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1442 1613
1443/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1444 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1445 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1446static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1447{ 1618{
1448 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1449 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1450 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1451 1622
1452 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1496static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1497{ 1668{
1498 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1499 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1500 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1501 1672
1502 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1604} 1775}
1605 1776
1606/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1607static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1608 struct buffer_head **bh) 1779 struct buffer_head **bh)
1609{ 1780{
1610 int status = 0; 1781 int status = 0;
1611 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1612 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1613 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1615 1786
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1721 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1722 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1723 */ 1894 */
1724int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1725 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1726 int ex, 1897 int ex,
1727 int arg_flags) 1898 int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1756 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1757 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1758 1929
1759 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1760 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1761 dlm_flags = 0; 1932 dlm_flags = 0;
1762 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
1795 } 1966 }
1796 1967
1797 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1798 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1799 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1800 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1801 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1802 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1803 if (status < 0) { 1974 if (status < 0) {
1804 if (status != -ENOENT) 1975 if (status != -ENOENT)
1805 mlog_errno(status); 1976 mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
1821 *ret_bh = NULL; 1992 *ret_bh = NULL;
1822 } 1993 }
1823 if (acquired) 1994 if (acquired)
1824 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1825 } 1996 }
1826 1997
1827 if (local_bh) 1998 if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
1832} 2003}
1833 2004
1834/* 2005/*
1835 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1836 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1837 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1838 * 2009 *
1839 * ** These _with_page variantes are only intended to be called from aop 2010 * ** These _with_page variantes are only intended to be called from aop
1840 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1841 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1842 * 2013 *
1843 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1844 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1845 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1846 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1847 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1848 * 2020 *
1849 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1850 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
1852 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1853 * the lock inversion simply. 2025 * the lock inversion simply.
1854 */ 2026 */
1855int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1856 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1857 int ex, 2029 int ex,
1858 struct page *page) 2030 struct page *page)
1859{ 2031{
1860 int ret; 2032 int ret;
1861 2033
1862 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1863 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1864 unlock_page(page); 2036 unlock_page(page);
1865 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1866 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1867 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1868 } 2040 }
1869 2041
1870 return ret; 2042 return ret;
1871} 2043}
1872 2044
1873int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1874 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1875 int *level) 2047 int *level)
1876{ 2048{
1877 int ret; 2049 int ret;
1878 2050
1879 mlog_entry_void(); 2051 mlog_entry_void();
1880 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1881 if (ret < 0) { 2053 if (ret < 0) {
1882 mlog_errno(ret); 2054 mlog_errno(ret);
1883 return ret; 2055 return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1890 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1891 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1892 2064
1893 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1894 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1895 if (ret < 0) { 2067 if (ret < 0) {
1896 mlog_errno(ret); 2068 mlog_errno(ret);
1897 return ret; 2069 return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1908 return ret; 2080 return ret;
1909} 2081}
1910 2082
1911void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1912 int ex) 2084 int ex)
1913{ 2085{
1914 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1915 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1916 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1917 2089
1918 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2320 goto bail; 2492 goto bail;
2321 } 2493 }
2322 2494
2323 /* launch vote thread */ 2495 /* launch downconvert thread */
2324 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2325 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2326 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2327 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2328 mlog_errno(status); 2500 mlog_errno(status);
2329 goto bail; 2501 goto bail;
2330 } 2502 }
@@ -2353,8 +2525,8 @@ local:
2353bail: 2525bail:
2354 if (status < 0) { 2526 if (status < 0) {
2355 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2356 if (osb->vote_task) 2528 if (osb->dc_task)
2357 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2358 } 2530 }
2359 2531
2360 mlog_exit(status); 2532 mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2369 2541
2370 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2371 2543
2372 if (osb->vote_task) { 2544 if (osb->dc_task) {
2373 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2374 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2375 } 2547 }
2376 2548
2377 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
2527 2699
2528/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2529 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2530 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2531 * it safe to drop. 2703 * it safe to drop.
2532 * 2704 *
2533 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2590 status = err; 2762 status = err;
2591 2763
2592 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2593 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2594 if (err < 0)
2595 mlog_errno(err);
2596 if (err < 0 && !status)
2597 status = err;
2598
2599 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2600 &OCFS2_I(inode)->ip_meta_lockres);
2601 if (err < 0) 2766 if (err < 0)
2602 mlog_errno(err); 2767 mlog_errno(err);
2603 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2850 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2851 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2852 3017
3018 if (S_ISREG(inode->i_mode))
3019 goto out;
3020
2853 /* 3021 /*
2854 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2855 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2875 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2876 } 3044 }
2877 3045
3046out:
2878 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2879} 3048}
2880 3049
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2903 3072
2904/* 3073/*
2905 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2906 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2907 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2908 */ 3077 */
2909static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3042 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3043 3212
3044 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3045 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3046 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3047 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3048 * performance. */ 3217 * performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3091 3260
3092 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3093 3262
3094 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3095 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3096 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3097 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3098 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3099 } 3268 }
3100 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3270
3271 mlog_exit_void();
3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3101 3304
3102 mlog_exit_void(); 3305 mlog_exit_void();
3103} 3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
3358 /* make sure the voting thread gets a swipe at whatever changes
3359 * the caller may have made to the voting state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 53/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 54#define OCFS2_META_LOCK_RECOVERY (0x01)
55/* Instruct the dlm not to queue ourselves on the other node. */ 55/* Instruct the dlm not to queue ourselves on the other node. */
56#define OCFS2_META_LOCK_NOQUEUE (0x02) 56#define OCFS2_META_LOCK_NOQUEUE (0x02)
57/* don't block waiting for the vote thread, instead return -EAGAIN */ 57/* don't block waiting for the downconvert thread, instead return -EAGAIN */
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
66 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, 67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode); 68 u64 parent, struct inode *inode);
69struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp);
69void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 72void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
70int ocfs2_create_new_inode_locks(struct inode *inode); 73int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_drop_inode_locks(struct inode *inode); 74int ocfs2_drop_inode_locks(struct inode *inode);
72int ocfs2_data_lock_full(struct inode *inode,
73 int write,
74 int arg_flags);
75#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
76int ocfs2_data_lock_with_page(struct inode *inode,
77 int write,
78 struct page *page);
79void ocfs2_data_unlock(struct inode *inode,
80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 75int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 76void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode); 77int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write); 78int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode); 79void ocfs2_open_unlock(struct inode *inode);
86int ocfs2_meta_lock_atime(struct inode *inode, 80int ocfs2_inode_lock_atime(struct inode *inode,
87 struct vfsmount *vfsmnt, 81 struct vfsmount *vfsmnt,
88 int *level); 82 int *level);
89int ocfs2_meta_lock_full(struct inode *inode, 83int ocfs2_inode_lock_full(struct inode *inode,
90 struct buffer_head **ret_bh, 84 struct buffer_head **ret_bh,
91 int ex, 85 int ex,
92 int arg_flags); 86 int arg_flags);
93int ocfs2_meta_lock_with_page(struct inode *inode, 87int ocfs2_inode_lock_with_page(struct inode *inode,
94 struct buffer_head **ret_bh, 88 struct buffer_head **ret_bh,
95 int ex, 89 int ex,
96 struct page *page); 90 struct page *page);
97/* 99% of the time we don't want to supply any additional flags -- 91/* 99% of the time we don't want to supply any additional flags --
98 * those are for very specific cases only. */ 92 * those are for very specific cases only. */
99#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) 93#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
100void ocfs2_meta_unlock(struct inode *inode, 94void ocfs2_inode_unlock(struct inode *inode,
101 int ex); 95 int ex);
102int ocfs2_super_lock(struct ocfs2_super *osb, 96int ocfs2_super_lock(struct ocfs2_super *osb,
103 int ex); 97 int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
107void ocfs2_rename_unlock(struct ocfs2_super *osb); 101void ocfs2_rename_unlock(struct ocfs2_super *osb);
108int ocfs2_dentry_lock(struct dentry *dentry, int ex); 102int ocfs2_dentry_lock(struct dentry *dentry, int ex);
109void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 103void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file);
110 106
111void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
112void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
113 struct ocfs2_lock_res *lockres); 109 struct ocfs2_lock_res *lockres);
114 110
115/* for the vote thread */ 111/* for the downconvert thread */
116void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 112void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
117 struct ocfs2_lock_res *lockres); 113 struct ocfs2_lock_res *lockres);
114void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
118 115
119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
37 *var = cpu_to_le64(le64_to_cpu(*var) + val); 37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38} 38}
39 39
40static inline void le32_and_cpu(__le32 *var, u32 val)
41{
42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
43}
44
45static inline void be32_add_cpu(__be32 *var, u32 val) 40static inline void be32_add_cpu(__be32 *var, u32 val)
46{ 41{
47 *var = cpu_to_be32(be32_to_cpu(*var) + val); 42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
58 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
59 } 59 }
60 60
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
62 62
63 if (IS_ERR(inode)) 63 if (IS_ERR(inode))
64 return (void *)inode; 64 return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
95 mlog(0, "find parent of directory %llu\n", 95 mlog(0, "find parent of directory %llu\n",
96 (unsigned long long)OCFS2_I(dir)->ip_blkno); 96 (unsigned long long)OCFS2_I(dir)->ip_blkno);
97 97
98 status = ocfs2_meta_lock(dir, NULL, 0); 98 status = ocfs2_inode_lock(dir, NULL, 0);
99 if (status < 0) { 99 if (status < 0) {
100 if (status != -ENOENT) 100 if (status != -ENOENT)
101 mlog_errno(status); 101 mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
109 goto bail_unlock; 109 goto bail_unlock;
110 } 110 }
111 111
112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
113 if (IS_ERR(inode)) { 113 if (IS_ERR(inode)) {
114 mlog(ML_ERROR, "Unable to create inode %llu\n", 114 mlog(ML_ERROR, "Unable to create inode %llu\n",
115 (unsigned long long)blkno); 115 (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
126 parent->d_op = &ocfs2_dentry_ops; 126 parent->d_op = &ocfs2_dentry_ops;
127 127
128bail_unlock: 128bail_unlock:
129 ocfs2_meta_unlock(dir, 0); 129 ocfs2_inode_unlock(dir, 0);
130 130
131bail: 131bail:
132 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e42..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
51#include "inode.h" 51#include "inode.h"
52#include "ioctl.h" 52#include "ioctl.h"
53#include "journal.h" 53#include "journal.h"
54#include "locks.h"
54#include "mmap.h" 55#include "mmap.h"
55#include "suballoc.h" 56#include "suballoc.h"
56#include "super.h" 57#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
63 return sync_mapping_buffers(inode->i_mapping); 64 return sync_mapping_buffers(inode->i_mapping);
64} 65}
65 66
67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68{
69 struct ocfs2_file_private *fp;
70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp)
73 return -ENOMEM;
74
75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp;
79
80 return 0;
81}
82
83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84{
85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87
88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp);
92 file->private_data = NULL;
93 }
94}
95
66static int ocfs2_file_open(struct inode *inode, struct file *file) 96static int ocfs2_file_open(struct inode *inode, struct file *file)
67{ 97{
68 int status; 98 int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
89 119
90 oi->ip_open_count++; 120 oi->ip_open_count++;
91 spin_unlock(&oi->ip_lock); 121 spin_unlock(&oi->ip_lock);
92 status = 0; 122
123 status = ocfs2_init_file_private(inode, file);
124 if (status) {
125 /*
126 * We want to set open count back if we're failing the
127 * open.
128 */
129 spin_lock(&oi->ip_lock);
130 oi->ip_open_count--;
131 spin_unlock(&oi->ip_lock);
132 }
133
93leave: 134leave:
94 mlog_exit(status); 135 mlog_exit(status);
95 return status; 136 return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
109 spin_unlock(&oi->ip_lock); 150 spin_unlock(&oi->ip_lock);
110 151
152 ocfs2_free_file_private(inode, file);
153
111 mlog_exit(0); 154 mlog_exit(0);
112 155
113 return 0; 156 return 0;
114} 157}
115 158
159static int ocfs2_dir_open(struct inode *inode, struct file *file)
160{
161 return ocfs2_init_file_private(inode, file);
162}
163
164static int ocfs2_dir_release(struct inode *inode, struct file *file)
165{
166 ocfs2_free_file_private(inode, file);
167 return 0;
168}
169
116static int ocfs2_sync_file(struct file *file, 170static int ocfs2_sync_file(struct file *file,
117 struct dentry *dentry, 171 struct dentry *dentry,
118 int datasync) 172 int datasync)
@@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
382 436
383 down_write(&OCFS2_I(inode)->ip_alloc_sem); 437 down_write(&OCFS2_I(inode)->ip_alloc_sem);
384 438
385 /* This forces other nodes to sync and drop their pages. Do 439 /*
386 * this even if we have a truncate without allocation change - 440 * The inode lock forced other nodes to sync and drop their
387 * ocfs2 cluster sizes can be much greater than page size, so 441 * pages, which (correctly) happens even if we have a truncate
388 * we have to truncate them anyway. */ 442 * without allocation change - ocfs2 cluster sizes can be much
389 status = ocfs2_data_lock(inode, 1); 443 * greater than page size, so we have to truncate them
390 if (status < 0) { 444 * anyway.
391 up_write(&OCFS2_I(inode)->ip_alloc_sem); 445 */
392
393 mlog_errno(status);
394 goto bail;
395 }
396
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 446 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 447 truncate_inode_pages(inode->i_mapping, new_i_size);
399 448
@@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
403 if (status) 452 if (status)
404 mlog_errno(status); 453 mlog_errno(status);
405 454
406 goto bail_unlock_data; 455 goto bail_unlock_sem;
407 } 456 }
408 457
409 /* alright, we're going to need to do a full blown alloc size 458 /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
413 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 462 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
414 if (status < 0) { 463 if (status < 0) {
415 mlog_errno(status); 464 mlog_errno(status);
416 goto bail_unlock_data; 465 goto bail_unlock_sem;
417 } 466 }
418 467
419 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 468 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
420 if (status < 0) { 469 if (status < 0) {
421 mlog_errno(status); 470 mlog_errno(status);
422 goto bail_unlock_data; 471 goto bail_unlock_sem;
423 } 472 }
424 473
425 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 474 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
426 if (status < 0) { 475 if (status < 0) {
427 mlog_errno(status); 476 mlog_errno(status);
428 goto bail_unlock_data; 477 goto bail_unlock_sem;
429 } 478 }
430 479
431 /* TODO: orphan dir cleanup here. */ 480 /* TODO: orphan dir cleanup here. */
432bail_unlock_data: 481bail_unlock_sem:
433 ocfs2_data_unlock(inode, 1);
434
435 up_write(&OCFS2_I(inode)->ip_alloc_sem); 482 up_write(&OCFS2_I(inode)->ip_alloc_sem);
436 483
437bail: 484bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
579 626
580 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
581 "clusters_to_add = %u, extents_to_split = %u\n", 628 "clusters_to_add = %u, extents_to_split = %u\n",
582 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
583 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
584 631
585 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
760 le32_to_cpu(fe->i_clusters), 807 le32_to_cpu(fe->i_clusters),
761 (unsigned long long)le64_to_cpu(fe->i_size)); 808 (unsigned long long)le64_to_cpu(fe->i_size));
762 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 809 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
763 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 810 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
764 811
765leave: 812leave:
766 if (handle) { 813 if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
917 struct buffer_head *di_bh, 964 struct buffer_head *di_bh,
918 u64 new_i_size) 965 u64 new_i_size)
919{ 966{
920 int ret = 0, data_locked = 0; 967 int ret = 0;
921 struct ocfs2_inode_info *oi = OCFS2_I(inode); 968 struct ocfs2_inode_info *oi = OCFS2_I(inode);
922 969
923 BUG_ON(!di_bh); 970 BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 990 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
944 goto out_update_size; 991 goto out_update_size;
945 992
946 /*
947 * protect the pages that ocfs2_zero_extend is going to be
948 * pulling into the page cache.. we do this before the
949 * metadata extend so that we don't get into the situation
950 * where we've extended the metadata but can't get the data
951 * lock to zero.
952 */
953 ret = ocfs2_data_lock(inode, 1);
954 if (ret < 0) {
955 mlog_errno(ret);
956 goto out;
957 }
958 data_locked = 1;
959
960 /* 993 /*
961 * The alloc sem blocks people in read/write from reading our 994 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on 995 * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
980 up_write(&oi->ip_alloc_sem); 1013 up_write(&oi->ip_alloc_sem);
981 1014
982 mlog_errno(ret); 1015 mlog_errno(ret);
983 goto out_unlock; 1016 goto out;
984 } 1017 }
985 } 1018 }
986 1019
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
991 1024
992 if (ret < 0) { 1025 if (ret < 0) {
993 mlog_errno(ret); 1026 mlog_errno(ret);
994 goto out_unlock; 1027 goto out;
995 } 1028 }
996 1029
997out_update_size: 1030out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
999 if (ret < 0) 1032 if (ret < 0)
1000 mlog_errno(ret); 1033 mlog_errno(ret);
1001 1034
1002out_unlock:
1003 if (data_locked)
1004 ocfs2_data_unlock(inode, 1);
1005
1006out: 1035out:
1007 return ret; 1036 return ret;
1008} 1037}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1050 } 1079 }
1051 } 1080 }
1052 1081
1053 status = ocfs2_meta_lock(inode, &bh, 1); 1082 status = ocfs2_inode_lock(inode, &bh, 1);
1054 if (status < 0) { 1083 if (status < 0) {
1055 if (status != -ENOENT) 1084 if (status != -ENOENT)
1056 mlog_errno(status); 1085 mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1102bail_commit: 1131bail_commit:
1103 ocfs2_commit_trans(osb, handle); 1132 ocfs2_commit_trans(osb, handle);
1104bail_unlock: 1133bail_unlock:
1105 ocfs2_meta_unlock(inode, 1); 1134 ocfs2_inode_unlock(inode, 1);
1106bail_unlock_rw: 1135bail_unlock_rw:
1107 if (size_change) 1136 if (size_change)
1108 ocfs2_rw_unlock(inode, 1); 1137 ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1149 1178
1150 mlog_entry_void(); 1179 mlog_entry_void();
1151 1180
1152 ret = ocfs2_meta_lock(inode, NULL, 0); 1181 ret = ocfs2_inode_lock(inode, NULL, 0);
1153 if (ret) { 1182 if (ret) {
1154 if (ret != -ENOENT) 1183 if (ret != -ENOENT)
1155 mlog_errno(ret); 1184 mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1158 1187
1159 ret = generic_permission(inode, mask, NULL); 1188 ret = generic_permission(inode, mask, NULL);
1160 1189
1161 ocfs2_meta_unlock(inode, 0); 1190 ocfs2_inode_unlock(inode, 0);
1162out: 1191out:
1163 mlog_exit(ret); 1192 mlog_exit(ret);
1164 return ret; 1193 return ret;
@@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1630 goto out; 1659 goto out;
1631 } 1660 }
1632 1661
1633 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1662 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1634 if (ret) { 1663 if (ret) {
1635 mlog_errno(ret); 1664 mlog_errno(ret);
1636 goto out_rw_unlock; 1665 goto out_rw_unlock;
@@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1638 1667
1639 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1668 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1640 ret = -EPERM; 1669 ret = -EPERM;
1641 goto out_meta_unlock; 1670 goto out_inode_unlock;
1642 } 1671 }
1643 1672
1644 switch (sr->l_whence) { 1673 switch (sr->l_whence) {
@@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1652 break; 1681 break;
1653 default: 1682 default:
1654 ret = -EINVAL; 1683 ret = -EINVAL;
1655 goto out_meta_unlock; 1684 goto out_inode_unlock;
1656 } 1685 }
1657 sr->l_whence = 0; 1686 sr->l_whence = 0;
1658 1687
@@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1663 || (sr->l_start + llen) < 0 1692 || (sr->l_start + llen) < 0
1664 || (sr->l_start + llen) > max_off) { 1693 || (sr->l_start + llen) > max_off) {
1665 ret = -EINVAL; 1694 ret = -EINVAL;
1666 goto out_meta_unlock; 1695 goto out_inode_unlock;
1667 } 1696 }
1668 size = sr->l_start + sr->l_len; 1697 size = sr->l_start + sr->l_len;
1669 1698
1670 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1699 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1671 if (sr->l_len <= 0) { 1700 if (sr->l_len <= 0) {
1672 ret = -EINVAL; 1701 ret = -EINVAL;
1673 goto out_meta_unlock; 1702 goto out_inode_unlock;
1674 } 1703 }
1675 } 1704 }
1676 1705
@@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1678 ret = __ocfs2_write_remove_suid(inode, di_bh); 1707 ret = __ocfs2_write_remove_suid(inode, di_bh);
1679 if (ret) { 1708 if (ret) {
1680 mlog_errno(ret); 1709 mlog_errno(ret);
1681 goto out_meta_unlock; 1710 goto out_inode_unlock;
1682 } 1711 }
1683 } 1712 }
1684 1713
@@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1704 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1733 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1705 if (ret) { 1734 if (ret) {
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out_meta_unlock; 1736 goto out_inode_unlock;
1708 } 1737 }
1709 1738
1710 /* 1739 /*
@@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1714 if (IS_ERR(handle)) { 1743 if (IS_ERR(handle)) {
1715 ret = PTR_ERR(handle); 1744 ret = PTR_ERR(handle);
1716 mlog_errno(ret); 1745 mlog_errno(ret);
1717 goto out_meta_unlock; 1746 goto out_inode_unlock;
1718 } 1747 }
1719 1748
1720 if (change_size && i_size_read(inode) < size) 1749 if (change_size && i_size_read(inode) < size)
@@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1727 1756
1728 ocfs2_commit_trans(osb, handle); 1757 ocfs2_commit_trans(osb, handle);
1729 1758
1730out_meta_unlock: 1759out_inode_unlock:
1731 brelse(di_bh); 1760 brelse(di_bh);
1732 ocfs2_meta_unlock(inode, 1); 1761 ocfs2_inode_unlock(inode, 1);
1733out_rw_unlock: 1762out_rw_unlock:
1734 ocfs2_rw_unlock(inode, 1); 1763 ocfs2_rw_unlock(inode, 1);
1735 1764
@@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1799 * if we need to make modifications here. 1828 * if we need to make modifications here.
1800 */ 1829 */
1801 for(;;) { 1830 for(;;) {
1802 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1831 ret = ocfs2_inode_lock(inode, NULL, meta_level);
1803 if (ret < 0) { 1832 if (ret < 0) {
1804 meta_level = -1; 1833 meta_level = -1;
1805 mlog_errno(ret); 1834 mlog_errno(ret);
@@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1817 * set inode->i_size at the end of a write. */ 1846 * set inode->i_size at the end of a write. */
1818 if (should_remove_suid(dentry)) { 1847 if (should_remove_suid(dentry)) {
1819 if (meta_level == 0) { 1848 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level); 1849 ocfs2_inode_unlock(inode, meta_level);
1821 meta_level = 1; 1850 meta_level = 1;
1822 continue; 1851 continue;
1823 } 1852 }
@@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1886 *ppos = saved_pos; 1915 *ppos = saved_pos;
1887 1916
1888out_unlock: 1917out_unlock:
1889 ocfs2_meta_unlock(inode, meta_level); 1918 ocfs2_inode_unlock(inode, meta_level);
1890 1919
1891out: 1920out:
1892 return ret; 1921 return ret;
@@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2099 /* 2128 /*
2100 * See the comment in ocfs2_file_aio_read() 2129 * See the comment in ocfs2_file_aio_read()
2101 */ 2130 */
2102 ret = ocfs2_meta_lock(inode, NULL, 0); 2131 ret = ocfs2_inode_lock(inode, NULL, 0);
2103 if (ret < 0) { 2132 if (ret < 0) {
2104 mlog_errno(ret); 2133 mlog_errno(ret);
2105 goto bail; 2134 goto bail;
2106 } 2135 }
2107 ocfs2_meta_unlock(inode, 0); 2136 ocfs2_inode_unlock(inode, 0);
2108 2137
2109 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2138 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2110 2139
@@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2160 * like i_size. This allows the checks down below 2189 * like i_size. This allows the checks down below
2161 * generic_file_aio_read() a chance of actually working. 2190 * generic_file_aio_read() a chance of actually working.
2162 */ 2191 */
2163 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2192 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2164 if (ret < 0) { 2193 if (ret < 0) {
2165 mlog_errno(ret); 2194 mlog_errno(ret);
2166 goto bail; 2195 goto bail;
2167 } 2196 }
2168 ocfs2_meta_unlock(inode, lock_level); 2197 ocfs2_inode_unlock(inode, lock_level);
2169 2198
2170 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2199 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2171 if (ret == -EINVAL) 2200 if (ret == -EINVAL)
@@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2204}; 2233};
2205 2234
2206const struct file_operations ocfs2_fops = { 2235const struct file_operations ocfs2_fops = {
2236 .llseek = generic_file_llseek,
2207 .read = do_sync_read, 2237 .read = do_sync_read,
2208 .write = do_sync_write, 2238 .write = do_sync_write,
2209 .mmap = ocfs2_mmap, 2239 .mmap = ocfs2_mmap,
@@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
2216#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2217 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2218#endif 2248#endif
2249 .flock = ocfs2_flock,
2219 .splice_read = ocfs2_file_splice_read, 2250 .splice_read = ocfs2_file_splice_read,
2220 .splice_write = ocfs2_file_splice_write, 2251 .splice_write = ocfs2_file_splice_write,
2221}; 2252};
2222 2253
2223const struct file_operations ocfs2_dops = { 2254const struct file_operations ocfs2_dops = {
2255 .llseek = generic_file_llseek,
2224 .read = generic_read_dir, 2256 .read = generic_read_dir,
2225 .readdir = ocfs2_readdir, 2257 .readdir = ocfs2_readdir,
2226 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open,
2227 .ioctl = ocfs2_ioctl, 2261 .ioctl = ocfs2_ioctl,
2228#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2229 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2230#endif 2264#endif
2265 .flock = ocfs2_flock,
2231}; 2266};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 32extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 33struct ocfs2_alloc_context;
34 34
35struct ocfs2_file_private {
36 struct file *fp_file;
37 struct mutex fp_mutex;
38 struct ocfs2_lock_res fp_flock;
39};
40
35enum ocfs2_alloc_restarted { 41enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0, 42 RESTART_NONE = 0,
37 RESTART_TRANS, 43 RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h> 31#include <linux/kmod.h>
32 32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h> 33#include <dlm/dlmapi.h>
37 34
38#define MLOG_MASK_PREFIX ML_SUPER 35#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
44#include "heartbeat.h" 41#include "heartbeat.h"
45#include "inode.h" 42#include "inode.h"
46#include "journal.h" 43#include "journal.h"
47#include "vote.h"
48 44
49#include "buffer_head_io.h" 45#include "buffer_head_io.h"
50 46
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 47static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit); 48 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
64void ocfs2_init_node_maps(struct ocfs2_super *osb) 57void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{ 58{
66 spin_lock_init(&osb->node_map_lock); 59 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 60 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
71} 62}
72 63
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
87 return; 78 return;
88 } 79 }
89 80
90 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
91 /* If a node is in the umount map, then we've been
92 * expecting him to go down and we know ahead of time
93 * that recovery is not necessary. */
94 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
95 return;
96 }
97
98 ocfs2_recovery_thread(osb, node_num); 81 ocfs2_recovery_thread(osb, node_num);
99
100 ocfs2_remove_node_from_vote_queues(osb, node_num);
101}
102
103static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
104 int node_num,
105 void *data)
106{
107 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
108} 82}
109 83
110/* Called from the dlm when it's about to evict a node. We may also 84/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
121 ocfs2_do_node_down(node_num, osb); 95 ocfs2_do_node_down(node_num, osb);
122} 96}
123 97
124static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
125 int node_num,
126 void *data)
127{
128 struct ocfs2_super *osb = data;
129
130 BUG_ON(osb->node_num == node_num);
131
132 mlog(0, "node up event for %d\n", node_num);
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) 98void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
137{ 99{
138 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
139 ocfs2_hb_node_down_cb, osb,
140 OCFS2_HB_NODE_DOWN_PRI);
141
142 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
143 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
144
145 /* Not exactly a heartbeat callback, but leads to essentially 100 /* Not exactly a heartbeat callback, but leads to essentially
146 * the same path so we set it up here. */ 101 * the same path so we set it up here. */
147 dlm_setup_eviction_cb(&osb->osb_eviction_cb, 102 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
149 osb); 104 osb);
150} 105}
151 106
152/* Most functions here are just stubs for now... */
153int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
154{
155 int status;
156
157 if (ocfs2_mount_local(osb))
158 return 0;
159
160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) {
162 mlog_errno(status);
163 goto bail;
164 }
165
166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) {
168 mlog_errno(status);
169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 }
171
172bail:
173 return status;
174}
175
176void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
177{
178 if (ocfs2_mount_local(osb))
179 return;
180
181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183}
184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 107void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
186{ 108{
187 int ret; 109 int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
341 263
342 spin_lock(&osb->node_map_lock); 264 spin_lock(&osb->node_map_lock);
343 265
344 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
345
346 if (!test_bit(num, osb->recovery_map.map)) { 266 if (!test_bit(num, osb->recovery_map.map)) {
347 __ocfs2_node_map_set_bit(&osb->recovery_map, num); 267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
348 set = 1; 268 set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb); 32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35 33
36/* node map functions - used to keep track of mounted and in-recovery 34/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f35..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "vote.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
55 54
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
58 u64 fi_blkno; 57 u64 fi_blkno;
59 unsigned long fi_ino; 58 unsigned long fi_ino;
60 unsigned int fi_flags; 59 unsigned int fi_flags;
60 unsigned int fi_sysfile_type;
61}; 61};
62 62
63static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
64
63static int ocfs2_read_locked_inode(struct inode *inode, 65static int ocfs2_read_locked_inode(struct inode *inode,
64 struct ocfs2_find_inode_args *args); 66 struct ocfs2_find_inode_args *args);
65static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
107 oi->ip_attr |= OCFS2_DIRSYNC_FL; 109 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 110}
109 111
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
113 int sysfile_type)
111{ 114{
112 struct inode *inode = NULL; 115 struct inode *inode = NULL;
113 struct super_block *sb = osb->sb; 116 struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
127 args.fi_blkno = blkno; 130 args.fi_blkno = blkno;
128 args.fi_flags = flags; 131 args.fi_flags = flags;
129 args.fi_ino = ino_from_blkno(sb, blkno); 132 args.fi_ino = ino_from_blkno(sb, blkno);
133 args.fi_sysfile_type = sysfile_type;
130 134
131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 135 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
132 ocfs2_init_locked_inode, &args); 136 ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
201 205
202 inode->i_ino = args->fi_ino; 206 inode->i_ino = args->fi_ino;
203 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 207 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
208 if (args->fi_sysfile_type != 0)
209 lockdep_set_class(&inode->i_mutex,
210 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
204 211
205 mlog_exit(0); 212 mlog_exit(0);
206 return 0; 213 return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
322 */ 329 */
323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 330 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
324 331
325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
326 OCFS2_LOCK_TYPE_META, 0, inode); 333 OCFS2_LOCK_TYPE_META, 0, inode);
327 334
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 335 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
333 OCFS2_LOCK_TYPE_RW, inode->i_generation, 340 OCFS2_LOCK_TYPE_RW, inode->i_generation,
334 inode); 341 inode);
335 342
336 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
337 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
338 inode);
339
340 ocfs2_set_inode_flags(inode); 343 ocfs2_set_inode_flags(inode);
341 344
342 status = 0; 345 status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
414 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 417 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
415 generation = osb->fs_generation; 418 generation = osb->fs_generation;
416 419
417 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 420 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
418 OCFS2_LOCK_TYPE_META, 421 OCFS2_LOCK_TYPE_META,
419 generation, inode); 422 generation, inode);
420 423
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
429 mlog_errno(status); 432 mlog_errno(status);
430 return status; 433 return status;
431 } 434 }
432 status = ocfs2_meta_lock(inode, NULL, 0); 435 status = ocfs2_inode_lock(inode, NULL, 0);
433 if (status) { 436 if (status) {
434 make_bad_inode(inode); 437 make_bad_inode(inode);
435 mlog_errno(status); 438 mlog_errno(status);
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 487
485bail: 488bail:
486 if (can_lock) 489 if (can_lock)
487 ocfs2_meta_unlock(inode, 0); 490 ocfs2_inode_unlock(inode, 0);
488 491
489 if (status < 0) 492 if (status < 0)
490 make_bad_inode(inode); 493 make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
586 } 589 }
587 590
588 mutex_lock(&inode_alloc_inode->i_mutex); 591 mutex_lock(&inode_alloc_inode->i_mutex);
589 status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); 592 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
590 if (status < 0) { 593 if (status < 0) {
591 mutex_unlock(&inode_alloc_inode->i_mutex); 594 mutex_unlock(&inode_alloc_inode->i_mutex);
592 595
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
617 } 620 }
618 621
619 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 622 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
620 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 623 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
621 624
622 status = ocfs2_journal_dirty(handle, di_bh); 625 status = ocfs2_journal_dirty(handle, di_bh);
623 if (status < 0) { 626 if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
635bail_commit: 638bail_commit:
636 ocfs2_commit_trans(osb, handle); 639 ocfs2_commit_trans(osb, handle);
637bail_unlock: 640bail_unlock:
638 ocfs2_meta_unlock(inode_alloc_inode, 1); 641 ocfs2_inode_unlock(inode_alloc_inode, 1);
639 mutex_unlock(&inode_alloc_inode->i_mutex); 642 mutex_unlock(&inode_alloc_inode->i_mutex);
640 brelse(inode_alloc_bh); 643 brelse(inode_alloc_bh);
641bail: 644bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
709 * delete_inode operation. We do this now to avoid races with 712 * delete_inode operation. We do this now to avoid races with
710 * recovery completion on other nodes. */ 713 * recovery completion on other nodes. */
711 mutex_lock(&orphan_dir_inode->i_mutex); 714 mutex_lock(&orphan_dir_inode->i_mutex);
712 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 715 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
713 if (status < 0) { 716 if (status < 0) {
714 mutex_unlock(&orphan_dir_inode->i_mutex); 717 mutex_unlock(&orphan_dir_inode->i_mutex);
715 718
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
718 } 721 }
719 722
720 /* we do this while holding the orphan dir lock because we 723 /* we do this while holding the orphan dir lock because we
721 * don't want recovery being run from another node to vote for 724 * don't want recovery being run from another node to try an
722 * an inode delete on us -- this will result in two nodes 725 * inode delete underneath us -- this will result in two nodes
723 * truncating the same file! */ 726 * truncating the same file! */
724 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 727 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
725 if (status < 0) { 728 if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
733 mlog_errno(status); 736 mlog_errno(status);
734 737
735bail_unlock_dir: 738bail_unlock_dir:
736 ocfs2_meta_unlock(orphan_dir_inode, 1); 739 ocfs2_inode_unlock(orphan_dir_inode, 1);
737 mutex_unlock(&orphan_dir_inode->i_mutex); 740 mutex_unlock(&orphan_dir_inode->i_mutex);
738 brelse(orphan_dir_bh); 741 brelse(orphan_dir_bh);
739bail: 742bail:
@@ -744,7 +747,7 @@ bail:
744} 747}
745 748
746/* There is a series of simple checks that should be done before a 749/* There is a series of simple checks that should be done before a
747 * vote is even considered. Encapsulate those in this function. */ 750 * trylock is even considered. Encapsulate those in this function. */
748static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 751static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
749{ 752{
750 int ret = 0; 753 int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
758 goto bail; 761 goto bail;
759 } 762 }
760 763
761 /* If we're coming from process_vote we can't go into our own 764 /* If we're coming from downconvert_thread we can't go into our own
762 * voting [hello, deadlock city!], so unforuntately we just 765 * voting [hello, deadlock city!], so unforuntately we just
763 * have to skip deleting this guy. That's OK though because 766 * have to skip deleting this guy. That's OK though because
764 * the node who's doing the actual deleting should handle it 767 * the node who's doing the actual deleting should handle it
765 * anyway. */ 768 * anyway. */
766 if (current == osb->vote_task) { 769 if (current == osb->dc_task) {
767 mlog(0, "Skipping delete of %lu because we're currently " 770 mlog(0, "Skipping delete of %lu because we're currently "
768 "in process_vote\n", inode->i_ino); 771 "in downconvert\n", inode->i_ino);
769 goto bail; 772 goto bail;
770 } 773 }
771 774
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
779 goto bail_unlock; 782 goto bail_unlock;
780 } 783 }
781 784
782 /* If we have voted "yes" on the wipe of this inode for 785 /* If we have allowd wipe of this inode for another node, it
783 * another node, it will be marked here so we can safely skip 786 * will be marked here so we can safely skip it. Recovery will
784 * it. Recovery will cleanup any inodes we might inadvertantly 787 * cleanup any inodes we might inadvertantly skip here. */
785 * skip here. */
786 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 788 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
787 mlog(0, "Skipping delete of %lu because another node " 789 mlog(0, "Skipping delete of %lu because another node "
788 "has done this for us.\n", inode->i_ino); 790 "has done this for us.\n", inode->i_ino);
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
929 931
930 /* Lock down the inode. This gives us an up to date view of 932 /* Lock down the inode. This gives us an up to date view of
931 * it's metadata (for verification), and allows us to 933 * it's metadata (for verification), and allows us to
932 * serialize delete_inode votes. 934 * serialize delete_inode on multiple nodes.
933 * 935 *
934 * Even though we might be doing a truncate, we don't take the 936 * Even though we might be doing a truncate, we don't take the
935 * allocation lock here as it won't be needed - nobody will 937 * allocation lock here as it won't be needed - nobody will
936 * have the file open. 938 * have the file open.
937 */ 939 */
938 status = ocfs2_meta_lock(inode, &di_bh, 1); 940 status = ocfs2_inode_lock(inode, &di_bh, 1);
939 if (status < 0) { 941 if (status < 0) {
940 if (status != -ENOENT) 942 if (status != -ENOENT)
941 mlog_errno(status); 943 mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
947 * before we go ahead and wipe the inode. */ 949 * before we go ahead and wipe the inode. */
948 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 950 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
949 if (!wipe || status < 0) { 951 if (!wipe || status < 0) {
950 /* Error and inode busy vote both mean we won't be 952 /* Error and remote inode busy both mean we won't be
951 * removing the inode, so they take almost the same 953 * removing the inode, so they take almost the same
952 * path. */ 954 * path. */
953 if (status < 0) 955 if (status < 0)
954 mlog_errno(status); 956 mlog_errno(status);
955 957
956 /* Someone in the cluster has voted to not wipe this 958 /* Someone in the cluster has disallowed a wipe of
957 * inode, or it was never completely orphaned. Write 959 * this inode, or it was never completely
958 * out the pages and exit now. */ 960 * orphaned. Write out the pages and exit now. */
959 ocfs2_cleanup_delete_inode(inode, 1); 961 ocfs2_cleanup_delete_inode(inode, 1);
960 goto bail_unlock_inode; 962 goto bail_unlock_inode;
961 } 963 }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
981 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 983 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
982 984
983bail_unlock_inode: 985bail_unlock_inode:
984 ocfs2_meta_unlock(inode, 1); 986 ocfs2_inode_unlock(inode, 1);
985 brelse(di_bh); 987 brelse(di_bh);
986bail_unblock: 988bail_unblock:
987 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 989 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
1008 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1010 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1009 "Inode=%lu\n", inode->i_ino); 1011 "Inode=%lu\n", inode->i_ino);
1010 1012
1011 /* For remove delete_inode vote, we hold open lock before, 1013 /* To preven remote deletes we hold open lock before, now it
1012 * now it is time to unlock PR and EX open locks. */ 1014 * is time to unlock PR and EX open locks. */
1013 ocfs2_open_unlock(inode); 1015 ocfs2_open_unlock(inode);
1014 1016
1015 /* Do these before all the other work so that we don't bounce 1017 /* Do these before all the other work so that we don't bounce
1016 * the vote thread while waiting to destroy the locks. */ 1018 * the downconvert thread while waiting to destroy the locks. */
1017 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1019 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1018 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1020 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1019 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1020 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1021 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1021 1022
1022 /* We very well may get a clear_inode before all an inodes 1023 /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
1039 mlog_errno(status); 1040 mlog_errno(status);
1040 1041
1041 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1042 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1042 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1043 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1043 ocfs2_lock_res_free(&oi->ip_data_lockres);
1044 ocfs2_lock_res_free(&oi->ip_open_lockres); 1044 ocfs2_lock_res_free(&oi->ip_open_lockres);
1045 1045
1046 ocfs2_metadata_cache_purge(inode); 1046 ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1184 } 1184 }
1185 spin_unlock(&OCFS2_I(inode)->ip_lock); 1185 spin_unlock(&OCFS2_I(inode)->ip_lock);
1186 1186
1187 /* Let ocfs2_meta_lock do the work of updating our struct 1187 /* Let ocfs2_inode_lock do the work of updating our struct
1188 * inode for us. */ 1188 * inode for us. */
1189 status = ocfs2_meta_lock(inode, NULL, 0); 1189 status = ocfs2_inode_lock(inode, NULL, 0);
1190 if (status < 0) { 1190 if (status < 0) {
1191 if (status != -ENOENT) 1191 if (status != -ENOENT)
1192 mlog_errno(status); 1192 mlog_errno(status);
1193 goto bail; 1193 goto bail;
1194 } 1194 }
1195 ocfs2_meta_unlock(inode, 0); 1195 ocfs2_inode_unlock(inode, 0);
1196bail: 1196bail:
1197 mlog_exit(status); 1197 mlog_exit(status);
1198 1198
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
34 u64 ip_blkno; 34 u64 ip_blkno;
35 35
36 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
37 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_inode_lockres;
38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres; 38 struct ocfs2_lock_res ip_open_lockres;
40 39
41 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
121void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
122 121
123/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
124#define OCFS2_FI_FLAG_SYSFILE 0x4 123#define OCFS2_FI_FLAG_SYSFILE 0x1
125#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
126struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
126 int sysfile_type);
127int ocfs2_inode_init_private(struct inode *inode); 127int ocfs2_inode_init_private(struct inode *inode);
128int ocfs2_inode_revalidate(struct dentry *dentry); 128int ocfs2_inode_revalidate(struct dentry *dentry);
129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
20 20
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h"
23 24
24#include <linux/ext2_fs.h> 25#include <linux/ext2_fs.h>
25 26
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
27{ 28{
28 int status; 29 int status;
29 30
30 status = ocfs2_meta_lock(inode, NULL, 0); 31 status = ocfs2_inode_lock(inode, NULL, 0);
31 if (status < 0) { 32 if (status < 0) {
32 mlog_errno(status); 33 mlog_errno(status);
33 return status; 34 return status;
34 } 35 }
35 ocfs2_get_inode_flags(OCFS2_I(inode)); 36 ocfs2_get_inode_flags(OCFS2_I(inode));
36 *flags = OCFS2_I(inode)->ip_attr; 37 *flags = OCFS2_I(inode)->ip_attr;
37 ocfs2_meta_unlock(inode, 0); 38 ocfs2_inode_unlock(inode, 0);
38 39
39 mlog_exit(status); 40 mlog_exit(status);
40 return status; 41 return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
52 53
53 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
54 55
55 status = ocfs2_meta_lock(inode, &bh, 1); 56 status = ocfs2_inode_lock(inode, &bh, 1);
56 if (status < 0) { 57 if (status < 0) {
57 mlog_errno(status); 58 mlog_errno(status);
58 goto bail; 59 goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
100 101
101 ocfs2_commit_trans(osb, handle); 102 ocfs2_commit_trans(osb, handle);
102bail_unlock: 103bail_unlock:
103 ocfs2_meta_unlock(inode, 1); 104 ocfs2_inode_unlock(inode, 1);
104bail: 105bail:
105 mutex_unlock(&inode->i_mutex); 106 mutex_unlock(&inode->i_mutex);
106 107
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115 unsigned int cmd, unsigned long arg) 116 unsigned int cmd, unsigned long arg)
116{ 117{
117 unsigned int flags; 118 unsigned int flags;
119 int new_clusters;
118 int status; 120 int status;
119 struct ocfs2_space_resv sr; 121 struct ocfs2_space_resv sr;
122 struct ocfs2_new_group_input input;
120 123
121 switch (cmd) { 124 switch (cmd) {
122 case OCFS2_IOC_GETFLAGS: 125 case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
140 return -EFAULT; 143 return -EFAULT;
141 144
142 return ocfs2_change_file_space(filp, cmd, &sr); 145 return ocfs2_change_file_space(filp, cmd, &sr);
146 case OCFS2_IOC_GROUP_EXTEND:
147 if (!capable(CAP_SYS_RESOURCE))
148 return -EPERM;
149
150 if (get_user(new_clusters, (int __user *)arg))
151 return -EFAULT;
152
153 return ocfs2_group_extend(inode, new_clusters);
154 case OCFS2_IOC_GROUP_ADD:
155 case OCFS2_IOC_GROUP_ADD64:
156 if (!capable(CAP_SYS_RESOURCE))
157 return -EPERM;
158
159 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
160 return -EFAULT;
161
162 return ocfs2_group_add(inode, &input);
143 default: 163 default:
144 return -ENOTTY; 164 return -ENOTTY;
145 } 165 }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
162 case OCFS2_IOC_RESVSP64: 182 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP: 183 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64: 184 case OCFS2_IOC_UNRESVSP64:
185 case OCFS2_IOC_GROUP_EXTEND:
186 case OCFS2_IOC_GROUP_ADD:
187 case OCFS2_IOC_GROUP_ADD64:
165 break; 188 break;
166 default: 189 default:
167 return -ENOIOCTLCMD; 190 return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8d81f6c1b877..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
44#include "localalloc.h" 44#include "localalloc.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h"
48#include "sysfile.h" 47#include "sysfile.h"
49 48
50#include "buffer_head_io.h" 49#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 102 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 103 journal->j_trans_id, flushed);
105 104
106 ocfs2_kick_vote_thread(osb); 105 ocfs2_wake_downconvert_thread(osb);
107 wake_up(&journal->j_checkpointed); 106 wake_up(&journal->j_checkpointed);
108finally: 107finally:
109 mlog_exit(status); 108 mlog_exit(status);
@@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
314 return err; 313 return err;
315} 314}
316 315
317#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 316#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
318 317
319void ocfs2_set_journal_params(struct ocfs2_super *osb) 318void ocfs2_set_journal_params(struct ocfs2_super *osb)
320{ 319{
321 journal_t *journal = osb->journal->j_journal; 320 journal_t *journal = osb->journal->j_journal;
321 unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
322
323 if (osb->osb_commit_interval)
324 commit_interval = osb->osb_commit_interval;
322 325
323 spin_lock(&journal->j_state_lock); 326 spin_lock(&journal->j_state_lock);
324 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 327 journal->j_commit_interval = commit_interval;
325 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 328 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
326 journal->j_flags |= JFS_BARRIER; 329 journal->j_flags |= JFS_BARRIER;
327 else 330 else
@@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
337 struct ocfs2_dinode *di = NULL; 340 struct ocfs2_dinode *di = NULL;
338 struct buffer_head *bh = NULL; 341 struct buffer_head *bh = NULL;
339 struct ocfs2_super *osb; 342 struct ocfs2_super *osb;
340 int meta_lock = 0; 343 int inode_lock = 0;
341 344
342 mlog_entry_void(); 345 mlog_entry_void();
343 346
@@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
367 /* Skip recovery waits here - journal inode metadata never 370 /* Skip recovery waits here - journal inode metadata never
368 * changes in a live cluster so it can be considered an 371 * changes in a live cluster so it can be considered an
369 * exception to the rule. */ 372 * exception to the rule. */
370 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 373 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
371 if (status < 0) { 374 if (status < 0) {
372 if (status != -ERESTARTSYS) 375 if (status != -ERESTARTSYS)
373 mlog(ML_ERROR, "Could not get lock on journal!\n"); 376 mlog(ML_ERROR, "Could not get lock on journal!\n");
374 goto done; 377 goto done;
375 } 378 }
376 379
377 meta_lock = 1; 380 inode_lock = 1;
378 di = (struct ocfs2_dinode *)bh->b_data; 381 di = (struct ocfs2_dinode *)bh->b_data;
379 382
380 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 383 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
414 status = 0; 417 status = 0;
415done: 418done:
416 if (status < 0) { 419 if (status < 0) {
417 if (meta_lock) 420 if (inode_lock)
418 ocfs2_meta_unlock(inode, 1); 421 ocfs2_inode_unlock(inode, 1);
419 if (bh != NULL) 422 if (bh != NULL)
420 brelse(bh); 423 brelse(bh);
421 if (inode) { 424 if (inode) {
@@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
544 OCFS2_I(inode)->ip_open_count--; 547 OCFS2_I(inode)->ip_open_count--;
545 548
546 /* unlock our journal */ 549 /* unlock our journal */
547 ocfs2_meta_unlock(inode, 1); 550 ocfs2_inode_unlock(inode, 1);
548 551
549 brelse(journal->j_bh); 552 brelse(journal->j_bh);
550 journal->j_bh = NULL; 553 journal->j_bh = NULL;
@@ -883,8 +886,8 @@ restart:
883 ocfs2_super_unlock(osb, 1); 886 ocfs2_super_unlock(osb, 1);
884 887
885 /* We always run recovery on our own orphan dir - the dead 888 /* We always run recovery on our own orphan dir - the dead
886 * node(s) may have voted "no" on an inode delete earlier. A 889 * node(s) may have disallowd a previos inode delete. Re-processing
887 * revote is therefore required. */ 890 * is therefore required. */
888 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 891 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
889 NULL); 892 NULL);
890 893
@@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
973 } 976 }
974 SET_INODE_JOURNAL(inode); 977 SET_INODE_JOURNAL(inode);
975 978
976 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 979 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
977 if (status < 0) { 980 if (status < 0) {
978 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 981 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
979 if (status != -ERESTARTSYS) 982 if (status != -ERESTARTSYS)
980 mlog(ML_ERROR, "Could not lock journal!\n"); 983 mlog(ML_ERROR, "Could not lock journal!\n");
981 goto done; 984 goto done;
@@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1047done: 1050done:
1048 /* drop the lock on this nodes journal */ 1051 /* drop the lock on this nodes journal */
1049 if (got_lock) 1052 if (got_lock)
1050 ocfs2_meta_unlock(inode, 1); 1053 ocfs2_inode_unlock(inode, 1);
1051 1054
1052 if (inode) 1055 if (inode)
1053 iput(inode); 1056 iput(inode);
@@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1162 SET_INODE_JOURNAL(inode); 1165 SET_INODE_JOURNAL(inode);
1163 1166
1164 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1167 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1165 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1168 status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
1166 if (status < 0) { 1169 if (status < 0) {
1167 if (status != -EAGAIN) 1170 if (status != -EAGAIN)
1168 mlog_errno(status); 1171 mlog_errno(status);
1169 goto bail; 1172 goto bail;
1170 } 1173 }
1171 1174
1172 ocfs2_meta_unlock(inode, 1); 1175 ocfs2_inode_unlock(inode, 1);
1173bail: 1176bail:
1174 if (inode) 1177 if (inode)
1175 iput(inode); 1178 iput(inode);
@@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1241 1244
1242 /* Skip bad inodes so that recovery can continue */ 1245 /* Skip bad inodes so that recovery can continue */
1243 iter = ocfs2_iget(p->osb, ino, 1246 iter = ocfs2_iget(p->osb, ino,
1244 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1247 OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
1245 if (IS_ERR(iter)) 1248 if (IS_ERR(iter))
1246 return 0; 1249 return 0;
1247 1250
@@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1277 } 1280 }
1278 1281
1279 mutex_lock(&orphan_dir_inode->i_mutex); 1282 mutex_lock(&orphan_dir_inode->i_mutex);
1280 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1283 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
1281 if (status < 0) { 1284 if (status < 0) {
1282 mlog_errno(status); 1285 mlog_errno(status);
1283 goto out; 1286 goto out;
@@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1293 *head = priv.head; 1296 *head = priv.head;
1294 1297
1295out_cluster: 1298out_cluster:
1296 ocfs2_meta_unlock(orphan_dir_inode, 0); 1299 ocfs2_inode_unlock(orphan_dir_inode, 0);
1297out: 1300out:
1298 mutex_unlock(&orphan_dir_inode->i_mutex); 1301 mutex_unlock(&orphan_dir_inode->i_mutex);
1299 iput(orphan_dir_inode); 1302 iput(orphan_dir_inode);
@@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1380 iter = oi->ip_next_orphan; 1383 iter = oi->ip_next_orphan;
1381 1384
1382 spin_lock(&oi->ip_lock); 1385 spin_lock(&oi->ip_lock);
1383 /* Delete voting may have set these on the assumption 1386 /* The remote delete code may have set these on the
1384 * that the other node would wipe them successfully. 1387 * assumption that the other node would wipe them
1385 * If they are still in the node's orphan dir, we need 1388 * successfully. If they are still in the node's
1386 * to reset that state. */ 1389 * orphan dir, we need to reset that state. */
1387 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1390 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1388 1391
1389 /* Set the proper information to get us going into 1392 /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
278/* simple file updates like chmod, etc. */ 278/* simple file updates like chmod, etc. */
279#define OCFS2_INODE_UPDATE_CREDITS 1 279#define OCFS2_INODE_UPDATE_CREDITS 1
280 280
281/* group extend. inode update and last group update. */
282#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
283
284/* group add. inode update and the new group update. */
285#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
286
281/* get one bit out of a suballocator: dinode + group descriptor + 287/* get one bit out of a suballocator: dinode + group descriptor +
282 * prev. group desc. if we relink. */ 288 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 289#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af36..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
77 77
78/*
79 * Determine how large our local alloc window should be, in bits.
80 *
81 * These values (and the behavior in ocfs2_alloc_should_use_local) have
82 * been chosen so that most allocations, including new block groups go
83 * through local alloc.
84 */
85static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
86{ 79{
87 BUG_ON(osb->s_clustersize_bits < 12); 80 BUG_ON(osb->s_clustersize_bits > 20);
88 81
89 return 2048 >> (osb->s_clustersize_bits - 12); 82 /* Size local alloc windows by the megabyte */
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
90} 84}
91 85
92/* 86/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
96int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
97{ 91{
98 int la_bits = ocfs2_local_alloc_window_bits(osb); 92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0;
99 94
100 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 95 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
101 return 0; 96 goto bail;
102 97
103 /* la_bits should be at least twice the size (in clusters) of 98 /* la_bits should be at least twice the size (in clusters) of
104 * a new block group. We want to be sure block group 99 * a new block group. We want to be sure block group
105 * allocations go through the local alloc, so allow an 100 * allocations go through the local alloc, so allow an
106 * allocation to take up to half the bitmap. */ 101 * allocation to take up to half the bitmap. */
107 if (bits > (la_bits / 2)) 102 if (bits > (la_bits / 2))
108 return 0; 103 goto bail;
109 104
110 return 1; 105 ret = 1;
106bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
109 return ret;
111} 110}
112 111
113int ocfs2_load_local_alloc(struct ocfs2_super *osb) 112int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
121 120
122 mlog_entry_void(); 121 mlog_entry_void();
123 122
123 if (ocfs2_mount_local(osb))
124 goto bail;
125
126 if (osb->local_alloc_size == 0)
127 goto bail;
128
129 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
130 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
131 "than max possible %u. Using defaults.\n",
132 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
133 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
134 }
135
124 /* read the alloc off disk */ 136 /* read the alloc off disk */
125 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 137 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
126 osb->slot_num); 138 osb->slot_num);
@@ -181,6 +193,9 @@ bail:
181 if (inode) 193 if (inode)
182 iput(inode); 194 iput(inode);
183 195
196 mlog(0, "Local alloc window bits = %d\n",
197 ocfs2_local_alloc_window_bits(osb));
198
184 mlog_exit(status); 199 mlog_exit(status);
185 return status; 200 return status;
186} 201}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
231 246
232 mutex_lock(&main_bm_inode->i_mutex); 247 mutex_lock(&main_bm_inode->i_mutex);
233 248
234 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 249 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
235 if (status < 0) { 250 if (status < 0) {
236 mlog_errno(status); 251 mlog_errno(status);
237 goto out_mutex; 252 goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
286 if (main_bm_bh) 301 if (main_bm_bh)
287 brelse(main_bm_bh); 302 brelse(main_bm_bh);
288 303
289 ocfs2_meta_unlock(main_bm_inode, 1); 304 ocfs2_inode_unlock(main_bm_inode, 1);
290 305
291out_mutex: 306out_mutex:
292 mutex_unlock(&main_bm_inode->i_mutex); 307 mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
399 414
400 mutex_lock(&main_bm_inode->i_mutex); 415 mutex_lock(&main_bm_inode->i_mutex);
401 416
402 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 417 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
403 if (status < 0) { 418 if (status < 0) {
404 mlog_errno(status); 419 mlog_errno(status);
405 goto out_mutex; 420 goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
424 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
425 440
426out_unlock: 441out_unlock:
427 ocfs2_meta_unlock(main_bm_inode, 1); 442 ocfs2_inode_unlock(main_bm_inode, 1);
428 443
429out_mutex: 444out_mutex:
430 mutex_unlock(&main_bm_inode->i_mutex); 445 mutex_unlock(&main_bm_inode->i_mutex);
@@ -521,6 +536,9 @@ bail:
521 iput(local_alloc_inode); 536 iput(local_alloc_inode);
522 } 537 }
523 538
539 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
540 status);
541
524 mlog_exit(status); 542 mlog_exit(status);
525 return status; 543 return status;
526} 544}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * locks.c
5 *
6 * Userspace file locking support
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27
28#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "dlmglue.h"
34#include "file.h"
35#include "locks.h"
36
37static int ocfs2_do_flock(struct file *file, struct inode *inode,
38 int cmd, struct file_lock *fl)
39{
40 int ret = 0, level = 0, trylock = 0;
41 struct ocfs2_file_private *fp = file->private_data;
42 struct ocfs2_lock_res *lockres = &fp->fp_flock;
43
44 if (fl->fl_type == F_WRLCK)
45 level = 1;
46 if (!IS_SETLKW(cmd))
47 trylock = 1;
48
49 mutex_lock(&fp->fp_mutex);
50
51 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
52 lockres->l_level > LKM_NLMODE) {
53 int old_level = 0;
54
55 if (lockres->l_level == LKM_EXMODE)
56 old_level = 1;
57
58 if (level == old_level)
59 goto out;
60
61 /*
62 * Converting an existing lock is not guaranteed to be
63 * atomic, so we can get away with simply unlocking
64 * here and allowing the lock code to try at the new
65 * level.
66 */
67
68 flock_lock_file_wait(file,
69 &(struct file_lock){.fl_type = F_UNLCK});
70
71 ocfs2_file_unlock(file);
72 }
73
74 ret = ocfs2_file_lock(file, level, trylock);
75 if (ret) {
76 if (ret == -EAGAIN && trylock)
77 ret = -EWOULDBLOCK;
78 else
79 mlog_errno(ret);
80 goto out;
81 }
82
83 ret = flock_lock_file_wait(file, fl);
84
85out:
86 mutex_unlock(&fp->fp_mutex);
87
88 return ret;
89}
90
91static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
92{
93 int ret;
94 struct ocfs2_file_private *fp = file->private_data;
95
96 mutex_lock(&fp->fp_mutex);
97 ocfs2_file_unlock(file);
98 ret = flock_lock_file_wait(file, fl);
99 mutex_unlock(&fp->fp_mutex);
100
101 return ret;
102}
103
104/*
105 * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
106 */
107int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
108{
109 struct inode *inode = file->f_mapping->host;
110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
111
112 if (!(fl->fl_flags & FL_FLOCK))
113 return -ENOLCK;
114 if (__mandatory_lock(inode))
115 return -ENOLCK;
116
117 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
118 ocfs2_mount_local(osb))
119 return flock_lock_file_wait(file, fl);
120
121 if (fl->fl_type == F_UNLCK)
122 return ocfs2_do_funlock(file, cmd, fl);
123 else
124 return ocfs2_do_flock(file, inode, cmd, fl);
125}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
1/* -*- mode: c; c-basic-offset: 8; -*- 1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * vote.h 4 * locks.h
5 * 5 *
6 * description here 6 * Function prototypes for Userspace file locking support
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
@@ -23,26 +23,9 @@
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26#ifndef OCFS2_LOCKS_H
27#define OCFS2_LOCKS_H
26 28
27#ifndef VOTE_H 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
28#define VOTE_H
29 30
30int ocfs2_vote_thread(void *arg); 31#endif /* OCFS2_LOCKS_H */
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
45
46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
47 int node_num);
48#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
168 * node. Taking the data lock will also ensure that we don't 168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert. 169 * attempt page truncation as part of a downconvert.
170 */ 170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1); 171 ret = ocfs2_inode_lock(inode, &di_bh, 1);
172 if (ret < 0) { 172 if (ret < 0) {
173 mlog_errno(ret); 173 mlog_errno(ret);
174 goto out; 174 goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
181 */ 181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem); 182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183 183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 184 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191 185
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem); 186 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196 187
197 brelse(di_bh); 188 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1); 189 ocfs2_inode_unlock(inode, 1);
199 190
200out: 191out:
201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 192 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
214{ 205{
215 int ret = 0, lock_level = 0; 206 int ret = 0, lock_level = 0;
216 207
217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 208 ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
218 file->f_vfsmnt, &lock_level); 209 file->f_vfsmnt, &lock_level);
219 if (ret < 0) { 210 if (ret < 0) {
220 mlog_errno(ret); 211 mlog_errno(ret);
221 goto out; 212 goto out;
222 } 213 }
223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 214 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
224out: 215out:
225 vma->vm_ops = &ocfs2_file_vm_ops; 216 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR; 217 vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac2718587..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "vote.h"
64 63
65#include "buffer_head_io.h" 64#include "buffer_head_io.h"
66 65
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
116 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 115 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
117 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 116 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
118 117
119 status = ocfs2_meta_lock(dir, NULL, 0); 118 status = ocfs2_inode_lock(dir, NULL, 0);
120 if (status < 0) { 119 if (status < 0) {
121 if (status != -ENOENT) 120 if (status != -ENOENT)
122 mlog_errno(status); 121 mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
129 if (status < 0) 128 if (status < 0)
130 goto bail_add; 129 goto bail_add;
131 130
132 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 131 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
133 if (IS_ERR(inode)) { 132 if (IS_ERR(inode)) {
134 ret = ERR_PTR(-EACCES); 133 ret = ERR_PTR(-EACCES);
135 goto bail_unlock; 134 goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
176 /* Don't drop the cluster lock until *after* the d_add -- 175 /* Don't drop the cluster lock until *after* the d_add --
177 * unlink on another node will message us to remove that 176 * unlink on another node will message us to remove that
178 * dentry under this lock so otherwise we can race this with 177 * dentry under this lock so otherwise we can race this with
179 * the vote thread and have a stale dentry. */ 178 * the downconvert thread and have a stale dentry. */
180 ocfs2_meta_unlock(dir, 0); 179 ocfs2_inode_unlock(dir, 0);
181 180
182bail: 181bail:
183 182
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
209 /* get our super block */ 208 /* get our super block */
210 osb = OCFS2_SB(dir->i_sb); 209 osb = OCFS2_SB(dir->i_sb);
211 210
212 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 211 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
213 if (status < 0) { 212 if (status < 0) {
214 if (status != -ENOENT) 213 if (status != -ENOENT)
215 mlog_errno(status); 214 mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
323 if (handle) 322 if (handle)
324 ocfs2_commit_trans(osb, handle); 323 ocfs2_commit_trans(osb, handle);
325 324
326 ocfs2_meta_unlock(dir, 1); 325 ocfs2_inode_unlock(dir, 1);
327 326
328 if (status == -ENOSPC) 327 if (status == -ENOSPC)
329 mlog(0, "Disk is full\n"); 328 mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
553 if (S_ISDIR(inode->i_mode)) 552 if (S_ISDIR(inode->i_mode))
554 return -EPERM; 553 return -EPERM;
555 554
556 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 555 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
557 if (err < 0) { 556 if (err < 0) {
558 if (err != -ENOENT) 557 if (err != -ENOENT)
559 mlog_errno(err); 558 mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
578 goto out; 577 goto out;
579 } 578 }
580 579
581 err = ocfs2_meta_lock(inode, &fe_bh, 1); 580 err = ocfs2_inode_lock(inode, &fe_bh, 1);
582 if (err < 0) { 581 if (err < 0) {
583 if (err != -ENOENT) 582 if (err != -ENOENT)
584 mlog_errno(err); 583 mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
643out_commit: 642out_commit:
644 ocfs2_commit_trans(osb, handle); 643 ocfs2_commit_trans(osb, handle);
645out_unlock_inode: 644out_unlock_inode:
646 ocfs2_meta_unlock(inode, 1); 645 ocfs2_inode_unlock(inode, 1);
647 646
648out: 647out:
649 ocfs2_meta_unlock(dir, 1); 648 ocfs2_inode_unlock(dir, 1);
650 649
651 if (de_bh) 650 if (de_bh)
652 brelse(de_bh); 651 brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
720 return -EPERM; 719 return -EPERM;
721 } 720 }
722 721
723 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 722 status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
724 if (status < 0) { 723 if (status < 0) {
725 if (status != -ENOENT) 724 if (status != -ENOENT)
726 mlog_errno(status); 725 mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
745 goto leave; 744 goto leave;
746 } 745 }
747 746
748 status = ocfs2_meta_lock(inode, &fe_bh, 1); 747 status = ocfs2_inode_lock(inode, &fe_bh, 1);
749 if (status < 0) { 748 if (status < 0) {
750 if (status != -ENOENT) 749 if (status != -ENOENT)
751 mlog_errno(status); 750 mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
765 764
766 status = ocfs2_remote_dentry_delete(dentry); 765 status = ocfs2_remote_dentry_delete(dentry);
767 if (status < 0) { 766 if (status < 0) {
768 /* This vote should succeed under all normal 767 /* This remote delete should succeed under all normal
769 * circumstances. */ 768 * circumstances. */
770 mlog_errno(status); 769 mlog_errno(status);
771 goto leave; 770 goto leave;
@@ -841,13 +840,13 @@ leave:
841 ocfs2_commit_trans(osb, handle); 840 ocfs2_commit_trans(osb, handle);
842 841
843 if (child_locked) 842 if (child_locked)
844 ocfs2_meta_unlock(inode, 1); 843 ocfs2_inode_unlock(inode, 1);
845 844
846 ocfs2_meta_unlock(dir, 1); 845 ocfs2_inode_unlock(dir, 1);
847 846
848 if (orphan_dir) { 847 if (orphan_dir) {
849 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 848 /* This was locked for us in ocfs2_prepare_orphan_dir() */
850 ocfs2_meta_unlock(orphan_dir, 1); 849 ocfs2_inode_unlock(orphan_dir, 1);
851 mutex_unlock(&orphan_dir->i_mutex); 850 mutex_unlock(&orphan_dir->i_mutex);
852 iput(orphan_dir); 851 iput(orphan_dir);
853 } 852 }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
908 inode1 = tmpinode; 907 inode1 = tmpinode;
909 } 908 }
910 /* lock id2 */ 909 /* lock id2 */
911 status = ocfs2_meta_lock(inode2, bh2, 1); 910 status = ocfs2_inode_lock(inode2, bh2, 1);
912 if (status < 0) { 911 if (status < 0) {
913 if (status != -ENOENT) 912 if (status != -ENOENT)
914 mlog_errno(status); 913 mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
917 } 916 }
918 917
919 /* lock id1 */ 918 /* lock id1 */
920 status = ocfs2_meta_lock(inode1, bh1, 1); 919 status = ocfs2_inode_lock(inode1, bh1, 1);
921 if (status < 0) { 920 if (status < 0) {
922 /* 921 /*
923 * An error return must mean that no cluster locks 922 * An error return must mean that no cluster locks
924 * were held on function exit. 923 * were held on function exit.
925 */ 924 */
926 if (oi1->ip_blkno != oi2->ip_blkno) 925 if (oi1->ip_blkno != oi2->ip_blkno)
927 ocfs2_meta_unlock(inode2, 1); 926 ocfs2_inode_unlock(inode2, 1);
928 927
929 if (status != -ENOENT) 928 if (status != -ENOENT)
930 mlog_errno(status); 929 mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
937 936
938static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) 937static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
939{ 938{
940 ocfs2_meta_unlock(inode1, 1); 939 ocfs2_inode_unlock(inode1, 1);
941 940
942 if (inode1 != inode2) 941 if (inode1 != inode2)
943 ocfs2_meta_unlock(inode2, 1); 942 ocfs2_inode_unlock(inode2, 1);
944} 943}
945 944
946static int ocfs2_rename(struct inode *old_dir, 945static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
1031 1030
1032 /* 1031 /*
1033 * Aside from allowing a meta data update, the locking here 1032 * Aside from allowing a meta data update, the locking here
1034 * also ensures that the vote thread on other nodes won't have 1033 * also ensures that the downconvert thread on other nodes
1035 * to concurrently downconvert the inode and the dentry locks. 1034 * won't have to concurrently downconvert the inode and the
1035 * dentry locks.
1036 */ 1036 */
1037 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1037 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
1038 if (status < 0) { 1038 if (status < 0) {
1039 if (status != -ENOENT) 1039 if (status != -ENOENT)
1040 mlog_errno(status); 1040 mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
1143 goto bail; 1143 goto bail;
1144 } 1144 }
1145 1145
1146 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1146 status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
1147 if (status < 0) { 1147 if (status < 0) {
1148 if (status != -ENOENT) 1148 if (status != -ENOENT)
1149 mlog_errno(status); 1149 mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
1355 ocfs2_double_unlock(old_dir, new_dir); 1355 ocfs2_double_unlock(old_dir, new_dir);
1356 1356
1357 if (old_child_locked) 1357 if (old_child_locked)
1358 ocfs2_meta_unlock(old_inode, 1); 1358 ocfs2_inode_unlock(old_inode, 1);
1359 1359
1360 if (new_child_locked) 1360 if (new_child_locked)
1361 ocfs2_meta_unlock(new_inode, 1); 1361 ocfs2_inode_unlock(new_inode, 1);
1362 1362
1363 if (orphan_dir) { 1363 if (orphan_dir) {
1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1365 ocfs2_meta_unlock(orphan_dir, 1); 1365 ocfs2_inode_unlock(orphan_dir, 1);
1366 mutex_unlock(&orphan_dir->i_mutex); 1366 mutex_unlock(&orphan_dir->i_mutex);
1367 iput(orphan_dir); 1367 iput(orphan_dir);
1368 } 1368 }
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
1530 credits = ocfs2_calc_symlink_credits(sb); 1530 credits = ocfs2_calc_symlink_credits(sb);
1531 1531
1532 /* lock the parent directory */ 1532 /* lock the parent directory */
1533 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1533 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
1534 if (status < 0) { 1534 if (status < 0) {
1535 if (status != -ENOENT) 1535 if (status != -ENOENT)
1536 mlog_errno(status); 1536 mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
1657 if (handle) 1657 if (handle)
1658 ocfs2_commit_trans(osb, handle); 1658 ocfs2_commit_trans(osb, handle);
1659 1659
1660 ocfs2_meta_unlock(dir, 1); 1660 ocfs2_inode_unlock(dir, 1);
1661 1661
1662 if (new_fe_bh) 1662 if (new_fe_bh)
1663 brelse(new_fe_bh); 1663 brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1735 1735
1736 mutex_lock(&orphan_dir_inode->i_mutex); 1736 mutex_lock(&orphan_dir_inode->i_mutex);
1737 1737
1738 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1738 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1739 if (status < 0) { 1739 if (status < 0) {
1740 mlog_errno(status); 1740 mlog_errno(status);
1741 goto leave; 1741 goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1745 orphan_dir_bh, name, 1745 orphan_dir_bh, name,
1746 OCFS2_ORPHAN_NAMELEN, de_bh); 1746 OCFS2_ORPHAN_NAMELEN, de_bh);
1747 if (status < 0) { 1747 if (status < 0) {
1748 ocfs2_meta_unlock(orphan_dir_inode, 1); 1748 ocfs2_inode_unlock(orphan_dir_inode, 1);
1749 1749
1750 mlog_errno(status); 1750 mlog_errno(status);
1751 goto leave; 1751 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
101 * about to be 101 * about to be
102 * dropped. */ 102 * dropped. */
103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
104#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
104 105
105struct ocfs2_lock_res_ops; 106struct ocfs2_lock_res_ops;
106 107
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
170 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 171 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
171 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 172 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
172 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 173 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
174 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
173}; 175};
174 176
175#define OCFS2_OSB_SOFT_RO 0x0001 177#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
189 struct ocfs2_slot_info *slot_info; 191 struct ocfs2_slot_info *slot_info;
190 192
191 spinlock_t node_map_lock; 193 spinlock_t node_map_lock;
192 struct ocfs2_node_map mounted_map;
193 struct ocfs2_node_map recovery_map; 194 struct ocfs2_node_map recovery_map;
194 struct ocfs2_node_map umount_map;
195 195
196 u64 root_blkno; 196 u64 root_blkno;
197 u64 system_dir_blkno; 197 u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
231 wait_queue_head_t checkpoint_event; 231 wait_queue_head_t checkpoint_event;
232 atomic_t needs_checkpoint; 232 atomic_t needs_checkpoint;
233 struct ocfs2_journal *journal; 233 struct ocfs2_journal *journal;
234 unsigned long osb_commit_interval;
234 235
236 int local_alloc_size;
235 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
236 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
237 u64 la_last_gd; 239 u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
254 256
255 wait_queue_head_t recovery_event; 257 wait_queue_head_t recovery_event;
256 258
257 spinlock_t vote_task_lock; 259 spinlock_t dc_task_lock;
258 struct task_struct *vote_task; 260 struct task_struct *dc_task;
259 wait_queue_head_t vote_event; 261 wait_queue_head_t dc_event;
260 unsigned long vote_wake_sequence; 262 unsigned long dc_wake_sequence;
261 unsigned long vote_work_sequence; 263 unsigned long dc_work_sequence;
262 264
265 /*
266 * Any thread can add locks to the list, but the downconvert
267 * thread is the only one allowed to remove locks. Any change
268 * to this rule requires updating
269 * ocfs2_downconvert_thread_do_work().
270 */
263 struct list_head blocked_lock_list; 271 struct list_head blocked_lock_list;
264 unsigned long blocked_lock_count; 272 unsigned long blocked_lock_count;
265 273
266 struct list_head vote_list;
267 int vote_count;
268
269 u32 net_key;
270 spinlock_t net_response_lock;
271 unsigned int net_response_ids;
272 struct list_head net_response_list;
273
274 struct o2hb_callback_func osb_hb_up;
275 struct o2hb_callback_func osb_hb_down;
276
277 struct list_head osb_net_handlers;
278
279 wait_queue_head_t osb_mount_event; 274 wait_queue_head_t osb_mount_event;
280 275
281 /* Truncate log info */ 276 /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) 231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) 232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
233 233
234/* Used to pass group descriptor data when online resize is done */
235struct ocfs2_new_group_input {
236 __u64 group; /* Group descriptor's blkno. */
237 __u32 clusters; /* Total number of clusters in this group */
238 __u32 frees; /* Total free clusters in this group */
239 __u16 chain; /* Chain for this group */
240 __u16 reserved1;
241 __u32 reserved2;
242};
243
244#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
245#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
246#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
247
234/* 248/*
235 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 249 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
236 */ 250 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
256/* Journal limits (in bytes) */ 270/* Journal limits (in bytes) */
257#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
258 272
273/*
274 * Default local alloc size (in megabytes)
275 *
276 * The value chosen should be such that most allocations, including new
277 * block groups, use local alloc.
278 */
279#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
280
259struct ocfs2_system_inode_info { 281struct ocfs2_system_inode_info {
260 char *si_name; 282 char *si_name;
261 int si_iflags; 283 int si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK,
48 OCFS2_NUM_LOCK_TYPES 49 OCFS2_NUM_LOCK_TYPES
49}; 50};
50 51
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
73 case OCFS2_LOCK_TYPE_OPEN: 74 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O'; 75 c = 'O';
75 break; 76 break;
77 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F';
79 break;
76 default: 80 default:
77 c = '\0'; 81 c = '\0';
78 } 82 }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
90 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 94 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open", 96 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
93}; 98};
94 99
95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.c
5 *
6 * volume resize.
7 * Inspired by ext3/resize.c.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "dlmglue.h"
37#include "inode.h"
38#include "journal.h"
39#include "super.h"
40#include "sysfile.h"
41#include "uptodate.h"
42
43#include "buffer_head_io.h"
44#include "suballoc.h"
45#include "resize.h"
46
47/*
48 * Check whether there are new backup superblocks exist
49 * in the last group. If there are some, mark them or clear
50 * them in the bitmap.
51 *
52 * Return how many backups we find in the last group.
53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg,
59 int set)
60{
61 int i;
62 u16 backups = 0;
63 u32 cluster;
64 u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
65
66 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
67 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
68 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
69
70 gd_blkno = ocfs2_which_cluster_group(inode, cluster);
71 if (gd_blkno < lgd_blkno)
72 continue;
73 else if (gd_blkno > lgd_blkno)
74 break;
75
76 if (set)
77 ocfs2_set_bit(cluster % cl_cpg,
78 (unsigned long *)gd->bg_bitmap);
79 else
80 ocfs2_clear_bit(cluster % cl_cpg,
81 (unsigned long *)gd->bg_bitmap);
82 backups++;
83 }
84
85 mlog_exit_void();
86 return backups;
87}
88
89static int ocfs2_update_last_group_and_inode(handle_t *handle,
90 struct inode *bm_inode,
91 struct buffer_head *bm_bh,
92 struct buffer_head *group_bh,
93 u32 first_new_cluster,
94 int new_clusters)
95{
96 int ret = 0;
97 struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
98 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
99 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
100 struct ocfs2_chain_rec *cr;
101 struct ocfs2_group_desc *group;
102 u16 chain, num_bits, backups = 0;
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster);
108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) {
112 mlog_errno(ret);
113 goto out;
114 }
115
116 group = (struct ocfs2_group_desc *)group_bh->b_data;
117
118 /* update the group first. */
119 num_bits = new_clusters * cl_bpc;
120 le16_add_cpu(&group->bg_bits, num_bits);
121 le16_add_cpu(&group->bg_free_bits_count, num_bits);
122
123 /*
124 * check whether there are some new backup superblocks exist in
125 * this group and update the group bitmap accordingly.
126 */
127 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
128 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
129 backups = ocfs2_calc_new_backup_super(bm_inode,
130 group,
131 new_clusters,
132 first_new_cluster,
133 cl_cpg, 1);
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 }
136
137 ret = ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142
143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out_rollback;
149 }
150
151 chain = le16_to_cpu(group->bg_chain);
152 cr = (&cl->cl_recs[chain]);
153 le32_add_cpu(&cr->c_total, num_bits);
154 le32_add_cpu(&cr->c_free, num_bits);
155 le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
156 le32_add_cpu(&fe->i_clusters, new_clusters);
157
158 if (backups) {
159 le32_add_cpu(&cr->c_free, -1 * backups);
160 le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
161 }
162
163 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
164 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
165 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
166 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
167 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
168
169 ocfs2_journal_dirty(handle, bm_bh);
170
171out_rollback:
172 if (ret < 0) {
173 ocfs2_calc_new_backup_super(bm_inode,
174 group,
175 new_clusters,
176 first_new_cluster,
177 cl_cpg, 0);
178 le16_add_cpu(&group->bg_free_bits_count, backups);
179 le16_add_cpu(&group->bg_bits, -1 * num_bits);
180 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
181 }
182out:
183 mlog_exit(ret);
184 return ret;
185}
186
187static int update_backups(struct inode * inode, u32 clusters, char *data)
188{
189 int i, ret = 0;
190 u32 cluster;
191 u64 blkno;
192 struct buffer_head *backup = NULL;
193 struct ocfs2_dinode *backup_di = NULL;
194 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
195
196 /* calculate the real backups we need to update. */
197 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
198 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
199 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
200 if (cluster > clusters)
201 break;
202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
204 if (ret < 0) {
205 mlog_errno(ret);
206 break;
207 }
208
209 memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
210
211 backup_di = (struct ocfs2_dinode *)backup->b_data;
212 backup_di->i_blkno = cpu_to_le64(blkno);
213
214 ret = ocfs2_write_super_or_backup(osb, backup);
215 brelse(backup);
216 backup = NULL;
217 if (ret < 0) {
218 mlog_errno(ret);
219 break;
220 }
221 }
222
223 return ret;
224}
225
226static void ocfs2_update_super_and_backups(struct inode *inode,
227 int new_clusters)
228{
229 int ret;
230 u32 clusters = 0;
231 struct buffer_head *super_bh = NULL;
232 struct ocfs2_dinode *super_di = NULL;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234
235 /*
236 * update the superblock last.
237 * It doesn't matter if the write failed.
238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
240 &super_bh, 0, NULL);
241 if (ret < 0) {
242 mlog_errno(ret);
243 goto out;
244 }
245
246 super_di = (struct ocfs2_dinode *)super_bh->b_data;
247 le32_add_cpu(&super_di->i_clusters, new_clusters);
248 clusters = le32_to_cpu(super_di->i_clusters);
249
250 ret = ocfs2_write_super_or_backup(osb, super_bh);
251 if (ret < 0) {
252 mlog_errno(ret);
253 goto out;
254 }
255
256 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
257 ret = update_backups(inode, clusters, super_bh->b_data);
258
259out:
260 brelse(super_bh);
261 if (ret)
262 printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
263 " during fs resize. This condition is not fatal,"
264 " but fsck.ocfs2 should be run to fix it\n",
265 osb->dev_str);
266 return;
267}
268
269/*
270 * Extend the filesystem to the new number of clusters specified. This entry
271 * point is only used to extend the current filesystem to the end of the last
272 * existing group.
273 */
274int ocfs2_group_extend(struct inode * inode, int new_clusters)
275{
276 int ret;
277 handle_t *handle;
278 struct buffer_head *main_bm_bh = NULL;
279 struct buffer_head *group_bh = NULL;
280 struct inode *main_bm_inode = NULL;
281 struct ocfs2_dinode *fe = NULL;
282 struct ocfs2_group_desc *group = NULL;
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 u16 cl_bpc;
285 u32 first_new_cluster;
286 u64 lgd_blkno;
287
288 mlog_entry_void();
289
290 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
291 return -EROFS;
292
293 if (new_clusters < 0)
294 return -EINVAL;
295 else if (new_clusters == 0)
296 return 0;
297
298 main_bm_inode = ocfs2_get_system_file_inode(osb,
299 GLOBAL_BITMAP_SYSTEM_INODE,
300 OCFS2_INVALID_SLOT);
301 if (!main_bm_inode) {
302 ret = -EINVAL;
303 mlog_errno(ret);
304 goto out;
305 }
306
307 mutex_lock(&main_bm_inode->i_mutex);
308
309 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
310 if (ret < 0) {
311 mlog_errno(ret);
312 goto out_mutex;
313 }
314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. "
320 "Force to do offline resize.");
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1);
334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
336 main_bm_inode);
337 if (ret < 0) {
338 mlog_errno(ret);
339 goto out_unlock;
340 }
341
342 group = (struct ocfs2_group_desc *)group_bh->b_data;
343
344 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
345 if (ret) {
346 mlog_errno(ret);
347 goto out_unlock;
348 }
349
350 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
351 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
352 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
353 ret = -EINVAL;
354 goto out_unlock;
355 }
356
357 mlog(0, "extend the last group at %llu, new clusters = %d\n",
358 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
359
360 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
361 if (IS_ERR(handle)) {
362 mlog_errno(PTR_ERR(handle));
363 ret = -EINVAL;
364 goto out_unlock;
365 }
366
367 /* update the last group descriptor and inode. */
368 ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
369 main_bm_bh, group_bh,
370 first_new_cluster,
371 new_clusters);
372 if (ret) {
373 mlog_errno(ret);
374 goto out_commit;
375 }
376
377 ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
378
379out_commit:
380 ocfs2_commit_trans(osb, handle);
381out_unlock:
382 brelse(group_bh);
383 brelse(main_bm_bh);
384
385 ocfs2_inode_unlock(main_bm_inode, 1);
386
387out_mutex:
388 mutex_unlock(&main_bm_inode->i_mutex);
389 iput(main_bm_inode);
390
391out:
392 mlog_exit_void();
393 return ret;
394}
395
396static int ocfs2_check_new_group(struct inode *inode,
397 struct ocfs2_dinode *di,
398 struct ocfs2_new_group_input *input,
399 struct buffer_head *group_bh)
400{
401 int ret;
402 struct ocfs2_group_desc *gd;
403 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
404 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
405 le16_to_cpu(di->id2.i_chain.cl_bpc);
406
407
408 gd = (struct ocfs2_group_desc *)group_bh->b_data;
409
410 ret = -EIO;
411 if (!OCFS2_IS_VALID_GROUP_DESC(gd))
412 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
413 (unsigned long long)le64_to_cpu(gd->bg_blkno));
414 else if (di->i_blkno != gd->bg_parent_dinode)
415 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
416 "pointer (%llu, expected %llu)\n",
417 (unsigned long long)le64_to_cpu(gd->bg_blkno),
418 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
419 (unsigned long long)le64_to_cpu(di->i_blkno));
420 else if (le16_to_cpu(gd->bg_bits) > max_bits)
421 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
422 (unsigned long long)le64_to_cpu(gd->bg_blkno),
423 le16_to_cpu(gd->bg_bits));
424 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
425 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
426 "claims that %u are free\n",
427 (unsigned long long)le64_to_cpu(gd->bg_blkno),
428 le16_to_cpu(gd->bg_bits),
429 le16_to_cpu(gd->bg_free_bits_count));
430 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
431 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
432 "max bitmap bits of %u\n",
433 (unsigned long long)le64_to_cpu(gd->bg_blkno),
434 le16_to_cpu(gd->bg_bits),
435 8 * le16_to_cpu(gd->bg_size));
436 else if (le16_to_cpu(gd->bg_chain) != input->chain)
437 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
438 "while input has %u set.\n",
439 (unsigned long long)le64_to_cpu(gd->bg_blkno),
440 le16_to_cpu(gd->bg_chain), input->chain);
441 else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
442 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
443 "input has %u clusters set\n",
444 (unsigned long long)le64_to_cpu(gd->bg_blkno),
445 le16_to_cpu(gd->bg_bits), input->clusters);
446 else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
447 mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
448 "but it should have %u set\n",
449 (unsigned long long)le64_to_cpu(gd->bg_blkno),
450 le16_to_cpu(gd->bg_bits),
451 input->frees * cl_bpc);
452 else
453 ret = 0;
454
455 return ret;
456}
457
458static int ocfs2_verify_group_and_input(struct inode *inode,
459 struct ocfs2_dinode *di,
460 struct ocfs2_new_group_input *input,
461 struct buffer_head *group_bh)
462{
463 u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
464 u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
465 u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
466 u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
467 u32 total_clusters = le32_to_cpu(di->i_clusters);
468 int ret = -EINVAL;
469
470 if (cluster < total_clusters)
471 mlog(ML_ERROR, "add a group which is in the current volume.\n");
472 else if (input->chain >= cl_count)
473 mlog(ML_ERROR, "input chain exceeds the limit.\n");
474 else if (next_free != cl_count && next_free != input->chain)
475 mlog(ML_ERROR,
476 "the add group should be in chain %u\n", next_free);
477 else if (total_clusters + input->clusters < total_clusters)
478 mlog(ML_ERROR, "add group's clusters overflow.\n");
479 else if (input->clusters > cl_cpg)
480 mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
481 else if (input->frees > input->clusters)
482 mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
483 else if (total_clusters % cl_cpg != 0)
484 mlog(ML_ERROR,
485 "the last group isn't full. Use group extend first.\n");
486 else if (input->group != ocfs2_which_cluster_group(inode, cluster))
487 mlog(ML_ERROR, "group blkno is invalid\n");
488 else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
489 mlog(ML_ERROR, "group descriptor check failed.\n");
490 else
491 ret = 0;
492
493 return ret;
494}
495
496/* Add a new group descriptor to global_bitmap. */
497int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
498{
499 int ret;
500 handle_t *handle;
501 struct buffer_head *main_bm_bh = NULL;
502 struct inode *main_bm_inode = NULL;
503 struct ocfs2_dinode *fe = NULL;
504 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
505 struct buffer_head *group_bh = NULL;
506 struct ocfs2_group_desc *group = NULL;
507 struct ocfs2_chain_list *cl;
508 struct ocfs2_chain_rec *cr;
509 u16 cl_bpc;
510
511 mlog_entry_void();
512
513 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
514 return -EROFS;
515
516 main_bm_inode = ocfs2_get_system_file_inode(osb,
517 GLOBAL_BITMAP_SYSTEM_INODE,
518 OCFS2_INVALID_SLOT);
519 if (!main_bm_inode) {
520 ret = -EINVAL;
521 mlog_errno(ret);
522 goto out;
523 }
524
525 mutex_lock(&main_bm_inode->i_mutex);
526
527 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
528 if (ret < 0) {
529 mlog_errno(ret);
530 goto out_mutex;
531 }
532
533 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
534
535 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
536 ocfs2_group_bitmap_size(osb->sb) * 8) {
537 mlog(ML_ERROR, "The disk is too old and small."
538 " Force to do offline resize.");
539 ret = -EINVAL;
540 goto out_unlock;
541 }
542
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
544 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group);
547 goto out_unlock;
548 }
549
550 ocfs2_set_new_buffer_uptodate(inode, group_bh);
551
552 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
553 if (ret) {
554 mlog_errno(ret);
555 goto out_unlock;
556 }
557
558 mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
559 (unsigned long long)input->group, input->chain, input->clusters);
560
561 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
562 if (IS_ERR(handle)) {
563 mlog_errno(PTR_ERR(handle));
564 ret = -EINVAL;
565 goto out_unlock;
566 }
567
568 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
569 cl = &fe->id2.i_chain;
570 cr = &cl->cl_recs[input->chain];
571
572 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
573 OCFS2_JOURNAL_ACCESS_WRITE);
574 if (ret < 0) {
575 mlog_errno(ret);
576 goto out_commit;
577 }
578
579 group = (struct ocfs2_group_desc *)group_bh->b_data;
580 group->bg_next_group = cr->c_blkno;
581
582 ret = ocfs2_journal_dirty(handle, group_bh);
583 if (ret < 0) {
584 mlog_errno(ret);
585 goto out_commit;
586 }
587
588 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
589 OCFS2_JOURNAL_ACCESS_WRITE);
590 if (ret < 0) {
591 mlog_errno(ret);
592 goto out_commit;
593 }
594
595 if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
596 le16_add_cpu(&cl->cl_next_free_rec, 1);
597 memset(cr, 0, sizeof(struct ocfs2_chain_rec));
598 }
599
600 cr->c_blkno = le64_to_cpu(input->group);
601 le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
602 le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
603
604 le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
605 le32_add_cpu(&fe->id1.bitmap1.i_used,
606 (input->clusters - input->frees) * cl_bpc);
607 le32_add_cpu(&fe->i_clusters, input->clusters);
608
609 ocfs2_journal_dirty(handle, main_bm_bh);
610
611 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
612 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
613 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
614 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
615 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
616
617 ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
618
619out_commit:
620 ocfs2_commit_trans(osb, handle);
621out_unlock:
622 brelse(group_bh);
623 brelse(main_bm_bh);
624
625 ocfs2_inode_unlock(main_bm_inode, 1);
626
627out_mutex:
628 mutex_unlock(&main_bm_inode->i_mutex);
629 iput(main_bm_inode);
630
631out:
632 mlog_exit_void();
633 return ret;
634}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_RESIZE_H
27#define OCFS2_RESIZE_H
28
29int ocfs2_group_extend(struct inode * inode, int new_clusters);
30int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
31
32#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
48 s16 slot_num, 48 s16 slot_num,
49 s16 node_num); 49 s16 node_num);
50 50
51/* Use the slot information we've collected to create a map of mounted
52 * nodes. Should be holding an EX on super block. assumes slot info is
53 * up to date. Note that we call this *after* we find a slot, so our
54 * own node should be set in the map too... */
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
56{
57 int i;
58 struct ocfs2_slot_info *si = osb->slot_info;
59
60 spin_lock(&si->si_lock);
61
62 for (i = 0; i < si->si_size; i++)
63 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
64 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
65 si->si_global_node_nums[i]);
66
67 spin_unlock(&si->si_lock);
68}
69
70/* post the slot information on disk into our slot_info struct. */ 51/* post the slot information on disk into our slot_info struct. */
71void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
72{ 53{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
52void ocfs2_clear_slot(struct ocfs2_slot_info *si, 52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num); 53 s16 slot_num);
54 54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num) 56 int slot_num)
59{ 57{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 u64 bg_blkno, 102 u64 bg_blkno,
103 u16 bg_bit_off); 103 u16 bg_bit_off);
104static inline u64 ocfs2_which_cluster_group(struct inode *inode,
105 u32 cluster);
106static inline void ocfs2_block_to_cluster_group(struct inode *inode, 104static inline void ocfs2_block_to_cluster_group(struct inode *inode,
107 u64 data_blkno, 105 u64 data_blkno,
108 u64 *bg_blkno, 106 u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
114 112
115 if (inode) { 113 if (inode) {
116 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 114 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
117 ocfs2_meta_unlock(inode, 1); 115 ocfs2_inode_unlock(inode, 1);
118 116
119 mutex_unlock(&inode->i_mutex); 117 mutex_unlock(&inode->i_mutex);
120 118
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131} 129}
132 130
133/* somewhat more expensive than our other checks, so use sparingly. */ 131/* somewhat more expensive than our other checks, so use sparingly. */
134static int ocfs2_check_group_descriptor(struct super_block *sb, 132int ocfs2_check_group_descriptor(struct super_block *sb,
135 struct ocfs2_dinode *di, 133 struct ocfs2_dinode *di,
136 struct ocfs2_group_desc *gd) 134 struct ocfs2_group_desc *gd)
137{ 135{
138 unsigned int max_bits; 136 unsigned int max_bits;
139 137
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
412 410
413 mutex_lock(&alloc_inode->i_mutex); 411 mutex_lock(&alloc_inode->i_mutex);
414 412
415 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 413 status = ocfs2_inode_lock(alloc_inode, &bh, 1);
416 if (status < 0) { 414 if (status < 0) {
417 mutex_unlock(&alloc_inode->i_mutex); 415 mutex_unlock(&alloc_inode->i_mutex);
418 iput(alloc_inode); 416 iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1443 1441
1444/* given a cluster offset, calculate which block group it belongs to 1442/* given a cluster offset, calculate which block group it belongs to
1445 * and return that block offset. */ 1443 * and return that block offset. */
1446static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1444u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1447 u32 cluster)
1448{ 1445{
1449 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1450 u32 group_no; 1447 u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1519 if (min_clusters > (osb->bitmap_cpg - 1)) { 1516 if (min_clusters > (osb->bitmap_cpg - 1)) {
1520 /* The only paths asking for contiguousness 1517 /* The only paths asking for contiguousness
1521 * should know about this already. */ 1518 * should know about this already. */
1522 mlog(ML_ERROR, "minimum allocation requested exceeds " 1519 mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1523 "group bitmap size!"); 1520 "group bitmap size %u!\n", min_clusters,
1521 osb->bitmap_cpg);
1524 status = -ENOSPC; 1522 status = -ENOSPC;
1525 goto bail; 1523 goto bail;
1526 } 1524 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
148 struct ocfs2_alloc_context *ac); 148 struct ocfs2_alloc_context *ac);
149 149
150/* given a cluster offset, calculate which block group it belongs to
151 * and return that block offset. */
152u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
153
154/* somewhat more expensive than our other checks, so use sparingly. */
155int ocfs2_check_group_descriptor(struct super_block *sb,
156 struct ocfs2_dinode *di,
157 struct ocfs2_group_desc *gd);
150#endif /* _CHAINALLOC_H_ */ 158#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee775420665..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
65#include "sysfile.h" 65#include "sysfile.h"
66#include "uptodate.h" 66#include "uptodate.h"
67#include "ver.h" 67#include "ver.h"
68#include "vote.h"
69 68
70#include "buffer_head_io.h" 69#include "buffer_head_io.h"
71 70
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
84 83
85struct mount_options 84struct mount_options
86{ 85{
86 unsigned long commit_interval;
87 unsigned long mount_opt; 87 unsigned long mount_opt;
88 unsigned int atime_quantum; 88 unsigned int atime_quantum;
89 signed short slot; 89 signed short slot;
90 unsigned int localalloc_opt;
90}; 91};
91 92
92static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
150 Opt_data_writeback, 151 Opt_data_writeback,
151 Opt_atime_quantum, 152 Opt_atime_quantum,
152 Opt_slot, 153 Opt_slot,
154 Opt_commit,
155 Opt_localalloc,
156 Opt_localflocks,
153 Opt_err, 157 Opt_err,
154}; 158};
155 159
@@ -165,6 +169,9 @@ static match_table_t tokens = {
165 {Opt_data_writeback, "data=writeback"}, 169 {Opt_data_writeback, "data=writeback"},
166 {Opt_atime_quantum, "atime_quantum=%u"}, 170 {Opt_atime_quantum, "atime_quantum=%u"},
167 {Opt_slot, "preferred_slot=%u"}, 171 {Opt_slot, "preferred_slot=%u"},
172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"},
168 {Opt_err, NULL} 175 {Opt_err, NULL}
169}; 176};
170 177
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
213 220
214 mlog_entry_void(); 221 mlog_entry_void();
215 222
216 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 223 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
217 if (IS_ERR(new)) { 224 if (IS_ERR(new)) {
218 status = PTR_ERR(new); 225 status = PTR_ERR(new);
219 mlog_errno(status); 226 mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
221 } 228 }
222 osb->root_inode = new; 229 osb->root_inode = new;
223 230
224 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 231 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
225 if (IS_ERR(new)) { 232 if (IS_ERR(new)) {
226 status = PTR_ERR(new); 233 status = PTR_ERR(new);
227 mlog_errno(status); 234 mlog_errno(status);
@@ -443,6 +450,8 @@ unlock_osb:
443 osb->s_mount_opt = parsed_options.mount_opt; 450 osb->s_mount_opt = parsed_options.mount_opt;
444 osb->s_atime_quantum = parsed_options.atime_quantum; 451 osb->s_atime_quantum = parsed_options.atime_quantum;
445 osb->preferred_slot = parsed_options.slot; 452 osb->preferred_slot = parsed_options.slot;
453 if (parsed_options.commit_interval)
454 osb->osb_commit_interval = parsed_options.commit_interval;
446 455
447 if (!ocfs2_is_hard_readonly(osb)) 456 if (!ocfs2_is_hard_readonly(osb))
448 ocfs2_set_journal_params(osb); 457 ocfs2_set_journal_params(osb);
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
597 osb->s_mount_opt = parsed_options.mount_opt; 606 osb->s_mount_opt = parsed_options.mount_opt;
598 osb->s_atime_quantum = parsed_options.atime_quantum; 607 osb->s_atime_quantum = parsed_options.atime_quantum;
599 osb->preferred_slot = parsed_options.slot; 608 osb->preferred_slot = parsed_options.slot;
609 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt;
600 611
601 sb->s_magic = OCFS2_SUPER_MAGIC; 612 sb->s_magic = OCFS2_SUPER_MAGIC;
602 613
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
747 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 758 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
748 options ? options : "(none)"); 759 options ? options : "(none)");
749 760
761 mopt->commit_interval = 0;
750 mopt->mount_opt = 0; 762 mopt->mount_opt = 0;
751 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
752 mopt->slot = OCFS2_INVALID_SLOT; 764 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
753 766
754 if (!options) { 767 if (!options) {
755 status = 1; 768 status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
816 if (option) 829 if (option)
817 mopt->slot = (s16)option; 830 mopt->slot = (s16)option;
818 break; 831 break;
832 case Opt_commit:
833 option = 0;
834 if (match_int(&args[0], &option)) {
835 status = 0;
836 goto bail;
837 }
838 if (option < 0)
839 return 0;
840 if (option == 0)
841 option = JBD_DEFAULT_MAX_COMMIT_AGE;
842 mopt->commit_interval = HZ * option;
843 break;
844 case Opt_localalloc:
845 option = 0;
846 if (match_int(&args[0], &option)) {
847 status = 0;
848 goto bail;
849 }
850 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
851 mopt->localalloc_opt = option;
852 break;
853 case Opt_localflocks:
854 /*
855 * Changing this during remount could race
856 * flock() requests, or "unbalance" existing
857 * ones (e.g., a lock is taken in one mode but
858 * dropped in the other). If users care enough
859 * to flip locking modes during remount, we
860 * could add a "local" flag to individual
861 * flock structures for proper tracking of
862 * state.
863 */
864 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break;
819 default: 867 default:
820 mlog(ML_ERROR, 868 mlog(ML_ERROR,
821 "Unrecognized mount option \"%s\" " 869 "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 912 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 913 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866 914
915 if (osb->osb_commit_interval)
916 seq_printf(s, ",commit=%u",
917 (unsigned) (osb->osb_commit_interval / HZ));
918
919 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
920 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
921
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,");
924
867 return 0; 925 return 0;
868} 926}
869 927
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
965 goto bail; 1023 goto bail;
966 } 1024 }
967 1025
968 status = ocfs2_meta_lock(inode, &bh, 0); 1026 status = ocfs2_inode_lock(inode, &bh, 0);
969 if (status < 0) { 1027 if (status < 0) {
970 mlog_errno(status); 1028 mlog_errno(status);
971 goto bail; 1029 goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
989 1047
990 brelse(bh); 1048 brelse(bh);
991 1049
992 ocfs2_meta_unlock(inode, 0); 1050 ocfs2_inode_unlock(inode, 0);
993 status = 0; 1051 status = 0;
994bail: 1052bail:
995 if (inode) 1053 if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
1020 oi->ip_clusters = 0; 1078 oi->ip_clusters = 0;
1021 1079
1022 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1080 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1023 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 1081 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1024 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
1025 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1082 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1026 1083
1027 ocfs2_metadata_cache_init(&oi->vfs_inode); 1084 ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
1117 goto leave; 1174 goto leave;
1118 } 1175 }
1119 1176
1120 status = ocfs2_register_hb_callbacks(osb);
1121 if (status < 0) {
1122 mlog_errno(status);
1123 goto leave;
1124 }
1125
1126 status = ocfs2_dlm_init(osb); 1177 status = ocfs2_dlm_init(osb);
1127 if (status < 0) { 1178 if (status < 0) {
1128 mlog_errno(status); 1179 mlog_errno(status);
1129 goto leave; 1180 goto leave;
1130 } 1181 }
1131 1182
1132 /* requires vote_thread to be running. */
1133 status = ocfs2_register_net_handlers(osb);
1134 if (status < 0) {
1135 mlog_errno(status);
1136 goto leave;
1137 }
1138
1139 status = ocfs2_super_lock(osb, 1); 1183 status = ocfs2_super_lock(osb, 1);
1140 if (status < 0) { 1184 if (status < 0) {
1141 mlog_errno(status); 1185 mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1150 goto leave; 1194 goto leave;
1151 } 1195 }
1152 1196
1153 ocfs2_populate_mounted_map(osb);
1154
1155 /* load all node-local system inodes */ 1197 /* load all node-local system inodes */
1156 status = ocfs2_init_local_system_inodes(osb); 1198 status = ocfs2_init_local_system_inodes(osb);
1157 if (status < 0) { 1199 if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1174 if (ocfs2_mount_local(osb)) 1216 if (ocfs2_mount_local(osb))
1175 goto leave; 1217 goto leave;
1176 1218
1177 /* This should be sent *after* we recovered our journal as it
1178 * will cause other nodes to unmark us as needing
1179 * recovery. However, we need to send it *before* dropping the
1180 * super block lock as otherwise their recovery threads might
1181 * try to clean us up while we're live! */
1182 status = ocfs2_request_mount_vote(osb);
1183 if (status < 0)
1184 mlog_errno(status);
1185
1186leave: 1219leave:
1187 if (unlock_super) 1220 if (unlock_super)
1188 ocfs2_super_unlock(osb, 1); 1221 ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1240 mlog_errno(tmp); 1273 mlog_errno(tmp);
1241 return; 1274 return;
1242 } 1275 }
1243
1244 tmp = ocfs2_request_umount_vote(osb);
1245 if (tmp < 0)
1246 mlog_errno(tmp);
1247 } 1276 }
1248 1277
1249 if (osb->slot_num != OCFS2_INVALID_SLOT) 1278 if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1254 1283
1255 ocfs2_release_system_inodes(osb); 1284 ocfs2_release_system_inodes(osb);
1256 1285
1257 if (osb->dlm) { 1286 if (osb->dlm)
1258 ocfs2_unregister_net_handlers(osb);
1259
1260 ocfs2_dlm_shutdown(osb); 1287 ocfs2_dlm_shutdown(osb);
1261 }
1262
1263 ocfs2_clear_hb_callbacks(osb);
1264 1288
1265 debugfs_remove(osb->osb_debug_root); 1289 debugfs_remove(osb->osb_debug_root);
1266 1290
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1315 int i, cbits, bbits; 1339 int i, cbits, bbits;
1316 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1317 struct inode *inode = NULL; 1341 struct inode *inode = NULL;
1318 struct buffer_head *bitmap_bh = NULL;
1319 struct ocfs2_journal *journal; 1342 struct ocfs2_journal *journal;
1320 __le32 uuid_net_key; 1343 __le32 uuid_net_key;
1321 struct ocfs2_super *osb; 1344 struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1344 osb->s_sectsize_bits = blksize_bits(sector_size); 1367 osb->s_sectsize_bits = blksize_bits(sector_size);
1345 BUG_ON(!osb->s_sectsize_bits); 1368 BUG_ON(!osb->s_sectsize_bits);
1346 1369
1347 osb->net_response_ids = 0;
1348 spin_lock_init(&osb->net_response_lock);
1349 INIT_LIST_HEAD(&osb->net_response_list);
1350
1351 INIT_LIST_HEAD(&osb->osb_net_handlers);
1352 init_waitqueue_head(&osb->recovery_event); 1370 init_waitqueue_head(&osb->recovery_event);
1353 spin_lock_init(&osb->vote_task_lock); 1371 spin_lock_init(&osb->dc_task_lock);
1354 init_waitqueue_head(&osb->vote_event); 1372 init_waitqueue_head(&osb->dc_event);
1355 osb->vote_work_sequence = 0; 1373 osb->dc_work_sequence = 0;
1356 osb->vote_wake_sequence = 0; 1374 osb->dc_wake_sequence = 0;
1357 INIT_LIST_HEAD(&osb->blocked_lock_list); 1375 INIT_LIST_HEAD(&osb->blocked_lock_list);
1358 osb->blocked_lock_count = 0; 1376 osb->blocked_lock_count = 0;
1359 INIT_LIST_HEAD(&osb->vote_list);
1360 spin_lock_init(&osb->osb_lock); 1377 spin_lock_init(&osb->osb_lock);
1361 1378
1362 atomic_set(&osb->alloc_stats.moves, 0); 1379 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1496 } 1513 }
1497 1514
1498 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1515 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
1499 osb->net_key = le32_to_cpu(uuid_net_key);
1500 1516
1501 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 1517 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1502 osb->vol_label[63] = '\0'; 1518 osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1539 } 1555 }
1540 1556
1541 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1557 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1542
1543 /* We don't have a cluster lock on the bitmap here because
1544 * we're only interested in static information and the extra
1545 * complexity at mount time isn't worht it. Don't pass the
1546 * inode in to the read function though as we don't want it to
1547 * be put in the cache. */
1548 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1549 NULL);
1550 iput(inode); 1558 iput(inode);
1551 if (status < 0) {
1552 mlog_errno(status);
1553 goto bail;
1554 }
1555 1559
1556 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1560 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
1557 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1558 brelse(bitmap_bh);
1559 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1560 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
1561 1561
1562 status = ocfs2_init_slot_info(osb); 1562 status = ocfs2_init_slot_info(osb);
1563 if (status < 0) { 1563 if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
112 goto bail; 112 goto bail;
113 } 113 }
114 114
115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); 115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
116 if (IS_ERR(inode)) { 116 if (IS_ERR(inode)) {
117 mlog_errno(PTR_ERR(inode)); 117 mlog_errno(PTR_ERR(inode));
118 inode = NULL; 118 inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
29 29
30#include "ver.h" 30#include "ver.h"
31 31
32#define OCFS2_BUILD_VERSION "1.3.3" 32#define OCFS2_BUILD_VERSION "1.5.0"
33 33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35 35
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * description here
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/kthread.h>
30
31#include <cluster/heartbeat.h>
32#include <cluster/nodemanager.h>
33#include <cluster/tcp.h>
34
35#include <dlm/dlmapi.h>
36
37#define MLOG_MASK_PREFIX ML_VOTE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
41
42#include "alloc.h"
43#include "dlmglue.h"
44#include "extent_map.h"
45#include "heartbeat.h"
46#include "inode.h"
47#include "journal.h"
48#include "slot_map.h"
49#include "vote.h"
50
51#include "buffer_head_io.h"
52
53#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
54#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
55struct ocfs2_msg_hdr
56{
57 __be32 h_response_id; /* used to lookup message handle on sending
58 * node. */
59 __be32 h_request;
60 __be64 h_blkno;
61 __be32 h_generation;
62 __be32 h_node_num; /* node sending this particular message. */
63};
64
65struct ocfs2_vote_msg
66{
67 struct ocfs2_msg_hdr v_hdr;
68 __be32 v_reserved1;
69} __attribute__ ((packed));
70
71/* Responses are given these values to maintain backwards
72 * compatibility with older ocfs2 versions */
73#define OCFS2_RESPONSE_OK (0)
74#define OCFS2_RESPONSE_BUSY (-16)
75#define OCFS2_RESPONSE_BAD_MSG (-22)
76
77struct ocfs2_response_msg
78{
79 struct ocfs2_msg_hdr r_hdr;
80 __be32 r_response;
81} __attribute__ ((packed));
82
83struct ocfs2_vote_work {
84 struct list_head w_list;
85 struct ocfs2_vote_msg w_msg;
86};
87
88enum ocfs2_vote_request {
89 OCFS2_VOTE_REQ_INVALID = 0,
90 OCFS2_VOTE_REQ_MOUNT,
91 OCFS2_VOTE_REQ_UMOUNT,
92 OCFS2_VOTE_REQ_LAST
93};
94
95static inline int ocfs2_is_valid_vote_request(int request)
96{
97 return OCFS2_VOTE_REQ_INVALID < request &&
98 request < OCFS2_VOTE_REQ_LAST;
99}
100
101typedef void (*ocfs2_net_response_callback)(void *priv,
102 struct ocfs2_response_msg *resp);
103struct ocfs2_net_response_cb {
104 ocfs2_net_response_callback rc_cb;
105 void *rc_priv;
106};
107
108struct ocfs2_net_wait_ctxt {
109 struct list_head n_list;
110 u32 n_response_id;
111 wait_queue_head_t n_event;
112 struct ocfs2_node_map n_node_map;
113 int n_response; /* an agreggate response. 0 if
114 * all nodes are go, < 0 on any
115 * negative response from any
116 * node or network error. */
117 struct ocfs2_net_response_cb *n_callback;
118};
119
120static void ocfs2_process_mount_request(struct ocfs2_super *osb,
121 unsigned int node_num)
122{
123 mlog(0, "MOUNT vote from node %u\n", node_num);
124 /* The other node only sends us this message when he has an EX
125 * on the superblock, so our recovery threads (if having been
126 * launched) are waiting on it.*/
127 ocfs2_recovery_map_clear(osb, node_num);
128 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
129
130 /* We clear the umount map here because a node may have been
131 * previously mounted, safely unmounted but never stopped
132 * heartbeating - in which case we'd have a stale entry. */
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136static void ocfs2_process_umount_request(struct ocfs2_super *osb,
137 unsigned int node_num)
138{
139 mlog(0, "UMOUNT vote from node %u\n", node_num);
140 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
141 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
142}
143
144static void ocfs2_process_vote(struct ocfs2_super *osb,
145 struct ocfs2_vote_msg *msg)
146{
147 int net_status, vote_response;
148 unsigned int node_num;
149 u64 blkno;
150 enum ocfs2_vote_request request;
151 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
152 struct ocfs2_response_msg response;
153
154 /* decode the network mumbo jumbo into local variables. */
155 request = be32_to_cpu(hdr->h_request);
156 blkno = be64_to_cpu(hdr->h_blkno);
157 node_num = be32_to_cpu(hdr->h_node_num);
158
159 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
160 request, (unsigned long long)blkno, node_num);
161
162 if (!ocfs2_is_valid_vote_request(request)) {
163 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
164 request, node_num);
165 vote_response = OCFS2_RESPONSE_BAD_MSG;
166 goto respond;
167 }
168
169 vote_response = OCFS2_RESPONSE_OK;
170
171 switch (request) {
172 case OCFS2_VOTE_REQ_UMOUNT:
173 ocfs2_process_umount_request(osb, node_num);
174 goto respond;
175 case OCFS2_VOTE_REQ_MOUNT:
176 ocfs2_process_mount_request(osb, node_num);
177 goto respond;
178 default:
179 /* avoids a gcc warning */
180 break;
181 }
182
183respond:
184 /* Response struture is small so we just put it on the stack
185 * and stuff it inline. */
186 memset(&response, 0, sizeof(struct ocfs2_response_msg));
187 response.r_hdr.h_response_id = hdr->h_response_id;
188 response.r_hdr.h_blkno = hdr->h_blkno;
189 response.r_hdr.h_generation = hdr->h_generation;
190 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
191 response.r_response = cpu_to_be32(vote_response);
192
193 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
194 osb->net_key,
195 &response,
196 sizeof(struct ocfs2_response_msg),
197 node_num,
198 NULL);
199 /* We still want to error print for ENOPROTOOPT here. The
200 * sending node shouldn't have unregistered his net handler
201 * without sending an unmount vote 1st */
202 if (net_status < 0
203 && net_status != -ETIMEDOUT
204 && net_status != -ENOTCONN)
205 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
206 node_num, net_status);
207}
208
209static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
210{
211 unsigned long processed;
212 struct ocfs2_lock_res *lockres;
213 struct ocfs2_vote_work *work;
214
215 mlog_entry_void();
216
217 spin_lock(&osb->vote_task_lock);
218 /* grab this early so we know to try again if a state change and
219 * wake happens part-way through our work */
220 osb->vote_work_sequence = osb->vote_wake_sequence;
221
222 processed = osb->blocked_lock_count;
223 while (processed) {
224 BUG_ON(list_empty(&osb->blocked_lock_list));
225
226 lockres = list_entry(osb->blocked_lock_list.next,
227 struct ocfs2_lock_res, l_blocked_list);
228 list_del_init(&lockres->l_blocked_list);
229 osb->blocked_lock_count--;
230 spin_unlock(&osb->vote_task_lock);
231
232 BUG_ON(!processed);
233 processed--;
234
235 ocfs2_process_blocked_lock(osb, lockres);
236
237 spin_lock(&osb->vote_task_lock);
238 }
239
240 while (osb->vote_count) {
241 BUG_ON(list_empty(&osb->vote_list));
242 work = list_entry(osb->vote_list.next,
243 struct ocfs2_vote_work, w_list);
244 list_del(&work->w_list);
245 osb->vote_count--;
246 spin_unlock(&osb->vote_task_lock);
247
248 ocfs2_process_vote(osb, &work->w_msg);
249 kfree(work);
250
251 spin_lock(&osb->vote_task_lock);
252 }
253 spin_unlock(&osb->vote_task_lock);
254
255 mlog_exit_void();
256}
257
258static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
259{
260 int empty = 0;
261
262 spin_lock(&osb->vote_task_lock);
263 if (list_empty(&osb->blocked_lock_list) &&
264 list_empty(&osb->vote_list))
265 empty = 1;
266
267 spin_unlock(&osb->vote_task_lock);
268 return empty;
269}
270
271static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
272{
273 int should_wake = 0;
274
275 spin_lock(&osb->vote_task_lock);
276 if (osb->vote_work_sequence != osb->vote_wake_sequence)
277 should_wake = 1;
278 spin_unlock(&osb->vote_task_lock);
279
280 return should_wake;
281}
282
283int ocfs2_vote_thread(void *arg)
284{
285 int status = 0;
286 struct ocfs2_super *osb = arg;
287
288 /* only quit once we've been asked to stop and there is no more
289 * work available */
290 while (!(kthread_should_stop() &&
291 ocfs2_vote_thread_lists_empty(osb))) {
292
293 wait_event_interruptible(osb->vote_event,
294 ocfs2_vote_thread_should_wake(osb) ||
295 kthread_should_stop());
296
297 mlog(0, "vote_thread: awoken\n");
298
299 ocfs2_vote_thread_do_work(osb);
300 }
301
302 osb->vote_task = NULL;
303 return status;
304}
305
306static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
307{
308 struct ocfs2_net_wait_ctxt *w;
309
310 w = kzalloc(sizeof(*w), GFP_NOFS);
311 if (!w) {
312 mlog_errno(-ENOMEM);
313 goto bail;
314 }
315
316 INIT_LIST_HEAD(&w->n_list);
317 init_waitqueue_head(&w->n_event);
318 ocfs2_node_map_init(&w->n_node_map);
319 w->n_response_id = response_id;
320 w->n_callback = NULL;
321bail:
322 return w;
323}
324
325static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
326{
327 unsigned int ret;
328
329 spin_lock(&osb->net_response_lock);
330 ret = ++osb->net_response_ids;
331 spin_unlock(&osb->net_response_lock);
332
333 return ret;
334}
335
336static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
337 struct ocfs2_net_wait_ctxt *w)
338{
339 spin_lock(&osb->net_response_lock);
340 list_del(&w->n_list);
341 spin_unlock(&osb->net_response_lock);
342}
343
344static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
345 struct ocfs2_net_wait_ctxt *w)
346{
347 spin_lock(&osb->net_response_lock);
348 list_add_tail(&w->n_list,
349 &osb->net_response_list);
350 spin_unlock(&osb->net_response_lock);
351}
352
353static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
354 struct ocfs2_net_wait_ctxt *w,
355 int node_num)
356{
357 assert_spin_locked(&osb->net_response_lock);
358
359 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
360 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
361 wake_up(&w->n_event);
362}
363
364/* Intended to be called from the node down callback, we fake remove
365 * the node from all our response contexts */
366void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
367 int node_num)
368{
369 struct list_head *p;
370 struct ocfs2_net_wait_ctxt *w = NULL;
371
372 spin_lock(&osb->net_response_lock);
373
374 list_for_each(p, &osb->net_response_list) {
375 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
376
377 __ocfs2_mark_node_responded(osb, w, node_num);
378 }
379
380 spin_unlock(&osb->net_response_lock);
381}
382
383static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
384 struct ocfs2_vote_msg *request,
385 unsigned int response_id,
386 int *response,
387 struct ocfs2_net_response_cb *callback)
388{
389 int status, i, remote_err;
390 struct ocfs2_net_wait_ctxt *w = NULL;
391 int dequeued = 0;
392
393 mlog_entry_void();
394
395 w = ocfs2_new_net_wait_ctxt(response_id);
396 if (!w) {
397 status = -ENOMEM;
398 mlog_errno(status);
399 goto bail;
400 }
401 w->n_callback = callback;
402
403 /* we're pretty much ready to go at this point, and this fills
404 * in n_response which we need anyway... */
405 ocfs2_queue_net_wait_ctxt(osb, w);
406
407 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
408
409 while (i != O2NM_INVALID_NODE_NUM) {
410 if (i != osb->node_num) {
411 mlog(0, "trying to send request to node %i\n", i);
412 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
413
414 remote_err = 0;
415 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
416 osb->net_key,
417 request,
418 sizeof(*request),
419 i,
420 &remote_err);
421 if (status == -ETIMEDOUT) {
422 mlog(0, "remote node %d timed out!\n", i);
423 status = -EAGAIN;
424 goto bail;
425 }
426 if (remote_err < 0) {
427 status = remote_err;
428 mlog(0, "remote error %d on node %d!\n",
429 remote_err, i);
430 mlog_errno(status);
431 goto bail;
432 }
433 if (status < 0) {
434 mlog_errno(status);
435 goto bail;
436 }
437 }
438 i++;
439 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
440 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
441 }
442 mlog(0, "done sending, now waiting on responses...\n");
443
444 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
445
446 ocfs2_dequeue_net_wait_ctxt(osb, w);
447 dequeued = 1;
448
449 *response = w->n_response;
450 status = 0;
451bail:
452 if (w) {
453 if (!dequeued)
454 ocfs2_dequeue_net_wait_ctxt(osb, w);
455 kfree(w);
456 }
457
458 mlog_exit(status);
459 return status;
460}
461
462static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
463 u64 blkno,
464 unsigned int generation,
465 enum ocfs2_vote_request type)
466{
467 struct ocfs2_vote_msg *request;
468 struct ocfs2_msg_hdr *hdr;
469
470 BUG_ON(!ocfs2_is_valid_vote_request(type));
471
472 request = kzalloc(sizeof(*request), GFP_NOFS);
473 if (!request) {
474 mlog_errno(-ENOMEM);
475 } else {
476 hdr = &request->v_hdr;
477 hdr->h_node_num = cpu_to_be32(osb->node_num);
478 hdr->h_request = cpu_to_be32(type);
479 hdr->h_blkno = cpu_to_be64(blkno);
480 hdr->h_generation = cpu_to_be32(generation);
481 }
482
483 return request;
484}
485
486/* Complete the buildup of a new vote request and process the
487 * broadcast return value. */
488static int ocfs2_do_request_vote(struct ocfs2_super *osb,
489 struct ocfs2_vote_msg *request,
490 struct ocfs2_net_response_cb *callback)
491{
492 int status, response = -EBUSY;
493 unsigned int response_id;
494 struct ocfs2_msg_hdr *hdr;
495
496 response_id = ocfs2_new_response_id(osb);
497
498 hdr = &request->v_hdr;
499 hdr->h_response_id = cpu_to_be32(response_id);
500
501 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
502 callback);
503 if (status < 0) {
504 mlog_errno(status);
505 goto bail;
506 }
507
508 status = response;
509bail:
510
511 return status;
512}
513
514int ocfs2_request_mount_vote(struct ocfs2_super *osb)
515{
516 int status;
517 struct ocfs2_vote_msg *request = NULL;
518
519 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
520 if (!request) {
521 status = -ENOMEM;
522 goto bail;
523 }
524
525 status = -EAGAIN;
526 while (status == -EAGAIN) {
527 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
528 signal_pending(current)) {
529 status = -ERESTARTSYS;
530 goto bail;
531 }
532
533 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
534 osb->node_num)) {
535 status = 0;
536 goto bail;
537 }
538
539 status = ocfs2_do_request_vote(osb, request, NULL);
540 }
541
542bail:
543 kfree(request);
544 return status;
545}
546
547int ocfs2_request_umount_vote(struct ocfs2_super *osb)
548{
549 int status;
550 struct ocfs2_vote_msg *request = NULL;
551
552 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
553 if (!request) {
554 status = -ENOMEM;
555 goto bail;
556 }
557
558 status = -EAGAIN;
559 while (status == -EAGAIN) {
560 /* Do not check signals on this vote... We really want
561 * this one to go all the way through. */
562
563 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
564 osb->node_num)) {
565 status = 0;
566 goto bail;
567 }
568
569 status = ocfs2_do_request_vote(osb, request, NULL);
570 }
571
572bail:
573 kfree(request);
574 return status;
575}
576
577/* TODO: This should eventually be a hash table! */
578static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
579 u32 response_id)
580{
581 struct list_head *p;
582 struct ocfs2_net_wait_ctxt *w = NULL;
583
584 list_for_each(p, &osb->net_response_list) {
585 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
586 if (response_id == w->n_response_id)
587 break;
588 w = NULL;
589 }
590
591 return w;
592}
593
594/* Translate response codes into local node errno values */
595static inline int ocfs2_translate_response(int response)
596{
597 int ret;
598
599 switch (response) {
600 case OCFS2_RESPONSE_OK:
601 ret = 0;
602 break;
603
604 case OCFS2_RESPONSE_BUSY:
605 ret = -EBUSY;
606 break;
607
608 default:
609 ret = -EINVAL;
610 }
611
612 return ret;
613}
614
615static int ocfs2_handle_response_message(struct o2net_msg *msg,
616 u32 len,
617 void *data, void **ret_data)
618{
619 unsigned int response_id, node_num;
620 int response_status;
621 struct ocfs2_super *osb = data;
622 struct ocfs2_response_msg *resp;
623 struct ocfs2_net_wait_ctxt * w;
624 struct ocfs2_net_response_cb *resp_cb;
625
626 resp = (struct ocfs2_response_msg *) msg->buf;
627
628 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
629 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
630 response_status =
631 ocfs2_translate_response(be32_to_cpu(resp->r_response));
632
633 mlog(0, "received response message:\n");
634 mlog(0, "h_response_id = %u\n", response_id);
635 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
636 mlog(0, "h_blkno = %llu\n",
637 (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
638 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
639 mlog(0, "h_node_num = %u\n", node_num);
640 mlog(0, "r_response = %d\n", response_status);
641
642 spin_lock(&osb->net_response_lock);
643 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
644 if (!w) {
645 mlog(0, "request not found!\n");
646 goto bail;
647 }
648 resp_cb = w->n_callback;
649
650 if (response_status && (!w->n_response)) {
651 /* we only really need one negative response so don't
652 * set it twice. */
653 w->n_response = response_status;
654 }
655
656 if (resp_cb) {
657 spin_unlock(&osb->net_response_lock);
658
659 resp_cb->rc_cb(resp_cb->rc_priv, resp);
660
661 spin_lock(&osb->net_response_lock);
662 }
663
664 __ocfs2_mark_node_responded(osb, w, node_num);
665bail:
666 spin_unlock(&osb->net_response_lock);
667
668 return 0;
669}
670
671static int ocfs2_handle_vote_message(struct o2net_msg *msg,
672 u32 len,
673 void *data, void **ret_data)
674{
675 int status;
676 struct ocfs2_super *osb = data;
677 struct ocfs2_vote_work *work;
678
679 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
680 if (!work) {
681 status = -ENOMEM;
682 mlog_errno(status);
683 goto bail;
684 }
685
686 INIT_LIST_HEAD(&work->w_list);
687 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
688
689 mlog(0, "scheduling vote request:\n");
690 mlog(0, "h_response_id = %u\n",
691 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
692 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
693 mlog(0, "h_blkno = %llu\n",
694 (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
695 mlog(0, "h_generation = %u\n",
696 be32_to_cpu(work->w_msg.v_hdr.h_generation));
697 mlog(0, "h_node_num = %u\n",
698 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
699
700 spin_lock(&osb->vote_task_lock);
701 list_add_tail(&work->w_list, &osb->vote_list);
702 osb->vote_count++;
703 spin_unlock(&osb->vote_task_lock);
704
705 ocfs2_kick_vote_thread(osb);
706
707 status = 0;
708bail:
709 return status;
710}
711
712void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
713{
714 if (!osb->net_key)
715 return;
716
717 o2net_unregister_handler_list(&osb->osb_net_handlers);
718
719 if (!list_empty(&osb->net_response_list))
720 mlog(ML_ERROR, "net response list not empty!\n");
721
722 osb->net_key = 0;
723}
724
725int ocfs2_register_net_handlers(struct ocfs2_super *osb)
726{
727 int status = 0;
728
729 if (ocfs2_mount_local(osb))
730 return 0;
731
732 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
733 osb->net_key,
734 sizeof(struct ocfs2_response_msg),
735 ocfs2_handle_response_message,
736 osb, NULL, &osb->osb_net_handlers);
737 if (status) {
738 mlog_errno(status);
739 goto bail;
740 }
741
742 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
743 osb->net_key,
744 sizeof(struct ocfs2_vote_msg),
745 ocfs2_handle_vote_message,
746 osb, NULL, &osb->osb_net_handlers);
747 if (status) {
748 mlog_errno(status);
749 goto bail;
750 }
751bail:
752 if (status < 0)
753 ocfs2_unregister_net_handlers(osb);
754
755 return status;
756}