aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
author David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:29:41 -0500
committer David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:30:32 -0500
commit c1f3ee120bb61045b1c0a3ead620d1d65af47130 (patch)
tree 908430bf2b47fe8e96ac623ae7ab6dd5698d0938 /fs/ocfs2
parent e619a75ff6201b567a539e787aa9af9bc63a3187 (diff)
parent 9135f1901ee6449dfe338adf6e40e9c2025b8150 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Makefile5
-rw-r--r--fs/ocfs2/alloc.c78
-rw-r--r--fs/ocfs2/aops.c161
-rw-r--r--fs/ocfs2/buffer_head_io.c65
-rw-r--r--fs/ocfs2/buffer_head_io.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/masklog.c4
-rw-r--r--fs/ocfs2/cluster/masklog.h2
-rw-r--r--fs/ocfs2/cluster/sys.c83
-rw-r--r--fs/ocfs2/cluster/tcp.c24
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h8
-rw-r--r--fs/ocfs2/cluster/ver.c2
-rw-r--r--fs/ocfs2/dcache.c30
-rw-r--r--fs/ocfs2/dir.c14
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c19
-rw-r--r--fs/ocfs2/dlm/dlmver.c2
-rw-r--r--fs/ocfs2/dlmglue.c571
-rw-r--r--fs/ocfs2/dlmglue.h31
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/file.c208
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/heartbeat.c80
-rw-r--r--fs/ocfs2/heartbeat.h2
-rw-r--r--fs/ocfs2/inode.c90
-rw-r--r--fs/ocfs2/inode.h10
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/journal.c64
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/localalloc.c55
-rw-r--r--fs/ocfs2/locks.c125
-rw-r--r--fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h)29
-rw-r--r--fs/ocfs2/mmap.c17
-rw-r--r--fs/ocfs2/namei.c79
-rw-r--r--fs/ocfs2/ocfs2.h35
-rw-r--r--fs/ocfs2/ocfs2_fs.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/resize.c634
-rw-r--r--fs/ocfs2/resize.h32
-rw-r--r--fs/ocfs2/slot_map.c19
-rw-r--r--fs/ocfs2/slot_map.h2
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/suballoc.h8
-rw-r--r--fs/ocfs2/super.c146
-rw-r--r--fs/ocfs2/sysfile.c2
-rw-r--r--fs/ocfs2/ver.c2
-rw-r--r--fs/ocfs2/vote.c756
51 files changed, 2024 insertions, 1589 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
19 ioctl.o \ 19 ioctl.o \
20 journal.o \ 20 journal.o \
21 localalloc.o \ 21 localalloc.o \
22 locks.o \
22 mmap.o \ 23 mmap.o \
23 namei.o \ 24 namei.o \
25 resize.o \
24 slot_map.o \ 26 slot_map.o \
25 suballoc.o \ 27 suballoc.o \
26 super.o \ 28 super.o \
27 symlink.o \ 29 symlink.o \
28 sysfile.o \ 30 sysfile.o \
29 uptodate.o \ 31 uptodate.o \
30 ver.o \ 32 ver.o
31 vote.o
32 33
33obj-$(CONFIG_OCFS2_FS) += cluster/ 34obj-$(CONFIG_OCFS2_FS) += cluster/
34obj-$(CONFIG_OCFS2_FS) += dlm/ 35obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4ba7f0bdc248..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2389 goto out; 2389 goto out;
2390 } 2390 }
2391 2391
2392 /*
2393 * Caller might still want to make changes to the
2394 * tree root, so re-add it to the journal here.
2395 */
2396 ret = ocfs2_journal_access(handle, inode,
2397 path_root_bh(left_path),
2398 OCFS2_JOURNAL_ACCESS_WRITE);
2399 if (ret) {
2400 mlog_errno(ret);
2401 goto out;
2402 }
2403
2392 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2404 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2393 right_path, subtree_root, 2405 right_path, subtree_root,
2394 dealloc, &deleted); 2406 dealloc, &deleted);
@@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode,
3289 int ret, subtree_index; 3301 int ret, subtree_index;
3290 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3302 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3291 3303
3292 /*
3293 * Pass both paths to the journal. The majority of inserts
3294 * will be touching all components anyway.
3295 */
3296 ret = ocfs2_journal_access_path(inode, handle, right_path);
3297 if (ret < 0) {
3298 mlog_errno(ret);
3299 goto out;
3300 }
3301
3302 if (left_path) { 3304 if (left_path) {
3303 int credits = handle->h_buffer_credits; 3305 int credits = handle->h_buffer_credits;
3304 3306
@@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode,
3323 } 3325 }
3324 } 3326 }
3325 3327
3328 /*
3329 * Pass both paths to the journal. The majority of inserts
3330 * will be touching all components anyway.
3331 */
3332 ret = ocfs2_journal_access_path(inode, handle, right_path);
3333 if (ret < 0) {
3334 mlog_errno(ret);
3335 goto out;
3336 }
3337
3326 if (insert->ins_split != SPLIT_NONE) { 3338 if (insert->ins_split != SPLIT_NONE) {
3327 /* 3339 /*
3328 * We could call ocfs2_insert_at_leaf() for some types 3340 * We could call ocfs2_insert_at_leaf() for some types
@@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode,
3331 */ 3343 */
3332 ocfs2_split_record(inode, left_path, right_path, 3344 ocfs2_split_record(inode, left_path, right_path,
3333 insert_rec, insert->ins_split); 3345 insert_rec, insert->ins_split);
3346
3347 /*
3348 * Split might have modified either leaf and we don't
3349 * have a guarantee that the later edge insert will
3350 * dirty this for us.
3351 */
3352 if (left_path)
3353 ret = ocfs2_journal_dirty(handle,
3354 path_leaf_bh(left_path));
3355 if (ret)
3356 mlog_errno(ret);
3334 } else 3357 } else
3335 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 3358 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3336 insert, inode); 3359 insert, inode);
@@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3430 mlog_errno(ret); 3453 mlog_errno(ret);
3431 goto out; 3454 goto out;
3432 } 3455 }
3456
3457 /*
3458 * ocfs2_rotate_tree_right() might have extended the
3459 * transaction without re-journaling our tree root.
3460 */
3461 ret = ocfs2_journal_access(handle, inode, di_bh,
3462 OCFS2_JOURNAL_ACCESS_WRITE);
3463 if (ret) {
3464 mlog_errno(ret);
3465 goto out;
3466 }
3433 } else if (type->ins_appending == APPEND_TAIL 3467 } else if (type->ins_appending == APPEND_TAIL
3434 && type->ins_contig != CONTIG_LEFT) { 3468 && type->ins_contig != CONTIG_LEFT) {
3435 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 3469 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
@@ -3941,12 +3975,12 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3941{ 3975{
3942 int ret = 0; 3976 int ret = 0;
3943 struct ocfs2_extent_list *el = path_leaf_el(path); 3977 struct ocfs2_extent_list *el = path_leaf_el(path);
3944 struct buffer_head *eb_bh, *last_eb_bh = NULL; 3978 struct buffer_head *last_eb_bh = NULL;
3945 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3979 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3946 struct ocfs2_merge_ctxt ctxt; 3980 struct ocfs2_merge_ctxt ctxt;
3947 struct ocfs2_extent_list *rightmost_el; 3981 struct ocfs2_extent_list *rightmost_el;
3948 3982
3949 if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) { 3983 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3950 ret = -EIO; 3984 ret = -EIO;
3951 mlog_errno(ret); 3985 mlog_errno(ret);
3952 goto out; 3986 goto out;
@@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3960 goto out; 3994 goto out;
3961 } 3995 }
3962 3996
3963 eb_bh = path_leaf_bh(path);
3964 ret = ocfs2_journal_access(handle, inode, eb_bh,
3965 OCFS2_JOURNAL_ACCESS_WRITE);
3966 if (ret) {
3967 mlog_errno(ret);
3968 goto out;
3969 }
3970
3971 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, 3997 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3972 split_index, 3998 split_index,
3973 split_rec); 3999 split_rec);
@@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4029 mlog_errno(ret); 4055 mlog_errno(ret);
4030 } 4056 }
4031 4057
4032 ocfs2_journal_dirty(handle, eb_bh);
4033
4034out: 4058out:
4035 brelse(last_eb_bh); 4059 brelse(last_eb_bh);
4036 return ret; 4060 return ret;
@@ -4707,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4707 4731
4708 mutex_lock(&data_alloc_inode->i_mutex); 4732 mutex_lock(&data_alloc_inode->i_mutex);
4709 4733
4710 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 4734 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
4711 if (status < 0) { 4735 if (status < 0) {
4712 mlog_errno(status); 4736 mlog_errno(status);
4713 goto out_mutex; 4737 goto out_mutex;
@@ -4729,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4729 4753
4730out_unlock: 4754out_unlock:
4731 brelse(data_alloc_bh); 4755 brelse(data_alloc_bh);
4732 ocfs2_meta_unlock(data_alloc_inode, 1); 4756 ocfs2_inode_unlock(data_alloc_inode, 1);
4733 4757
4734out_mutex: 4758out_mutex:
4735 mutex_unlock(&data_alloc_inode->i_mutex); 4759 mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5053,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5053 5077
5054 mutex_lock(&inode->i_mutex); 5078 mutex_lock(&inode->i_mutex);
5055 5079
5056 ret = ocfs2_meta_lock(inode, &di_bh, 1); 5080 ret = ocfs2_inode_lock(inode, &di_bh, 1);
5057 if (ret) { 5081 if (ret) {
5058 mlog_errno(ret); 5082 mlog_errno(ret);
5059 goto out_mutex; 5083 goto out_mutex;
@@ -5094,7 +5118,7 @@ out_journal:
5094 ocfs2_commit_trans(osb, handle); 5118 ocfs2_commit_trans(osb, handle);
5095 5119
5096out_unlock: 5120out_unlock:
5097 ocfs2_meta_unlock(inode, 1); 5121 ocfs2_inode_unlock(inode, 1);
5098 brelse(di_bh); 5122 brelse(di_bh);
5099out_mutex: 5123out_mutex:
5100 mutex_unlock(&inode->i_mutex); 5124 mutex_unlock(&inode->i_mutex);
@@ -6093,8 +6117,6 @@ start:
6093 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 6117 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6094 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 6118 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6095 6119
6096 BUG_ON(clusters_to_del == 0);
6097
6098 mutex_lock(&tl_inode->i_mutex); 6120 mutex_lock(&tl_inode->i_mutex);
6099 tl_sem = 1; 6121 tl_sem = 1;
6100 /* ocfs2_truncate_log_needs_flush guarantees us at least one 6122 /* ocfs2_truncate_log_needs_flush guarantees us at least one
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c69c1b300155..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
29 30
30#define MLOG_MASK_PREFIX ML_FILE_IO 31#define MLOG_MASK_PREFIX ML_FILE_IO
31#include <cluster/masklog.h> 32#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139{ 140{
140 int err = 0; 141 int err = 0;
141 unsigned int ext_flags; 142 unsigned int ext_flags;
142 u64 p_blkno, past_eof; 143 u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144 u64 p_blkno, count, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 146
145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 147 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
155 goto bail; 157 goto bail;
156 } 158 }
157 159
158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 160 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
159 &ext_flags); 161 &ext_flags);
160 if (err) { 162 if (err) {
161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 163 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
164 goto bail; 166 goto bail;
165 } 167 }
166 168
169 if (max_blocks < count)
170 count = max_blocks;
171
167 /* 172 /*
168 * ocfs2 never allocates in this function - the only time we 173 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno); 184 map_bh(bh_result, inode->i_sb, p_blkno);
180 185
186 bh_result->b_size = count << inode->i_blkbits;
187
181 if (!ocfs2_sparse_alloc(osb)) { 188 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) { 189 if (p_blkno == 0) {
183 err = -EIO; 190 err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh) 217 struct buffer_head *di_bh)
211{ 218{
212 void *kaddr; 219 void *kaddr;
213 unsigned int size; 220 loff_t size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 221 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215 222
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 223 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224 if (size > PAGE_CACHE_SIZE || 231 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) { 232 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb, 233 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u", 234 "Inode %llu has with inline data has bad size: %Lu",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size); 235 (unsigned long long)OCFS2_I(inode)->ip_blkno,
236 (unsigned long long)size);
229 return -EROFS; 237 return -EROFS;
230 } 238 }
231 239
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
275 283
276 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 284 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
277 285
278 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 286 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
279 if (ret != 0) { 287 if (ret != 0) {
280 if (ret == AOP_TRUNCATED_PAGE) 288 if (ret == AOP_TRUNCATED_PAGE)
281 unlock = 0; 289 unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
285 293
286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 294 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
287 ret = AOP_TRUNCATED_PAGE; 295 ret = AOP_TRUNCATED_PAGE;
288 goto out_meta_unlock; 296 goto out_inode_unlock;
289 } 297 }
290 298
291 /* 299 /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
305 goto out_alloc; 313 goto out_alloc;
306 } 314 }
307 315
308 ret = ocfs2_data_lock_with_page(inode, 0, page);
309 if (ret != 0) {
310 if (ret == AOP_TRUNCATED_PAGE)
311 unlock = 0;
312 mlog_errno(ret);
313 goto out_alloc;
314 }
315
316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page); 317 ret = ocfs2_readpage_inline(inode, page);
318 else 318 else
319 ret = block_read_full_page(page, ocfs2_get_block); 319 ret = block_read_full_page(page, ocfs2_get_block);
320 unlock = 0; 320 unlock = 0;
321 321
322 ocfs2_data_unlock(inode, 0);
323out_alloc: 322out_alloc:
324 up_read(&OCFS2_I(inode)->ip_alloc_sem); 323 up_read(&OCFS2_I(inode)->ip_alloc_sem);
325out_meta_unlock: 324out_inode_unlock:
326 ocfs2_meta_unlock(inode, 0); 325 ocfs2_inode_unlock(inode, 0);
327out: 326out:
328 if (unlock) 327 if (unlock)
329 unlock_page(page); 328 unlock_page(page);
@@ -331,6 +330,62 @@ out:
331 return ret; 330 return ret;
332} 331}
333 332
333/*
334 * This is used only for read-ahead. Failures or difficult to handle
335 * situations are safe to ignore.
336 *
337 * Right now, we don't bother with BH_Boundary - in-inode extent lists
338 * are quite large (243 extents on 4k blocks), so most inodes don't
339 * grow out to a tree. If need be, detecting boundary extents could
340 * trivially be added in a future version of ocfs2_get_block().
341 */
342static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
343 struct list_head *pages, unsigned nr_pages)
344{
345 int ret, err = -EIO;
346 struct inode *inode = mapping->host;
347 struct ocfs2_inode_info *oi = OCFS2_I(inode);
348 loff_t start;
349 struct page *last;
350
351 /*
352 * Use the nonblocking flag for the dlm code to avoid page
353 * lock inversion, but don't bother with retrying.
354 */
355 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
356 if (ret)
357 return err;
358
359 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
360 ocfs2_inode_unlock(inode, 0);
361 return err;
362 }
363
364 /*
365 * Don't bother with inline-data. There isn't anything
366 * to read-ahead in that case anyway...
367 */
368 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
369 goto out_unlock;
370
371 /*
372 * Check whether a remote node truncated this file - we just
373 * drop out in that case as it's not worth handling here.
374 */
375 last = list_entry(pages->prev, struct page, lru);
376 start = (loff_t)last->index << PAGE_CACHE_SHIFT;
377 if (start >= i_size_read(inode))
378 goto out_unlock;
379
380 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
381
382out_unlock:
383 up_read(&oi->ip_alloc_sem);
384 ocfs2_inode_unlock(inode, 0);
385
386 return err;
387}
388
334/* Note: Because we don't support holes, our allocation has 389/* Note: Because we don't support holes, our allocation has
335 * already happened (allocation writes zeros to the file data) 390 * already happened (allocation writes zeros to the file data)
336 * so we don't have to worry about ordered writes in 391 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
452 * accessed concurrently from multiple nodes. 507 * accessed concurrently from multiple nodes.
453 */ 508 */
454 if (!INODE_JOURNAL(inode)) { 509 if (!INODE_JOURNAL(inode)) {
455 err = ocfs2_meta_lock(inode, NULL, 0); 510 err = ocfs2_inode_lock(inode, NULL, 0);
456 if (err) { 511 if (err) {
457 if (err != -ENOENT) 512 if (err != -ENOENT)
458 mlog_errno(err); 513 mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
467 522
468 if (!INODE_JOURNAL(inode)) { 523 if (!INODE_JOURNAL(inode)) {
469 up_read(&OCFS2_I(inode)->ip_alloc_sem); 524 up_read(&OCFS2_I(inode)->ip_alloc_sem);
470 ocfs2_meta_unlock(inode, 0); 525 ocfs2_inode_unlock(inode, 0);
471 } 526 }
472 527
473 if (err) { 528 if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0; 694 return 0;
640 695
641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
642 /*
643 * We get PR data locks even for O_DIRECT. This
644 * allows concurrent O_DIRECT I/O but doesn't let
645 * O_DIRECT with extending and buffered zeroing writes
646 * race. If they did race then the buffered zeroing
647 * could be written back after the O_DIRECT I/O. It's
648 * one thing to tell people not to mix buffered and
649 * O_DIRECT writes, but expecting them to understand
650 * that file extension is also an implicit buffered
651 * write is too much. By getting the PR we force
652 * writeback of the buffered zeroing before
653 * proceeding.
654 */
655 ret = ocfs2_data_lock(inode, 0);
656 if (ret < 0) {
657 mlog_errno(ret);
658 goto out;
659 }
660 ocfs2_data_unlock(inode, 0);
661 }
662
663 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 696 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
664 inode->i_sb->s_bdev, iov, offset, 697 inode->i_sb->s_bdev, iov, offset,
665 nr_segs, 698 nr_segs,
666 ocfs2_direct_IO_get_blocks, 699 ocfs2_direct_IO_get_blocks,
667 ocfs2_dio_end_io); 700 ocfs2_dio_end_io);
668out: 701
669 mlog_exit(ret); 702 mlog_exit(ret);
670 return ret; 703 return ret;
671} 704}
@@ -729,6 +762,27 @@ static void ocfs2_clear_page_regions(struct page *page,
729} 762}
730 763
731/* 764/*
765 * Nonsparse file systems fully allocate before we get to the write
766 * code. This prevents ocfs2_write() from tagging the write as an
767 * allocating one, which means ocfs2_map_page_blocks() might try to
768 * read-in the blocks at the tail of our file. Avoid reading them by
769 * testing i_size against each block offset.
770 */
771static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
772 unsigned int block_start)
773{
774 u64 offset = page_offset(page) + block_start;
775
776 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
777 return 1;
778
779 if (i_size_read(inode) > offset)
780 return 1;
781
782 return 0;
783}
784
785/*
732 * Some of this taken from block_prepare_write(). We already have our 786 * Some of this taken from block_prepare_write(). We already have our
733 * mapping by now though, and the entire write will be allocating or 787 * mapping by now though, and the entire write will be allocating or
734 * it won't, so not much need to use BH_New. 788 * it won't, so not much need to use BH_New.
@@ -781,6 +835,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
781 set_buffer_uptodate(bh); 835 set_buffer_uptodate(bh);
782 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 836 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
783 !buffer_new(bh) && 837 !buffer_new(bh) &&
838 ocfs2_should_read_blk(inode, page, block_start) &&
784 (block_start < from || block_end > to)) { 839 (block_start < from || block_end > to)) {
785 ll_rw_block(READ, 1, &bh); 840 ll_rw_block(READ, 1, &bh);
786 *wait_bh++=bh; 841 *wait_bh++=bh;
@@ -1492,7 +1547,7 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1492{ 1547{
1493 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1548 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1494 1549
1495 if (new_size < le16_to_cpu(di->id2.i_data.id_count)) 1550 if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
1496 return 1; 1551 return 1;
1497 return 0; 1552 return 0;
1498} 1553}
@@ -1732,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1732 struct buffer_head *di_bh = NULL; 1787 struct buffer_head *di_bh = NULL;
1733 struct inode *inode = mapping->host; 1788 struct inode *inode = mapping->host;
1734 1789
1735 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1790 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1736 if (ret) { 1791 if (ret) {
1737 mlog_errno(ret); 1792 mlog_errno(ret);
1738 return ret; 1793 return ret;
@@ -1747,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1747 */ 1802 */
1748 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1803 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1749 1804
1750 ret = ocfs2_data_lock(inode, 1);
1751 if (ret) {
1752 mlog_errno(ret);
1753 goto out_fail;
1754 }
1755
1756 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1805 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1757 fsdata, di_bh, NULL); 1806 fsdata, di_bh, NULL);
1758 if (ret) { 1807 if (ret) {
1759 mlog_errno(ret); 1808 mlog_errno(ret);
1760 goto out_fail_data; 1809 goto out_fail;
1761 } 1810 }
1762 1811
1763 brelse(di_bh); 1812 brelse(di_bh);
1764 1813
1765 return 0; 1814 return 0;
1766 1815
1767out_fail_data:
1768 ocfs2_data_unlock(inode, 1);
1769out_fail: 1816out_fail:
1770 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1817 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1771 1818
1772 brelse(di_bh); 1819 brelse(di_bh);
1773 ocfs2_meta_unlock(inode, 1); 1820 ocfs2_inode_unlock(inode, 1);
1774 1821
1775 return ret; 1822 return ret;
1776} 1823}
@@ -1886,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1886 1933
1887 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1934 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1888 1935
1889 ocfs2_data_unlock(inode, 1);
1890 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1936 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1891 ocfs2_meta_unlock(inode, 1); 1937 ocfs2_inode_unlock(inode, 1);
1892 1938
1893 return ret; 1939 return ret;
1894} 1940}
1895 1941
1896const struct address_space_operations ocfs2_aops = { 1942const struct address_space_operations ocfs2_aops = {
1897 .readpage = ocfs2_readpage, 1943 .readpage = ocfs2_readpage,
1944 .readpages = ocfs2_readpages,
1898 .writepage = ocfs2_writepage, 1945 .writepage = ocfs2_writepage,
1899 .write_begin = ocfs2_write_begin, 1946 .write_begin = ocfs2_write_begin,
1900 .write_end = ocfs2_write_end, 1947 .write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
79 * information for this bh as it's not marked locally 79 * information for this bh as it's not marked locally
80 * uptodate. */ 80 * uptodate. */
81 ret = -EIO; 81 ret = -EIO;
82 brelse(bh); 82 put_bh(bh);
83 } 83 }
84 84
85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
256 * for this bh as it's not marked locally 256 * for this bh as it's not marked locally
257 * uptodate. */ 257 * uptodate. */
258 status = -EIO; 258 status = -EIO;
259 brelse(bh); 259 put_bh(bh);
260 bhs[i] = NULL; 260 bhs[i] = NULL;
261 continue; 261 continue;
262 } 262 }
@@ -280,3 +280,64 @@ bail:
280 mlog_exit(status); 280 mlog_exit(status);
281 return status; 281 return status;
282} 282}
283
284/* Check whether the blkno is the super block or one of the backups. */
285static void ocfs2_check_super_or_backup(struct super_block *sb,
286 sector_t blkno)
287{
288 int i;
289 u64 backup_blkno;
290
291 if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
292 return;
293
294 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
295 backup_blkno = ocfs2_backup_super_blkno(sb, i);
296 if (backup_blkno == blkno)
297 return;
298 }
299
300 BUG();
301}
302
303/*
304 * Write super block and backups doesn't need to collaborate with journal,
305 * so we don't need to lock ip_io_mutex and inode doesn't need to be passed
306 * into this function.
307 */
308int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
309 struct buffer_head *bh)
310{
311 int ret = 0;
312
313 mlog_entry_void();
314
315 BUG_ON(buffer_jbd(bh));
316 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
317
318 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
319 ret = -EROFS;
320 goto out;
321 }
322
323 lock_buffer(bh);
324 set_buffer_uptodate(bh);
325
326 /* remove from dirty list before I/O. */
327 clear_buffer_dirty(bh);
328
329 get_bh(bh); /* for end_buffer_write_sync() */
330 bh->b_end_io = end_buffer_write_sync;
331 submit_bh(WRITE, bh);
332
333 wait_on_buffer(bh);
334
335 if (!buffer_uptodate(bh)) {
336 ret = -EIO;
337 put_bh(bh);
338 }
339
340out:
341 mlog_exit(ret);
342 return ret;
343}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
47 int flags, 47 int flags,
48 struct inode *inode); 48 struct inode *inode);
49 49
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh);
50 52
51#define OCFS2_BH_CACHED 1 53#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 54#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9cc7c0418b70..f02ccb34604d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -267,7 +267,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
267 current_page = cs / spp; 267 current_page = cs / spp;
268 page = reg->hr_slot_data[current_page]; 268 page = reg->hr_slot_data[current_page];
269 269
270 vec_len = min(PAGE_CACHE_SIZE, 270 vec_len = min(PAGE_CACHE_SIZE - vec_start,
271 (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); 271 (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
272 272
273 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", 273 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
35#define O2HB_LIVE_THRESHOLD 2 35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold; 37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7 38#define O2HB_DEFAULT_DEAD_THRESHOLD 31
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ 39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2 40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) 41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df945..23c732f27529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
146 .kobj = {.ktype = &mlog_ktype}, 146 .kobj = {.ktype = &mlog_ktype},
147}; 147};
148 148
149int mlog_sys_init(struct kset *o2cb_subsys) 149int mlog_sys_init(struct kset *o2cb_kset)
150{ 150{
151 int i = 0; 151 int i = 0;
152 152
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
157 mlog_attr_ptrs[i] = NULL; 157 mlog_attr_ptrs[i] = NULL;
158 158
159 kobject_set_name(&mlog_kset.kobj, "logmask"); 159 kobject_set_name(&mlog_kset.kobj, "logmask");
160 kobj_set_kset_s(&mlog_kset, *o2cb_subsys); 160 mlog_kset.kobj.kset = o2cb_kset;
161 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
162} 162}
163 163
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index cd046060114e..597e064bb94f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
212#define mlog_errno(st) do { \ 212#define mlog_errno(st) do { \
213 int _st = (st); \ 213 int _st = (st); \
214 if (_st != -ERESTARTSYS && _st != -EINTR && \ 214 if (_st != -ERESTARTSYS && _st != -EINTR && \
215 _st != AOP_TRUNCATED_PAGE) \ 215 _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
216 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ 216 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
217} while (0) 217} while (0)
218 218
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd09..0c095ce7723d 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/kobject.h> 29#include <linux/kobject.h>
30#include <linux/sysfs.h> 30#include <linux/sysfs.h>
31#include <linux/fs.h>
31 32
32#include "ocfs2_nodemanager.h" 33#include "ocfs2_nodemanager.h"
33#include "masklog.h" 34#include "masklog.h"
34#include "sys.h" 35#include "sys.h"
35 36
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
46 37
47static ssize_t o2cb_interface_revision_show(char *buf) 38static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
39 char *buf)
48{ 40{
49 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); 41 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
50} 42}
51 43static struct kobj_attribute attr_version =
52static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); 44 __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
53 45
54static struct attribute *o2cb_attrs[] = { 46static struct attribute *o2cb_attrs[] = {
55 &o2cb_attr_interface_revision.attr, 47 &attr_version.attr,
56 NULL, 48 NULL,
57}; 49};
58 50
59static ssize_t 51static struct attribute_group o2cb_attr_group = {
60o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); 52 .attrs = o2cb_attrs,
61static ssize_t
62o2cb_store(struct kobject * kobj, struct attribute * attr,
63 const char * buffer, size_t count);
64static struct sysfs_ops o2cb_sysfs_ops = {
65 .show = o2cb_show,
66 .store = o2cb_store,
67}; 53};
68 54
69static struct kobj_type o2cb_subsys_type = { 55static struct kset *o2cb_kset;
70 .default_attrs = o2cb_attrs,
71 .sysfs_ops = &o2cb_sysfs_ops,
72};
73
74/* gives us o2cb_subsys */
75static decl_subsys(o2cb, NULL, NULL);
76
77static ssize_t
78o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
79{
80 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
81 struct kset *sbs = to_kset(kobj);
82
83 BUG_ON(sbs != &o2cb_subsys);
84
85 if (o2cb_attr->show)
86 return o2cb_attr->show(buffer);
87 return -EIO;
88}
89
90static ssize_t
91o2cb_store(struct kobject * kobj, struct attribute * attr,
92 const char * buffer, size_t count)
93{
94 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
95 struct kset *sbs = to_kset(kobj);
96
97 BUG_ON(sbs != &o2cb_subsys);
98
99 if (o2cb_attr->store)
100 return o2cb_attr->store(buffer, count);
101 return -EIO;
102}
103 56
104void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
105{ 58{
106 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
107 subsystem_unregister(&o2cb_subsys); 60 kset_unregister(o2cb_kset);
108} 61}
109 62
110int o2cb_sys_init(void) 63int o2cb_sys_init(void)
111{ 64{
112 int ret; 65 int ret;
113 66
114 o2cb_subsys.kobj.ktype = &o2cb_subsys_type; 67 o2cb_kset = kset_create_and_add("o2cb", NULL, NULL);
115 ret = subsystem_register(&o2cb_subsys); 68 if (!o2cb_kset)
69 return -ENOMEM;
70
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
116 if (ret) 72 if (ret)
117 return ret; 73 goto error;
118 74
119 ret = mlog_sys_init(&o2cb_subsys); 75 ret = mlog_sys_init(o2cb_kset);
120 if (ret) 76 if (ret)
121 subsystem_unregister(&o2cb_subsys); 77 goto error;
78 return 0;
79error:
80 kset_unregister(o2cb_kset);
122 return ret; 81 return ret;
123} 82}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 685c18065c82..ee50c9610e7f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -58,6 +58,7 @@
58#include <linux/slab.h> 58#include <linux/slab.h>
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h>
61#include <net/tcp.h> 62#include <net/tcp.h>
62 63
63#include <asm/uaccess.h> 64#include <asm/uaccess.h>
@@ -71,14 +72,6 @@
71 72
72#include "tcp_internal.h" 73#include "tcp_internal.h"
73 74
74/*
75 * The linux network stack isn't sparse endian clean.. It has macros like
76 * ntohs() which perform the endian checks and structs like sockaddr_in
77 * which aren't annotated. So __force is found here to get the build
78 * clean. When they emerge from the dark ages and annotate the code
79 * we can remove these.
80 */
81
82#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
83#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
84 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 NIPQUAD(sc->sc_node->nd_ipv4_address), \
@@ -616,8 +609,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
616 del_timer_sync(&sc->sc_idle_timeout); 609 del_timer_sync(&sc->sc_idle_timeout);
617 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 610 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
618 sc_put(sc); 611 sc_put(sc);
619 sc->sc_sock->ops->shutdown(sc->sc_sock, 612 kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
620 RCV_SHUTDOWN|SEND_SHUTDOWN);
621 } 613 }
622 614
623 /* not fatal so failed connects before the other guy has our 615 /* not fatal so failed connects before the other guy has our
@@ -1500,7 +1492,7 @@ static void o2net_start_connect(struct work_struct *work)
1500 1492
1501 myaddr.sin_family = AF_INET; 1493 myaddr.sin_family = AF_INET;
1502 myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; 1494 myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
1503 myaddr.sin_port = (__force u16)htons(0); /* any port */ 1495 myaddr.sin_port = htons(0); /* any port */
1504 1496
1505 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1497 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1506 sizeof(myaddr)); 1498 sizeof(myaddr));
@@ -1701,11 +1693,11 @@ static int o2net_accept_one(struct socket *sock)
1701 if (ret < 0) 1693 if (ret < 0)
1702 goto out; 1694 goto out;
1703 1695
1704 node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr); 1696 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1705 if (node == NULL) { 1697 if (node == NULL) {
1706 mlog(ML_NOTICE, "attempt to connect from unknown node at " 1698 mlog(ML_NOTICE, "attempt to connect from unknown node at "
1707 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr), 1699 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
1708 ntohs((__force __be16)sin.sin_port)); 1700 ntohs(sin.sin_port));
1709 ret = -EINVAL; 1701 ret = -EINVAL;
1710 goto out; 1702 goto out;
1711 } 1703 }
@@ -1714,7 +1706,7 @@ static int o2net_accept_one(struct socket *sock)
1714 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1706 mlog(ML_NOTICE, "unexpected connect attempted from a lower "
1715 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n", 1707 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
1716 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1708 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1717 ntohs((__force __be16)sin.sin_port), node->nd_num); 1709 ntohs(sin.sin_port), node->nd_num);
1718 ret = -EINVAL; 1710 ret = -EINVAL;
1719 goto out; 1711 goto out;
1720 } 1712 }
@@ -1725,7 +1717,7 @@ static int o2net_accept_one(struct socket *sock)
1725 mlog(ML_CONN, "attempt to connect from node '%s' at " 1717 mlog(ML_CONN, "attempt to connect from node '%s' at "
1726 "%u.%u.%u.%u:%d but it isn't heartbeating\n", 1718 "%u.%u.%u.%u:%d but it isn't heartbeating\n",
1727 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1719 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1728 ntohs((__force __be16)sin.sin_port)); 1720 ntohs(sin.sin_port));
1729 ret = -EINVAL; 1721 ret = -EINVAL;
1730 goto out; 1722 goto out;
1731 } 1723 }
@@ -1742,7 +1734,7 @@ static int o2net_accept_one(struct socket *sock)
1742 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1734 mlog(ML_NOTICE, "attempt to connect from node '%s' at "
1743 "%u.%u.%u.%u:%d but it already has an open connection\n", 1735 "%u.%u.%u.%u:%d but it already has an open connection\n",
1744 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1736 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1745 ntohs((__force __be16)sin.sin_port)); 1737 ntohs(sin.sin_port));
1746 goto out; 1738 goto out;
1747 } 1739 }
1748 1740
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
60/* same as hb delay, we're waiting for another node to recognize our hb */ 60/* same as hb delay, we're waiting for another node to recognize our hb */
61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
62 62
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66 66
67/* TODO: figure this out.... */ 67/* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 10:
42 * - Meta/data locks combined
43 *
44 * New in version 9:
45 * - All votes removed
46 *
41 * New in version 8: 47 * New in version 8:
42 * - Replace delete inode votes with a cluster lock 48 * - Replace delete inode votes with a cluster lock
43 * 49 *
@@ -60,7 +66,7 @@
60 * - full 64 bit i_size in the metadata lock lvbs 66 * - full 64 bit i_size in the metadata lock lvbs
61 * - introduction of "rw" lock and pushing meta/data locking down 67 * - introduction of "rw" lock and pushing meta/data locking down
62 */ 68 */
63#define O2NET_PROTOCOL_VERSION 8ULL 69#define O2NET_PROTOCOL_VERSION 10ULL
64struct o2net_handshake { 70struct o2net_handshake {
65 __be64 protocol_version; 71 __be64 protocol_version;
66 __be64 connector_id; 72 __be64 connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
28 28
29#include "ver.h" 29#include "ver.h"
30 30
31#define CLUSTER_BUILD_VERSION "1.3.3" 31#define CLUSTER_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 3094ddb7a254..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
128/* 128/*
129 * Walk the inode alias list, and find a dentry which has a given 129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it 130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking 131 * is looking for a dentry_lock reference. The downconvert thread is
132 * to unhash aliases, so we allow it to skip any that already have 132 * looking to unhash aliases, so we allow it to skip any that already
133 * that property. 133 * have that property.
134 */ 134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode, 135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno, 136 u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
266 dl->dl_count = 0; 266 dl->dl_count = 0;
267 /* 267 /*
268 * Does this have to happen below, for all attaches, in case 268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes? 269 * the struct inode gets blown away by the downconvert thread?
270 */ 270 */
271 dl->dl_inode = igrab(inode); 271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno; 272 dl->dl_parent_blkno = parent_blkno;
@@ -318,9 +318,9 @@ out_attach:
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl) 319 struct ocfs2_dentry_lock *dl)
320{ 320{
321 iput(dl->dl_inode);
321 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 322 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
322 ocfs2_lock_res_free(&dl->dl_lockres); 323 ocfs2_lock_res_free(&dl->dl_lockres);
323 iput(dl->dl_inode);
324 kfree(dl); 324 kfree(dl);
325} 325}
326 326
@@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
344{ 344{
345 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 345 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
346 346
347 mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED), 347 if (!dl) {
348 "dentry: %.*s\n", dentry->d_name.len, 348 /*
349 dentry->d_name.name); 349 * No dentry lock is ok if we're disconnected or
350 * unhashed.
351 */
352 if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
353 !d_unhashed(dentry)) {
354 unsigned long long ino = 0ULL;
355 if (inode)
356 ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
357 mlog(ML_ERROR, "Dentry is missing cluster lock. "
358 "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
359 ino, dentry->d_flags, dentry->d_name.len,
360 dentry->d_name.name);
361 }
350 362
351 if (!dl)
352 goto out; 363 goto out;
364 }
353 365
354 mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", 366 mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
355 dentry->d_name.len, dentry->d_name.name, 367 dentry->d_name.len, dentry->d_name.name,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 6a2f143e269c..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -208,9 +208,9 @@ out:
208 return NULL; 208 return NULL;
209} 209}
210 210
211struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 211static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
212 struct inode *dir, 212 struct inode *dir,
213 struct ocfs2_dir_entry **res_dir) 213 struct ocfs2_dir_entry **res_dir)
214{ 214{
215 struct super_block *sb; 215 struct super_block *sb;
216 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 216 struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
846 mlog_entry("dirino=%llu\n", 846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno); 847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848 848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 849 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) { 850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime 851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention 852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */ 853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1); 854 ocfs2_inode_unlock(inode, 1);
855 lock_level = 0; 855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0); 856 error = ocfs2_inode_lock(inode, NULL, 0);
857 } 857 }
858 if (error < 0) { 858 if (error < 0) {
859 if (error != -ENOENT) 859 if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL); 866 dirent, filldir, NULL);
867 867
868 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_inode_unlock(inode, lock_level);
869 869
870bail_nolock: 870bail_nolock:
871 mlog_exit(error); 871 mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmfsver.h" 29#include "dlmfsver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 62e4a7daa286..a54d33d95ada 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -908,7 +908,7 @@ lookup:
908 * but they might own this lockres. wait on them. */ 908 * but they might own this lockres. wait on them. */
909 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 909 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
910 if (bit < O2NM_MAX_NODES) { 910 if (bit < O2NM_MAX_NODES) {
911 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" 911 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
912 "recover before lock mastery can begin\n", 912 "recover before lock mastery can begin\n",
913 dlm->name, namelen, (char *)lockid, bit); 913 dlm->name, namelen, (char *)lockid, bit);
914 wait_on_recovery = 1; 914 wait_on_recovery = 1;
@@ -962,7 +962,7 @@ redo_request:
962 spin_lock(&dlm->spinlock); 962 spin_lock(&dlm->spinlock);
963 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 963 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
964 if (bit < O2NM_MAX_NODES) { 964 if (bit < O2NM_MAX_NODES) {
965 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" 965 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
966 "recover before lock mastery can begin\n", 966 "recover before lock mastery can begin\n",
967 dlm->name, namelen, (char *)lockid, bit); 967 dlm->name, namelen, (char *)lockid, bit);
968 wait_on_recovery = 1; 968 wait_on_recovery = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2270 } 2270 }
2271 } 2271 }
2272 2272
2273 /* Clean up join state on node death. */
2274 if (dlm->joining_node == idx) {
2275 mlog(0, "Clearing join state for node %u\n", idx);
2276 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2277 }
2278
2273 /* check to see if the node is already considered dead */ 2279 /* check to see if the node is already considered dead */
2274 if (!test_bit(idx, dlm->live_nodes_map)) { 2280 if (!test_bit(idx, dlm->live_nodes_map)) {
2275 mlog(0, "for domain %s, node %d is already dead. " 2281 mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2288 2294
2289 clear_bit(idx, dlm->live_nodes_map); 2295 clear_bit(idx, dlm->live_nodes_map);
2290 2296
2291 /* Clean up join state on node death. */
2292 if (dlm->joining_node == idx) {
2293 mlog(0, "Clearing join state for node %u\n", idx);
2294 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2295 }
2296
2297 /* make sure local cleanup occurs before the heartbeat events */ 2297 /* make sure local cleanup occurs before the heartbeat events */
2298 if (!test_bit(idx, dlm->recovery_map)) 2298 if (!test_bit(idx, dlm->recovery_map))
2299 dlm_do_local_recovery_cleanup(dlm, idx); 2299 dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2321 if (!dlm_grab(dlm)) 2321 if (!dlm_grab(dlm))
2322 return; 2322 return;
2323 2323
2324 /*
2325 * This will notify any dlm users that a node in our domain
2326 * went away without notifying us first.
2327 */
2328 if (test_bit(idx, dlm->domain_map))
2329 dlm_fire_domain_eviction_callbacks(dlm, idx);
2330
2324 spin_lock(&dlm->spinlock); 2331 spin_lock(&dlm->spinlock);
2325 __dlm_hb_node_down(dlm, idx); 2332 __dlm_hb_node_down(dlm, idx);
2326 spin_unlock(&dlm->spinlock); 2333 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmver.h" 29#include "dlmver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 41c76ff2fcfb..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -670,7 +700,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
670{ 700{
671 mlog_entry_void(); 701 mlog_entry_void();
672 702
673 BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY)); 703 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
674 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 704 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
675 705
676 if (lockres->l_requested > LKM_NLMODE && 706 if (lockres->l_requested > LKM_NLMODE &&
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -980,18 +1032,6 @@ again:
980 goto unlock; 1032 goto unlock;
981 } 1033 }
982 1034
983 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
984 /* lock has not been created yet. */
985 spin_unlock_irqrestore(&lockres->l_lock, flags);
986
987 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
988 if (ret < 0) {
989 mlog_errno(ret);
990 goto out;
991 }
992 goto again;
993 }
994
995 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1035 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
996 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 1036 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
997 /* is the lock is currently blocked on behalf of 1037 /* is the lock is currently blocked on behalf of
@@ -1006,7 +1046,14 @@ again:
1006 mlog(ML_ERROR, "lockres %s has action %u pending\n", 1046 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1007 lockres->l_name, lockres->l_action); 1047 lockres->l_name, lockres->l_action);
1008 1048
1009 lockres->l_action = OCFS2_AST_CONVERT; 1049 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1050 lockres->l_action = OCFS2_AST_ATTACH;
1051 lkm_flags &= ~LKM_CONVERT;
1052 } else {
1053 lockres->l_action = OCFS2_AST_CONVERT;
1054 lkm_flags |= LKM_CONVERT;
1055 }
1056
1010 lockres->l_requested = level; 1057 lockres->l_requested = level;
1011 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1058 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1012 spin_unlock_irqrestore(&lockres->l_lock, flags); 1059 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -1021,7 +1068,7 @@ again:
1021 status = dlmlock(osb->dlm, 1068 status = dlmlock(osb->dlm,
1022 level, 1069 level,
1023 &lockres->l_lksb, 1070 &lockres->l_lksb,
1024 lkm_flags|LKM_CONVERT, 1071 lkm_flags,
1025 lockres->l_name, 1072 lockres->l_name,
1026 OCFS2_LOCK_ID_MAX_LEN - 1, 1073 OCFS2_LOCK_ID_MAX_LEN - 1,
1027 ocfs2_locking_ast, 1074 ocfs2_locking_ast,
@@ -1094,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1094 mlog_entry_void(); 1141 mlog_entry_void();
1095 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1096 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1097 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1098 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1099 mlog_exit_void(); 1146 mlog_exit_void();
1100} 1147}
@@ -1152,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1152 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1153 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1154 */ 1201 */
1155 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1156 if (ret) {
1157 mlog_errno(ret);
1158 goto bail;
1159 }
1160
1161 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1162 if (ret) { 1203 if (ret) {
1163 mlog_errno(ret); 1204 mlog_errno(ret);
1164 goto bail; 1205 goto bail;
@@ -1316,76 +1357,221 @@ out:
1316 mlog_exit_void(); 1357 mlog_exit_void();
1317} 1358}
1318 1359
1319int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1320 int write, 1361 int level)
1321 int arg_flags)
1322{ 1362{
1323 int status = 0, level; 1363 int ret;
1324 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1325 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1326 1367
1327 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1328 1369
1329 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1330 1385
1331 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1332 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1333 write ? "EXMODE" : "PRMODE"); 1388 }
1334 1389
1335 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1336 * rodevices. */ 1391 /*
1337 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1338 if (write) { 1393 * point to restarting the syscall.
1339 status = -EROFS; 1394 */
1340 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
1411 * seperate path to the "low-level" dlm calls. In particular:
1412 *
1413 * - No optimization of lock levels is done - we take at exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
1417 * no-lock at unlock time. This also means flock locks never go on
1418 * the blocking list).
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving applications flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1341 } 1462 }
1342 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1343 } 1470 }
1344 1471
1345 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1346 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1347 1476
1348 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1349 1479
1350 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1351 1490
1352 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1353 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1354 if (status < 0 && status != -EAGAIN) 1493 goto out;
1355 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
1511 * to just bubble sucess back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1356 1515
1357out: 1516out:
1358 mlog_exit(status); 1517
1359 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1360} 1521}
1361 1522
1362/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1363int ocfs2_data_lock_with_page(struct inode *inode,
1364 int write,
1365 struct page *page)
1366{ 1524{
1367 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1368 1531
1369 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1370 if (ret == -EAGAIN) { 1533
1371 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1372 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1373 ocfs2_data_unlock(inode, write); 1536
1374 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1375 } 1559 }
1376 1560
1377 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1378} 1564}
1379 1565
1380static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1381 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1382{ 1568{
1383 int kick = 0; 1569 int kick = 0;
1384 1570
1385 mlog_entry_void(); 1571 mlog_entry_void();
1386 1572
1387 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1388 * the vote thread * pre-emptively when we reach a release 1574 * the downconvert thread * pre-emptively when we reach a release
1389 * condition. */ 1575 * condition. */
1390 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1391 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1403,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1403 } 1589 }
1404 1590
1405 if (kick) 1591 if (kick)
1406 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1407
1408 mlog_exit_void();
1409}
1410
1411void ocfs2_data_unlock(struct inode *inode,
1412 int write)
1413{
1414 int level = write ? LKM_EXMODE : LKM_PRMODE;
1415 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1417
1418 mlog_entry_void();
1419
1420 mlog(0, "inode %llu drop %s DATA lock\n",
1421 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1422 write ? "EXMODE" : "PRMODE");
1423
1424 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1425 !ocfs2_mount_local(osb))
1426 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1427 1593
1428 mlog_exit_void(); 1594 mlog_exit_void();
1429} 1595}
@@ -1447,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1447 1613
1448/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1449 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1450 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1451static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1452{ 1618{
1453 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1454 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1455 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1456 1622
1457 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1501,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1501static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1502{ 1668{
1503 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1504 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1505 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1506 1672
1507 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1609,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1609} 1775}
1610 1776
1611/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1612static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1613 struct buffer_head **bh) 1779 struct buffer_head **bh)
1614{ 1780{
1615 int status = 0; 1781 int status = 0;
1616 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1617 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1618 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1619 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1620 1786
@@ -1726,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1726 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1727 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1728 */ 1894 */
1729int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1730 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1731 int ex, 1897 int ex,
1732 int arg_flags) 1898 int arg_flags)
@@ -1761,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1761 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1762 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1763 1929
1764 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1765 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1766 dlm_flags = 0; 1932 dlm_flags = 0;
1767 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1800,11 +1966,11 @@ local:
1800 } 1966 }
1801 1967
1802 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1803 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1804 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1805 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1806 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1807 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1808 if (status < 0) { 1974 if (status < 0) {
1809 if (status != -ENOENT) 1975 if (status != -ENOENT)
1810 mlog_errno(status); 1976 mlog_errno(status);
@@ -1826,7 +1992,7 @@ bail:
1826 *ret_bh = NULL; 1992 *ret_bh = NULL;
1827 } 1993 }
1828 if (acquired) 1994 if (acquired)
1829 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1830 } 1996 }
1831 1997
1832 if (local_bh) 1998 if (local_bh)
@@ -1837,19 +2003,20 @@ bail:
1837} 2003}
1838 2004
1839/* 2005/*
1840 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1841 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1842 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1843 * 2009 *
1844 * ** These _with_page variantes are only intended to be called from aop 2010 * ** These _with_page variantes are only intended to be called from aop
1845 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1846 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1847 * 2013 *
1848 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1849 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1850 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1851 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1852 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1853 * 2020 *
1854 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1855 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1857,32 +2024,32 @@ bail:
1857 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1858 * the lock inversion simply. 2025 * the lock inversion simply.
1859 */ 2026 */
1860int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1861 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1862 int ex, 2029 int ex,
1863 struct page *page) 2030 struct page *page)
1864{ 2031{
1865 int ret; 2032 int ret;
1866 2033
1867 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1868 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1869 unlock_page(page); 2036 unlock_page(page);
1870 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1871 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1872 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1873 } 2040 }
1874 2041
1875 return ret; 2042 return ret;
1876} 2043}
1877 2044
1878int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1879 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1880 int *level) 2047 int *level)
1881{ 2048{
1882 int ret; 2049 int ret;
1883 2050
1884 mlog_entry_void(); 2051 mlog_entry_void();
1885 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1886 if (ret < 0) { 2053 if (ret < 0) {
1887 mlog_errno(ret); 2054 mlog_errno(ret);
1888 return ret; 2055 return ret;
@@ -1895,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1895 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1896 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1897 2064
1898 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1899 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1900 if (ret < 0) { 2067 if (ret < 0) {
1901 mlog_errno(ret); 2068 mlog_errno(ret);
1902 return ret; 2069 return ret;
@@ -1913,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1913 return ret; 2080 return ret;
1914} 2081}
1915 2082
1916void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1917 int ex) 2084 int ex)
1918{ 2085{
1919 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1920 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1921 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1922 2089
1923 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2325,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2325 goto bail; 2492 goto bail;
2326 } 2493 }
2327 2494
2328 /* launch vote thread */ 2495 /* launch downconvert thread */
2329 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2330 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2331 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2332 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2333 mlog_errno(status); 2500 mlog_errno(status);
2334 goto bail; 2501 goto bail;
2335 } 2502 }
@@ -2358,8 +2525,8 @@ local:
2358bail: 2525bail:
2359 if (status < 0) { 2526 if (status < 0) {
2360 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2361 if (osb->vote_task) 2528 if (osb->dc_task)
2362 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2363 } 2530 }
2364 2531
2365 mlog_exit(status); 2532 mlog_exit(status);
@@ -2374,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2374 2541
2375 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2376 2543
2377 if (osb->vote_task) { 2544 if (osb->dc_task) {
2378 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2379 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2380 } 2547 }
2381 2548
2382 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2532,7 +2699,7 @@ out:
2532 2699
2533/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2534 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2535 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2536 * it safe to drop. 2703 * it safe to drop.
2537 * 2704 *
2538 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2595,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2595 status = err; 2762 status = err;
2596 2763
2597 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2598 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2599 if (err < 0)
2600 mlog_errno(err);
2601 if (err < 0 && !status)
2602 status = err;
2603
2604 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2605 &OCFS2_I(inode)->ip_meta_lockres);
2606 if (err < 0) 2766 if (err < 0)
2607 mlog_errno(err); 2767 mlog_errno(err);
2608 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2855,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2855 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2856 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2857 3017
3018 if (S_ISREG(inode->i_mode))
3019 goto out;
3020
2858 /* 3021 /*
2859 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2860 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2880,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2880 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2881 } 3044 }
2882 3045
3046out:
2883 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2884} 3048}
2885 3049
@@ -2908,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2908 3072
2909/* 3073/*
2910 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2911 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2912 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2913 */ 3077 */
2914static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3047,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3047 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3048 3212
3049 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3050 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3051 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3052 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3053 * performance. */ 3217 * performance. */
@@ -3096,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3096 3260
3097 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3098 3262
3099 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3100 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3101 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3102 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3103 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3104 } 3268 }
3105 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3106 3270
3107 mlog_exit_void(); 3271 mlog_exit_void();
3108} 3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3304
3305 mlog_exit_void();
3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
3358 /* make sure the voting thread gets a swipe at whatever changes
3359 * the caller may have made to the voting state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 53/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 54#define OCFS2_META_LOCK_RECOVERY (0x01)
55/* Instruct the dlm not to queue ourselves on the other node. */ 55/* Instruct the dlm not to queue ourselves on the other node. */
56#define OCFS2_META_LOCK_NOQUEUE (0x02) 56#define OCFS2_META_LOCK_NOQUEUE (0x02)
57/* don't block waiting for the vote thread, instead return -EAGAIN */ 57/* don't block waiting for the downconvert thread, instead return -EAGAIN */
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
66 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, 67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode); 68 u64 parent, struct inode *inode);
69struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp);
69void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 72void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
70int ocfs2_create_new_inode_locks(struct inode *inode); 73int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_drop_inode_locks(struct inode *inode); 74int ocfs2_drop_inode_locks(struct inode *inode);
72int ocfs2_data_lock_full(struct inode *inode,
73 int write,
74 int arg_flags);
75#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
76int ocfs2_data_lock_with_page(struct inode *inode,
77 int write,
78 struct page *page);
79void ocfs2_data_unlock(struct inode *inode,
80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 75int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 76void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode); 77int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write); 78int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode); 79void ocfs2_open_unlock(struct inode *inode);
86int ocfs2_meta_lock_atime(struct inode *inode, 80int ocfs2_inode_lock_atime(struct inode *inode,
87 struct vfsmount *vfsmnt, 81 struct vfsmount *vfsmnt,
88 int *level); 82 int *level);
89int ocfs2_meta_lock_full(struct inode *inode, 83int ocfs2_inode_lock_full(struct inode *inode,
90 struct buffer_head **ret_bh, 84 struct buffer_head **ret_bh,
91 int ex, 85 int ex,
92 int arg_flags); 86 int arg_flags);
93int ocfs2_meta_lock_with_page(struct inode *inode, 87int ocfs2_inode_lock_with_page(struct inode *inode,
94 struct buffer_head **ret_bh, 88 struct buffer_head **ret_bh,
95 int ex, 89 int ex,
96 struct page *page); 90 struct page *page);
97/* 99% of the time we don't want to supply any additional flags -- 91/* 99% of the time we don't want to supply any additional flags --
98 * those are for very specific cases only. */ 92 * those are for very specific cases only. */
99#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) 93#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
100void ocfs2_meta_unlock(struct inode *inode, 94void ocfs2_inode_unlock(struct inode *inode,
101 int ex); 95 int ex);
102int ocfs2_super_lock(struct ocfs2_super *osb, 96int ocfs2_super_lock(struct ocfs2_super *osb,
103 int ex); 97 int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
107void ocfs2_rename_unlock(struct ocfs2_super *osb); 101void ocfs2_rename_unlock(struct ocfs2_super *osb);
108int ocfs2_dentry_lock(struct dentry *dentry, int ex); 102int ocfs2_dentry_lock(struct dentry *dentry, int ex);
109void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 103void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file);
110 106
111void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
112void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
113 struct ocfs2_lock_res *lockres); 109 struct ocfs2_lock_res *lockres);
114 110
115/* for the vote thread */ 111/* for the downconvert thread */
116void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 112void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
117 struct ocfs2_lock_res *lockres); 113 struct ocfs2_lock_res *lockres);
114void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
118 115
119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
37 *var = cpu_to_le64(le64_to_cpu(*var) + val); 37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38} 38}
39 39
40static inline void le32_and_cpu(__le32 *var, u32 val)
41{
42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
43}
44
45static inline void be32_add_cpu(__be32 *var, u32 val) 40static inline void be32_add_cpu(__be32 *var, u32 val)
46{ 41{
47 *var = cpu_to_be32(be32_to_cpu(*var) + val); 42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
58 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
59 } 59 }
60 60
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
62 62
63 if (IS_ERR(inode)) 63 if (IS_ERR(inode))
64 return (void *)inode; 64 return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
95 mlog(0, "find parent of directory %llu\n", 95 mlog(0, "find parent of directory %llu\n",
96 (unsigned long long)OCFS2_I(dir)->ip_blkno); 96 (unsigned long long)OCFS2_I(dir)->ip_blkno);
97 97
98 status = ocfs2_meta_lock(dir, NULL, 0); 98 status = ocfs2_inode_lock(dir, NULL, 0);
99 if (status < 0) { 99 if (status < 0) {
100 if (status != -ENOENT) 100 if (status != -ENOENT)
101 mlog_errno(status); 101 mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
109 goto bail_unlock; 109 goto bail_unlock;
110 } 110 }
111 111
112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
113 if (IS_ERR(inode)) { 113 if (IS_ERR(inode)) {
114 mlog(ML_ERROR, "Unable to create inode %llu\n", 114 mlog(ML_ERROR, "Unable to create inode %llu\n",
115 (unsigned long long)blkno); 115 (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
126 parent->d_op = &ocfs2_dentry_ops; 126 parent->d_op = &ocfs2_dentry_ops;
127 127
128bail_unlock: 128bail_unlock:
129 ocfs2_meta_unlock(dir, 0); 129 ocfs2_inode_unlock(dir, 0);
130 130
131bail: 131bail:
132 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f92fe91ff260..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
51#include "inode.h" 51#include "inode.h"
52#include "ioctl.h" 52#include "ioctl.h"
53#include "journal.h" 53#include "journal.h"
54#include "locks.h"
54#include "mmap.h" 55#include "mmap.h"
55#include "suballoc.h" 56#include "suballoc.h"
56#include "super.h" 57#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
63 return sync_mapping_buffers(inode->i_mapping); 64 return sync_mapping_buffers(inode->i_mapping);
64} 65}
65 66
67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68{
69 struct ocfs2_file_private *fp;
70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp)
73 return -ENOMEM;
74
75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp;
79
80 return 0;
81}
82
83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84{
85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87
88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp);
92 file->private_data = NULL;
93 }
94}
95
66static int ocfs2_file_open(struct inode *inode, struct file *file) 96static int ocfs2_file_open(struct inode *inode, struct file *file)
67{ 97{
68 int status; 98 int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
89 119
90 oi->ip_open_count++; 120 oi->ip_open_count++;
91 spin_unlock(&oi->ip_lock); 121 spin_unlock(&oi->ip_lock);
92 status = 0; 122
123 status = ocfs2_init_file_private(inode, file);
124 if (status) {
125 /*
126 * We want to set open count back if we're failing the
127 * open.
128 */
129 spin_lock(&oi->ip_lock);
130 oi->ip_open_count--;
131 spin_unlock(&oi->ip_lock);
132 }
133
93leave: 134leave:
94 mlog_exit(status); 135 mlog_exit(status);
95 return status; 136 return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
109 spin_unlock(&oi->ip_lock); 150 spin_unlock(&oi->ip_lock);
110 151
152 ocfs2_free_file_private(inode, file);
153
111 mlog_exit(0); 154 mlog_exit(0);
112 155
113 return 0; 156 return 0;
114} 157}
115 158
159static int ocfs2_dir_open(struct inode *inode, struct file *file)
160{
161 return ocfs2_init_file_private(inode, file);
162}
163
164static int ocfs2_dir_release(struct inode *inode, struct file *file)
165{
166 ocfs2_free_file_private(inode, file);
167 return 0;
168}
169
116static int ocfs2_sync_file(struct file *file, 170static int ocfs2_sync_file(struct file *file,
117 struct dentry *dentry, 171 struct dentry *dentry,
118 int datasync) 172 int datasync)
@@ -382,28 +436,23 @@ static int ocfs2_truncate_file(struct inode *inode,
382 436
383 down_write(&OCFS2_I(inode)->ip_alloc_sem); 437 down_write(&OCFS2_I(inode)->ip_alloc_sem);
384 438
385 /* This forces other nodes to sync and drop their pages. Do 439 /*
386 * this even if we have a truncate without allocation change - 440 * The inode lock forced other nodes to sync and drop their
387 * ocfs2 cluster sizes can be much greater than page size, so 441 * pages, which (correctly) happens even if we have a truncate
388 * we have to truncate them anyway. */ 442 * without allocation change - ocfs2 cluster sizes can be much
389 status = ocfs2_data_lock(inode, 1); 443 * greater than page size, so we have to truncate them
390 if (status < 0) { 444 * anyway.
391 up_write(&OCFS2_I(inode)->ip_alloc_sem); 445 */
392
393 mlog_errno(status);
394 goto bail;
395 }
396
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 446 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 447 truncate_inode_pages(inode->i_mapping, new_i_size);
399 448
400 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 449 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
401 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 450 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
402 i_size_read(inode), 0); 451 i_size_read(inode), 1);
403 if (status) 452 if (status)
404 mlog_errno(status); 453 mlog_errno(status);
405 454
406 goto bail_unlock_data; 455 goto bail_unlock_sem;
407 } 456 }
408 457
409 /* alright, we're going to need to do a full blown alloc size 458 /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
413 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 462 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
414 if (status < 0) { 463 if (status < 0) {
415 mlog_errno(status); 464 mlog_errno(status);
416 goto bail_unlock_data; 465 goto bail_unlock_sem;
417 } 466 }
418 467
419 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 468 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
420 if (status < 0) { 469 if (status < 0) {
421 mlog_errno(status); 470 mlog_errno(status);
422 goto bail_unlock_data; 471 goto bail_unlock_sem;
423 } 472 }
424 473
425 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 474 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
426 if (status < 0) { 475 if (status < 0) {
427 mlog_errno(status); 476 mlog_errno(status);
428 goto bail_unlock_data; 477 goto bail_unlock_sem;
429 } 478 }
430 479
431 /* TODO: orphan dir cleanup here. */ 480 /* TODO: orphan dir cleanup here. */
432bail_unlock_data: 481bail_unlock_sem:
433 ocfs2_data_unlock(inode, 1);
434
435 up_write(&OCFS2_I(inode)->ip_alloc_sem); 482 up_write(&OCFS2_I(inode)->ip_alloc_sem);
436 483
437bail: 484bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
579 626
580 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
581 "clusters_to_add = %u, extents_to_split = %u\n", 628 "clusters_to_add = %u, extents_to_split = %u\n",
582 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
583 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
584 631
585 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
760 le32_to_cpu(fe->i_clusters), 807 le32_to_cpu(fe->i_clusters),
761 (unsigned long long)le64_to_cpu(fe->i_size)); 808 (unsigned long long)le64_to_cpu(fe->i_size));
762 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 809 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
763 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 810 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
764 811
765leave: 812leave:
766 if (handle) { 813 if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
917 struct buffer_head *di_bh, 964 struct buffer_head *di_bh,
918 u64 new_i_size) 965 u64 new_i_size)
919{ 966{
920 int ret = 0, data_locked = 0; 967 int ret = 0;
921 struct ocfs2_inode_info *oi = OCFS2_I(inode); 968 struct ocfs2_inode_info *oi = OCFS2_I(inode);
922 969
923 BUG_ON(!di_bh); 970 BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 990 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
944 goto out_update_size; 991 goto out_update_size;
945 992
946 /*
947 * protect the pages that ocfs2_zero_extend is going to be
948 * pulling into the page cache.. we do this before the
949 * metadata extend so that we don't get into the situation
950 * where we've extended the metadata but can't get the data
951 * lock to zero.
952 */
953 ret = ocfs2_data_lock(inode, 1);
954 if (ret < 0) {
955 mlog_errno(ret);
956 goto out;
957 }
958 data_locked = 1;
959
960 /* 993 /*
961 * The alloc sem blocks people in read/write from reading our 994 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on 995 * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
980 up_write(&oi->ip_alloc_sem); 1013 up_write(&oi->ip_alloc_sem);
981 1014
982 mlog_errno(ret); 1015 mlog_errno(ret);
983 goto out_unlock; 1016 goto out;
984 } 1017 }
985 } 1018 }
986 1019
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
991 1024
992 if (ret < 0) { 1025 if (ret < 0) {
993 mlog_errno(ret); 1026 mlog_errno(ret);
994 goto out_unlock; 1027 goto out;
995 } 1028 }
996 1029
997out_update_size: 1030out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
999 if (ret < 0) 1032 if (ret < 0)
1000 mlog_errno(ret); 1033 mlog_errno(ret);
1001 1034
1002out_unlock:
1003 if (data_locked)
1004 ocfs2_data_unlock(inode, 1);
1005
1006out: 1035out:
1007 return ret; 1036 return ret;
1008} 1037}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1050 } 1079 }
1051 } 1080 }
1052 1081
1053 status = ocfs2_meta_lock(inode, &bh, 1); 1082 status = ocfs2_inode_lock(inode, &bh, 1);
1054 if (status < 0) { 1083 if (status < 0) {
1055 if (status != -ENOENT) 1084 if (status != -ENOENT)
1056 mlog_errno(status); 1085 mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1102bail_commit: 1131bail_commit:
1103 ocfs2_commit_trans(osb, handle); 1132 ocfs2_commit_trans(osb, handle);
1104bail_unlock: 1133bail_unlock:
1105 ocfs2_meta_unlock(inode, 1); 1134 ocfs2_inode_unlock(inode, 1);
1106bail_unlock_rw: 1135bail_unlock_rw:
1107 if (size_change) 1136 if (size_change)
1108 ocfs2_rw_unlock(inode, 1); 1137 ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1149 1178
1150 mlog_entry_void(); 1179 mlog_entry_void();
1151 1180
1152 ret = ocfs2_meta_lock(inode, NULL, 0); 1181 ret = ocfs2_inode_lock(inode, NULL, 0);
1153 if (ret) { 1182 if (ret) {
1154 if (ret != -ENOENT) 1183 if (ret != -ENOENT)
1155 mlog_errno(ret); 1184 mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1158 1187
1159 ret = generic_permission(inode, mask, NULL); 1188 ret = generic_permission(inode, mask, NULL);
1160 1189
1161 ocfs2_meta_unlock(inode, 0); 1190 ocfs2_inode_unlock(inode, 0);
1162out: 1191out:
1163 mlog_exit(ret); 1192 mlog_exit(ret);
1164 return ret; 1193 return ret;
@@ -1521,6 +1550,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1521 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1550 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1522 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1551 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1523 struct ocfs2_cached_dealloc_ctxt dealloc; 1552 struct ocfs2_cached_dealloc_ctxt dealloc;
1553 struct address_space *mapping = inode->i_mapping;
1524 1554
1525 ocfs2_init_dealloc_ctxt(&dealloc); 1555 ocfs2_init_dealloc_ctxt(&dealloc);
1526 1556
@@ -1529,10 +1559,20 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1529 1559
1530 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1560 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1531 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1561 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1532 byte_start + byte_len, 1); 1562 byte_start + byte_len, 0);
1533 if (ret) 1563 if (ret) {
1534 mlog_errno(ret); 1564 mlog_errno(ret);
1535 return ret; 1565 goto out;
1566 }
1567 /*
1568 * There's no need to get fancy with the page cache
1569 * truncate of an inline-data inode. We're talking
1570 * about less than a page here, which will be cached
1571 * in the dinode buffer anyway.
1572 */
1573 unmap_mapping_range(mapping, 0, 0, 0);
1574 truncate_inode_pages(mapping, 0);
1575 goto out;
1536 } 1576 }
1537 1577
1538 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1578 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
@@ -1619,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1619 goto out; 1659 goto out;
1620 } 1660 }
1621 1661
1622 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1662 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1623 if (ret) { 1663 if (ret) {
1624 mlog_errno(ret); 1664 mlog_errno(ret);
1625 goto out_rw_unlock; 1665 goto out_rw_unlock;
@@ -1627,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1627 1667
1628 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1668 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1629 ret = -EPERM; 1669 ret = -EPERM;
1630 goto out_meta_unlock; 1670 goto out_inode_unlock;
1631 } 1671 }
1632 1672
1633 switch (sr->l_whence) { 1673 switch (sr->l_whence) {
@@ -1641,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1641 break; 1681 break;
1642 default: 1682 default:
1643 ret = -EINVAL; 1683 ret = -EINVAL;
1644 goto out_meta_unlock; 1684 goto out_inode_unlock;
1645 } 1685 }
1646 sr->l_whence = 0; 1686 sr->l_whence = 0;
1647 1687
@@ -1652,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1652 || (sr->l_start + llen) < 0 1692 || (sr->l_start + llen) < 0
1653 || (sr->l_start + llen) > max_off) { 1693 || (sr->l_start + llen) > max_off) {
1654 ret = -EINVAL; 1694 ret = -EINVAL;
1655 goto out_meta_unlock; 1695 goto out_inode_unlock;
1656 } 1696 }
1657 size = sr->l_start + sr->l_len; 1697 size = sr->l_start + sr->l_len;
1658 1698
1659 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1699 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1660 if (sr->l_len <= 0) { 1700 if (sr->l_len <= 0) {
1661 ret = -EINVAL; 1701 ret = -EINVAL;
1662 goto out_meta_unlock; 1702 goto out_inode_unlock;
1663 } 1703 }
1664 } 1704 }
1665 1705
@@ -1667,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1667 ret = __ocfs2_write_remove_suid(inode, di_bh); 1707 ret = __ocfs2_write_remove_suid(inode, di_bh);
1668 if (ret) { 1708 if (ret) {
1669 mlog_errno(ret); 1709 mlog_errno(ret);
1670 goto out_meta_unlock; 1710 goto out_inode_unlock;
1671 } 1711 }
1672 } 1712 }
1673 1713
@@ -1693,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1693 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1733 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1694 if (ret) { 1734 if (ret) {
1695 mlog_errno(ret); 1735 mlog_errno(ret);
1696 goto out_meta_unlock; 1736 goto out_inode_unlock;
1697 } 1737 }
1698 1738
1699 /* 1739 /*
@@ -1703,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1703 if (IS_ERR(handle)) { 1743 if (IS_ERR(handle)) {
1704 ret = PTR_ERR(handle); 1744 ret = PTR_ERR(handle);
1705 mlog_errno(ret); 1745 mlog_errno(ret);
1706 goto out_meta_unlock; 1746 goto out_inode_unlock;
1707 } 1747 }
1708 1748
1709 if (change_size && i_size_read(inode) < size) 1749 if (change_size && i_size_read(inode) < size)
@@ -1716,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1716 1756
1717 ocfs2_commit_trans(osb, handle); 1757 ocfs2_commit_trans(osb, handle);
1718 1758
1719out_meta_unlock: 1759out_inode_unlock:
1720 brelse(di_bh); 1760 brelse(di_bh);
1721 ocfs2_meta_unlock(inode, 1); 1761 ocfs2_inode_unlock(inode, 1);
1722out_rw_unlock: 1762out_rw_unlock:
1723 ocfs2_rw_unlock(inode, 1); 1763 ocfs2_rw_unlock(inode, 1);
1724 1764
@@ -1788,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1788 * if we need to make modifications here. 1828 * if we need to make modifications here.
1789 */ 1829 */
1790 for(;;) { 1830 for(;;) {
1791 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1831 ret = ocfs2_inode_lock(inode, NULL, meta_level);
1792 if (ret < 0) { 1832 if (ret < 0) {
1793 meta_level = -1; 1833 meta_level = -1;
1794 mlog_errno(ret); 1834 mlog_errno(ret);
@@ -1806,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1806 * set inode->i_size at the end of a write. */ 1846 * set inode->i_size at the end of a write. */
1807 if (should_remove_suid(dentry)) { 1847 if (should_remove_suid(dentry)) {
1808 if (meta_level == 0) { 1848 if (meta_level == 0) {
1809 ocfs2_meta_unlock(inode, meta_level); 1849 ocfs2_inode_unlock(inode, meta_level);
1810 meta_level = 1; 1850 meta_level = 1;
1811 continue; 1851 continue;
1812 } 1852 }
@@ -1875,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1875 *ppos = saved_pos; 1915 *ppos = saved_pos;
1876 1916
1877out_unlock: 1917out_unlock:
1878 ocfs2_meta_unlock(inode, meta_level); 1918 ocfs2_inode_unlock(inode, meta_level);
1879 1919
1880out: 1920out:
1881 return ret; 1921 return ret;
@@ -1891,9 +1931,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1891 ssize_t written = 0; 1931 ssize_t written = 0;
1892 size_t ocount; /* original count */ 1932 size_t ocount; /* original count */
1893 size_t count; /* after file limit checks */ 1933 size_t count; /* after file limit checks */
1894 loff_t *ppos = &iocb->ki_pos; 1934 loff_t old_size, *ppos = &iocb->ki_pos;
1935 u32 old_clusters;
1895 struct file *file = iocb->ki_filp; 1936 struct file *file = iocb->ki_filp;
1896 struct inode *inode = file->f_path.dentry->d_inode; 1937 struct inode *inode = file->f_path.dentry->d_inode;
1938 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1897 1939
1898 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1940 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1899 (unsigned int)nr_segs, 1941 (unsigned int)nr_segs,
@@ -1949,6 +1991,13 @@ relock:
1949 goto relock; 1991 goto relock;
1950 } 1992 }
1951 1993
1994 /*
1995 * To later detect whether a journal commit for sync writes is
1996 * necessary, we sample i_size, and cluster count here.
1997 */
1998 old_size = i_size_read(inode);
1999 old_clusters = OCFS2_I(inode)->ip_clusters;
2000
1952 /* communicate with ocfs2_dio_end_io */ 2001 /* communicate with ocfs2_dio_end_io */
1953 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2002 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1954 2003
@@ -1978,6 +2027,21 @@ out_dio:
1978 /* buffered aio wouldn't have proper lock coverage today */ 2027 /* buffered aio wouldn't have proper lock coverage today */
1979 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2028 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1980 2029
2030 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
2031 /*
2032 * The generic write paths have handled getting data
2033 * to disk, but since we don't make use of the dirty
2034 * inode list, a manual journal commit is necessary
2035 * here.
2036 */
2037 if (old_size != i_size_read(inode) ||
2038 old_clusters != OCFS2_I(inode)->ip_clusters) {
2039 ret = journal_force_commit(osb->journal->j_journal);
2040 if (ret < 0)
2041 written = ret;
2042 }
2043 }
2044
1981 /* 2045 /*
1982 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2046 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1983 * function pointer which is called when o_direct io completes so that 2047 * function pointer which is called when o_direct io completes so that
@@ -2064,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2064 /* 2128 /*
2065 * See the comment in ocfs2_file_aio_read() 2129 * See the comment in ocfs2_file_aio_read()
2066 */ 2130 */
2067 ret = ocfs2_meta_lock(inode, NULL, 0); 2131 ret = ocfs2_inode_lock(inode, NULL, 0);
2068 if (ret < 0) { 2132 if (ret < 0) {
2069 mlog_errno(ret); 2133 mlog_errno(ret);
2070 goto bail; 2134 goto bail;
2071 } 2135 }
2072 ocfs2_meta_unlock(inode, 0); 2136 ocfs2_inode_unlock(inode, 0);
2073 2137
2074 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2138 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2075 2139
@@ -2125,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2125 * like i_size. This allows the checks down below 2189 * like i_size. This allows the checks down below
2126 * generic_file_aio_read() a chance of actually working. 2190 * generic_file_aio_read() a chance of actually working.
2127 */ 2191 */
2128 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2192 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2129 if (ret < 0) { 2193 if (ret < 0) {
2130 mlog_errno(ret); 2194 mlog_errno(ret);
2131 goto bail; 2195 goto bail;
2132 } 2196 }
2133 ocfs2_meta_unlock(inode, lock_level); 2197 ocfs2_inode_unlock(inode, lock_level);
2134 2198
2135 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2199 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2136 if (ret == -EINVAL) 2200 if (ret == -EINVAL)
@@ -2169,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2169}; 2233};
2170 2234
2171const struct file_operations ocfs2_fops = { 2235const struct file_operations ocfs2_fops = {
2236 .llseek = generic_file_llseek,
2172 .read = do_sync_read, 2237 .read = do_sync_read,
2173 .write = do_sync_write, 2238 .write = do_sync_write,
2174 .mmap = ocfs2_mmap, 2239 .mmap = ocfs2_mmap,
@@ -2181,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
2181#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2182 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2183#endif 2248#endif
2249 .flock = ocfs2_flock,
2184 .splice_read = ocfs2_file_splice_read, 2250 .splice_read = ocfs2_file_splice_read,
2185 .splice_write = ocfs2_file_splice_write, 2251 .splice_write = ocfs2_file_splice_write,
2186}; 2252};
2187 2253
2188const struct file_operations ocfs2_dops = { 2254const struct file_operations ocfs2_dops = {
2255 .llseek = generic_file_llseek,
2189 .read = generic_read_dir, 2256 .read = generic_read_dir,
2190 .readdir = ocfs2_readdir, 2257 .readdir = ocfs2_readdir,
2191 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open,
2192 .ioctl = ocfs2_ioctl, 2261 .ioctl = ocfs2_ioctl,
2193#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2194 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2195#endif 2264#endif
2265 .flock = ocfs2_flock,
2196}; 2266};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 32extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 33struct ocfs2_alloc_context;
34 34
35struct ocfs2_file_private {
36 struct file *fp_file;
37 struct mutex fp_mutex;
38 struct ocfs2_lock_res fp_flock;
39};
40
35enum ocfs2_alloc_restarted { 41enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0, 42 RESTART_NONE = 0,
37 RESTART_TRANS, 43 RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h> 31#include <linux/kmod.h>
32 32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h> 33#include <dlm/dlmapi.h>
37 34
38#define MLOG_MASK_PREFIX ML_SUPER 35#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
44#include "heartbeat.h" 41#include "heartbeat.h"
45#include "inode.h" 42#include "inode.h"
46#include "journal.h" 43#include "journal.h"
47#include "vote.h"
48 44
49#include "buffer_head_io.h" 45#include "buffer_head_io.h"
50 46
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 47static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit); 48 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
64void ocfs2_init_node_maps(struct ocfs2_super *osb) 57void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{ 58{
66 spin_lock_init(&osb->node_map_lock); 59 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 60 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
71} 62}
72 63
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
87 return; 78 return;
88 } 79 }
89 80
90 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
91 /* If a node is in the umount map, then we've been
92 * expecting him to go down and we know ahead of time
93 * that recovery is not necessary. */
94 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
95 return;
96 }
97
98 ocfs2_recovery_thread(osb, node_num); 81 ocfs2_recovery_thread(osb, node_num);
99
100 ocfs2_remove_node_from_vote_queues(osb, node_num);
101}
102
103static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
104 int node_num,
105 void *data)
106{
107 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
108} 82}
109 83
110/* Called from the dlm when it's about to evict a node. We may also 84/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
121 ocfs2_do_node_down(node_num, osb); 95 ocfs2_do_node_down(node_num, osb);
122} 96}
123 97
124static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
125 int node_num,
126 void *data)
127{
128 struct ocfs2_super *osb = data;
129
130 BUG_ON(osb->node_num == node_num);
131
132 mlog(0, "node up event for %d\n", node_num);
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) 98void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
137{ 99{
138 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
139 ocfs2_hb_node_down_cb, osb,
140 OCFS2_HB_NODE_DOWN_PRI);
141
142 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
143 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
144
145 /* Not exactly a heartbeat callback, but leads to essentially 100 /* Not exactly a heartbeat callback, but leads to essentially
146 * the same path so we set it up here. */ 101 * the same path so we set it up here. */
147 dlm_setup_eviction_cb(&osb->osb_eviction_cb, 102 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
149 osb); 104 osb);
150} 105}
151 106
152/* Most functions here are just stubs for now... */
153int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
154{
155 int status;
156
157 if (ocfs2_mount_local(osb))
158 return 0;
159
160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) {
162 mlog_errno(status);
163 goto bail;
164 }
165
166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) {
168 mlog_errno(status);
169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 }
171
172bail:
173 return status;
174}
175
176void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
177{
178 if (ocfs2_mount_local(osb))
179 return;
180
181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183}
184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 107void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
186{ 108{
187 int ret; 109 int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
341 263
342 spin_lock(&osb->node_map_lock); 264 spin_lock(&osb->node_map_lock);
343 265
344 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
345
346 if (!test_bit(num, osb->recovery_map.map)) { 266 if (!test_bit(num, osb->recovery_map.map)) {
347 __ocfs2_node_map_set_bit(&osb->recovery_map, num); 267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
348 set = 1; 268 set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb); 32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35 33
36/* node map functions - used to keep track of mounted and in-recovery 34/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 1d5e0cb0fda1..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "vote.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
55 54
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
58 u64 fi_blkno; 57 u64 fi_blkno;
59 unsigned long fi_ino; 58 unsigned long fi_ino;
60 unsigned int fi_flags; 59 unsigned int fi_flags;
60 unsigned int fi_sysfile_type;
61}; 61};
62 62
63static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
64
63static int ocfs2_read_locked_inode(struct inode *inode, 65static int ocfs2_read_locked_inode(struct inode *inode,
64 struct ocfs2_find_inode_args *args); 66 struct ocfs2_find_inode_args *args);
65static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
107 oi->ip_attr |= OCFS2_DIRSYNC_FL; 109 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 110}
109 111
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
113 int sysfile_type)
111{ 114{
112 struct inode *inode = NULL; 115 struct inode *inode = NULL;
113 struct super_block *sb = osb->sb; 116 struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
127 args.fi_blkno = blkno; 130 args.fi_blkno = blkno;
128 args.fi_flags = flags; 131 args.fi_flags = flags;
129 args.fi_ino = ino_from_blkno(sb, blkno); 132 args.fi_ino = ino_from_blkno(sb, blkno);
133 args.fi_sysfile_type = sysfile_type;
130 134
131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 135 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
132 ocfs2_init_locked_inode, &args); 136 ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
201 205
202 inode->i_ino = args->fi_ino; 206 inode->i_ino = args->fi_ino;
203 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 207 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
208 if (args->fi_sysfile_type != 0)
209 lockdep_set_class(&inode->i_mutex,
210 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
204 211
205 mlog_exit(0); 212 mlog_exit(0);
206 return 0; 213 return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
322 */ 329 */
323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 330 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
324 331
325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
326 OCFS2_LOCK_TYPE_META, 0, inode); 333 OCFS2_LOCK_TYPE_META, 0, inode);
327 334
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 335 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
333 OCFS2_LOCK_TYPE_RW, inode->i_generation, 340 OCFS2_LOCK_TYPE_RW, inode->i_generation,
334 inode); 341 inode);
335 342
336 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
337 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
338 inode);
339
340 ocfs2_set_inode_flags(inode); 343 ocfs2_set_inode_flags(inode);
341 344
342 status = 0; 345 status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
414 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 417 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
415 generation = osb->fs_generation; 418 generation = osb->fs_generation;
416 419
417 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 420 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
418 OCFS2_LOCK_TYPE_META, 421 OCFS2_LOCK_TYPE_META,
419 generation, inode); 422 generation, inode);
420 423
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
429 mlog_errno(status); 432 mlog_errno(status);
430 return status; 433 return status;
431 } 434 }
432 status = ocfs2_meta_lock(inode, NULL, 0); 435 status = ocfs2_inode_lock(inode, NULL, 0);
433 if (status) { 436 if (status) {
434 make_bad_inode(inode); 437 make_bad_inode(inode);
435 mlog_errno(status); 438 mlog_errno(status);
@@ -455,8 +458,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
455 status = -EINVAL; 458 status = -EINVAL;
456 fe = (struct ocfs2_dinode *) bh->b_data; 459 fe = (struct ocfs2_dinode *) bh->b_data;
457 if (!OCFS2_IS_VALID_DINODE(fe)) { 460 if (!OCFS2_IS_VALID_DINODE(fe)) {
458 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 461 mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
459 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 462 (unsigned long long)args->fi_blkno, 7,
460 fe->i_signature); 463 fe->i_signature);
461 goto bail; 464 goto bail;
462 } 465 }
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 487
485bail: 488bail:
486 if (can_lock) 489 if (can_lock)
487 ocfs2_meta_unlock(inode, 0); 490 ocfs2_inode_unlock(inode, 0);
488 491
489 if (status < 0) 492 if (status < 0)
490 make_bad_inode(inode); 493 make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
586 } 589 }
587 590
588 mutex_lock(&inode_alloc_inode->i_mutex); 591 mutex_lock(&inode_alloc_inode->i_mutex);
589 status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); 592 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
590 if (status < 0) { 593 if (status < 0) {
591 mutex_unlock(&inode_alloc_inode->i_mutex); 594 mutex_unlock(&inode_alloc_inode->i_mutex);
592 595
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
617 } 620 }
618 621
619 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 622 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
620 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 623 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
621 624
622 status = ocfs2_journal_dirty(handle, di_bh); 625 status = ocfs2_journal_dirty(handle, di_bh);
623 if (status < 0) { 626 if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
635bail_commit: 638bail_commit:
636 ocfs2_commit_trans(osb, handle); 639 ocfs2_commit_trans(osb, handle);
637bail_unlock: 640bail_unlock:
638 ocfs2_meta_unlock(inode_alloc_inode, 1); 641 ocfs2_inode_unlock(inode_alloc_inode, 1);
639 mutex_unlock(&inode_alloc_inode->i_mutex); 642 mutex_unlock(&inode_alloc_inode->i_mutex);
640 brelse(inode_alloc_bh); 643 brelse(inode_alloc_bh);
641bail: 644bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
709 * delete_inode operation. We do this now to avoid races with 712 * delete_inode operation. We do this now to avoid races with
710 * recovery completion on other nodes. */ 713 * recovery completion on other nodes. */
711 mutex_lock(&orphan_dir_inode->i_mutex); 714 mutex_lock(&orphan_dir_inode->i_mutex);
712 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 715 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
713 if (status < 0) { 716 if (status < 0) {
714 mutex_unlock(&orphan_dir_inode->i_mutex); 717 mutex_unlock(&orphan_dir_inode->i_mutex);
715 718
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
718 } 721 }
719 722
720 /* we do this while holding the orphan dir lock because we 723 /* we do this while holding the orphan dir lock because we
721 * don't want recovery being run from another node to vote for 724 * don't want recovery being run from another node to try an
722 * an inode delete on us -- this will result in two nodes 725 * inode delete underneath us -- this will result in two nodes
723 * truncating the same file! */ 726 * truncating the same file! */
724 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 727 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
725 if (status < 0) { 728 if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
733 mlog_errno(status); 736 mlog_errno(status);
734 737
735bail_unlock_dir: 738bail_unlock_dir:
736 ocfs2_meta_unlock(orphan_dir_inode, 1); 739 ocfs2_inode_unlock(orphan_dir_inode, 1);
737 mutex_unlock(&orphan_dir_inode->i_mutex); 740 mutex_unlock(&orphan_dir_inode->i_mutex);
738 brelse(orphan_dir_bh); 741 brelse(orphan_dir_bh);
739bail: 742bail:
@@ -744,7 +747,7 @@ bail:
744} 747}
745 748
746/* There is a series of simple checks that should be done before a 749/* There is a series of simple checks that should be done before a
747 * vote is even considered. Encapsulate those in this function. */ 750 * trylock is even considered. Encapsulate those in this function. */
748static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 751static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
749{ 752{
750 int ret = 0; 753 int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
758 goto bail; 761 goto bail;
759 } 762 }
760 763
761 /* If we're coming from process_vote we can't go into our own 764 /* If we're coming from downconvert_thread we can't go into our own
762 * voting [hello, deadlock city!], so unforuntately we just 765 * voting [hello, deadlock city!], so unforuntately we just
763 * have to skip deleting this guy. That's OK though because 766 * have to skip deleting this guy. That's OK though because
764 * the node who's doing the actual deleting should handle it 767 * the node who's doing the actual deleting should handle it
765 * anyway. */ 768 * anyway. */
766 if (current == osb->vote_task) { 769 if (current == osb->dc_task) {
767 mlog(0, "Skipping delete of %lu because we're currently " 770 mlog(0, "Skipping delete of %lu because we're currently "
768 "in process_vote\n", inode->i_ino); 771 "in downconvert\n", inode->i_ino);
769 goto bail; 772 goto bail;
770 } 773 }
771 774
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
779 goto bail_unlock; 782 goto bail_unlock;
780 } 783 }
781 784
782 /* If we have voted "yes" on the wipe of this inode for 785 /* If we have allowed wipe of this inode for another node, it
783 * another node, it will be marked here so we can safely skip 786 * will be marked here so we can safely skip it. Recovery will
784 * it. Recovery will cleanup any inodes we might inadvertantly 787 * cleanup any inodes we might inadvertently skip here. */
785 * skip here. */
786 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 788 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
787 mlog(0, "Skipping delete of %lu because another node " 789 mlog(0, "Skipping delete of %lu because another node "
788 "has done this for us.\n", inode->i_ino); 790 "has done this for us.\n", inode->i_ino);
@@ -863,7 +865,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
863 status = ocfs2_try_open_lock(inode, 1); 865 status = ocfs2_try_open_lock(inode, 1);
864 if (status == -EAGAIN) { 866 if (status == -EAGAIN) {
865 status = 0; 867 status = 0;
866 mlog(0, "Skipping delete of %llu because it is in use on" 868 mlog(0, "Skipping delete of %llu because it is in use on "
867 "other nodes\n", (unsigned long long)oi->ip_blkno); 869 "other nodes\n", (unsigned long long)oi->ip_blkno);
868 goto bail; 870 goto bail;
869 } 871 }
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
929 931
930 /* Lock down the inode. This gives us an up to date view of 932 /* Lock down the inode. This gives us an up to date view of
931 * it's metadata (for verification), and allows us to 933 * it's metadata (for verification), and allows us to
932 * serialize delete_inode votes. 934 * serialize delete_inode on multiple nodes.
933 * 935 *
934 * Even though we might be doing a truncate, we don't take the 936 * Even though we might be doing a truncate, we don't take the
935 * allocation lock here as it won't be needed - nobody will 937 * allocation lock here as it won't be needed - nobody will
936 * have the file open. 938 * have the file open.
937 */ 939 */
938 status = ocfs2_meta_lock(inode, &di_bh, 1); 940 status = ocfs2_inode_lock(inode, &di_bh, 1);
939 if (status < 0) { 941 if (status < 0) {
940 if (status != -ENOENT) 942 if (status != -ENOENT)
941 mlog_errno(status); 943 mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
947 * before we go ahead and wipe the inode. */ 949 * before we go ahead and wipe the inode. */
948 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 950 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
949 if (!wipe || status < 0) { 951 if (!wipe || status < 0) {
950 /* Error and inode busy vote both mean we won't be 952 /* Error and remote inode busy both mean we won't be
951 * removing the inode, so they take almost the same 953 * removing the inode, so they take almost the same
952 * path. */ 954 * path. */
953 if (status < 0) 955 if (status < 0)
954 mlog_errno(status); 956 mlog_errno(status);
955 957
956 /* Someone in the cluster has voted to not wipe this 958 /* Someone in the cluster has disallowed a wipe of
957 * inode, or it was never completely orphaned. Write 959 * this inode, or it was never completely
958 * out the pages and exit now. */ 960 * orphaned. Write out the pages and exit now. */
959 ocfs2_cleanup_delete_inode(inode, 1); 961 ocfs2_cleanup_delete_inode(inode, 1);
960 goto bail_unlock_inode; 962 goto bail_unlock_inode;
961 } 963 }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
981 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 983 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
982 984
983bail_unlock_inode: 985bail_unlock_inode:
984 ocfs2_meta_unlock(inode, 1); 986 ocfs2_inode_unlock(inode, 1);
985 brelse(di_bh); 987 brelse(di_bh);
986bail_unblock: 988bail_unblock:
987 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 989 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
1008 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1010 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1009 "Inode=%lu\n", inode->i_ino); 1011 "Inode=%lu\n", inode->i_ino);
1010 1012
1011 /* For remove delete_inode vote, we hold open lock before, 1013 /* To prevent remote deletes we hold open lock before, now it
1012 * now it is time to unlock PR and EX open locks. */ 1014 * is time to unlock PR and EX open locks. */
1013 ocfs2_open_unlock(inode); 1015 ocfs2_open_unlock(inode);
1014 1016
1015 /* Do these before all the other work so that we don't bounce 1017 /* Do these before all the other work so that we don't bounce
1016 * the vote thread while waiting to destroy the locks. */ 1018 * the downconvert thread while waiting to destroy the locks. */
1017 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1019 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1018 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1020 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1019 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1020 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1021 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1021 1022
1022 /* We very well may get a clear_inode before all an inodes 1023 /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
1039 mlog_errno(status); 1040 mlog_errno(status);
1040 1041
1041 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1042 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1042 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1043 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1043 ocfs2_lock_res_free(&oi->ip_data_lockres);
1044 ocfs2_lock_res_free(&oi->ip_open_lockres); 1044 ocfs2_lock_res_free(&oi->ip_open_lockres);
1045 1045
1046 ocfs2_metadata_cache_purge(inode); 1046 ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1184 } 1184 }
1185 spin_unlock(&OCFS2_I(inode)->ip_lock); 1185 spin_unlock(&OCFS2_I(inode)->ip_lock);
1186 1186
1187 /* Let ocfs2_meta_lock do the work of updating our struct 1187 /* Let ocfs2_inode_lock do the work of updating our struct
1188 * inode for us. */ 1188 * inode for us. */
1189 status = ocfs2_meta_lock(inode, NULL, 0); 1189 status = ocfs2_inode_lock(inode, NULL, 0);
1190 if (status < 0) { 1190 if (status < 0) {
1191 if (status != -ENOENT) 1191 if (status != -ENOENT)
1192 mlog_errno(status); 1192 mlog_errno(status);
1193 goto bail; 1193 goto bail;
1194 } 1194 }
1195 ocfs2_meta_unlock(inode, 0); 1195 ocfs2_inode_unlock(inode, 0);
1196bail: 1196bail:
1197 mlog_exit(status); 1197 mlog_exit(status);
1198 1198
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
34 u64 ip_blkno; 34 u64 ip_blkno;
35 35
36 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
37 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_inode_lockres;
38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres; 38 struct ocfs2_lock_res ip_open_lockres;
40 39
41 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
121void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
122 121
123/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
124#define OCFS2_FI_FLAG_SYSFILE 0x4 123#define OCFS2_FI_FLAG_SYSFILE 0x1
125#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
126struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
126 int sysfile_type);
127int ocfs2_inode_init_private(struct inode *inode); 127int ocfs2_inode_init_private(struct inode *inode);
128int ocfs2_inode_revalidate(struct dentry *dentry); 128int ocfs2_inode_revalidate(struct dentry *dentry);
129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
20 20
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h"
23 24
24#include <linux/ext2_fs.h> 25#include <linux/ext2_fs.h>
25 26
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
27{ 28{
28 int status; 29 int status;
29 30
30 status = ocfs2_meta_lock(inode, NULL, 0); 31 status = ocfs2_inode_lock(inode, NULL, 0);
31 if (status < 0) { 32 if (status < 0) {
32 mlog_errno(status); 33 mlog_errno(status);
33 return status; 34 return status;
34 } 35 }
35 ocfs2_get_inode_flags(OCFS2_I(inode)); 36 ocfs2_get_inode_flags(OCFS2_I(inode));
36 *flags = OCFS2_I(inode)->ip_attr; 37 *flags = OCFS2_I(inode)->ip_attr;
37 ocfs2_meta_unlock(inode, 0); 38 ocfs2_inode_unlock(inode, 0);
38 39
39 mlog_exit(status); 40 mlog_exit(status);
40 return status; 41 return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
52 53
53 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
54 55
55 status = ocfs2_meta_lock(inode, &bh, 1); 56 status = ocfs2_inode_lock(inode, &bh, 1);
56 if (status < 0) { 57 if (status < 0) {
57 mlog_errno(status); 58 mlog_errno(status);
58 goto bail; 59 goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
100 101
101 ocfs2_commit_trans(osb, handle); 102 ocfs2_commit_trans(osb, handle);
102bail_unlock: 103bail_unlock:
103 ocfs2_meta_unlock(inode, 1); 104 ocfs2_inode_unlock(inode, 1);
104bail: 105bail:
105 mutex_unlock(&inode->i_mutex); 106 mutex_unlock(&inode->i_mutex);
106 107
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115 unsigned int cmd, unsigned long arg) 116 unsigned int cmd, unsigned long arg)
116{ 117{
117 unsigned int flags; 118 unsigned int flags;
119 int new_clusters;
118 int status; 120 int status;
119 struct ocfs2_space_resv sr; 121 struct ocfs2_space_resv sr;
122 struct ocfs2_new_group_input input;
120 123
121 switch (cmd) { 124 switch (cmd) {
122 case OCFS2_IOC_GETFLAGS: 125 case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
140 return -EFAULT; 143 return -EFAULT;
141 144
142 return ocfs2_change_file_space(filp, cmd, &sr); 145 return ocfs2_change_file_space(filp, cmd, &sr);
146 case OCFS2_IOC_GROUP_EXTEND:
147 if (!capable(CAP_SYS_RESOURCE))
148 return -EPERM;
149
150 if (get_user(new_clusters, (int __user *)arg))
151 return -EFAULT;
152
153 return ocfs2_group_extend(inode, new_clusters);
154 case OCFS2_IOC_GROUP_ADD:
155 case OCFS2_IOC_GROUP_ADD64:
156 if (!capable(CAP_SYS_RESOURCE))
157 return -EPERM;
158
159 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
160 return -EFAULT;
161
162 return ocfs2_group_add(inode, &input);
143 default: 163 default:
144 return -ENOTTY; 164 return -ENOTTY;
145 } 165 }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
162 case OCFS2_IOC_RESVSP64: 182 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP: 183 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64: 184 case OCFS2_IOC_UNRESVSP64:
185 case OCFS2_IOC_GROUP_EXTEND:
186 case OCFS2_IOC_GROUP_ADD:
187 case OCFS2_IOC_GROUP_ADD64:
165 break; 188 break;
166 default: 189 default:
167 return -ENOIOCTLCMD; 190 return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f9d01e25298d..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
44#include "localalloc.h" 44#include "localalloc.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h"
48#include "sysfile.h" 47#include "sysfile.h"
49 48
50#include "buffer_head_io.h" 49#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 102 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 103 journal->j_trans_id, flushed);
105 104
106 ocfs2_kick_vote_thread(osb); 105 ocfs2_wake_downconvert_thread(osb);
107 wake_up(&journal->j_checkpointed); 106 wake_up(&journal->j_checkpointed);
108finally: 107finally:
109 mlog_exit(status); 108 mlog_exit(status);
@@ -174,6 +173,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
174 * transaction. extend_trans will either extend the current handle by 173 * transaction. extend_trans will either extend the current handle by
175 * nblocks, or commit it and start a new one with nblocks credits. 174 * nblocks, or commit it and start a new one with nblocks credits.
176 * 175 *
176 * This might call journal_restart() which will commit dirty buffers
177 * and then restart the transaction. Before calling
178 * ocfs2_extend_trans(), any changed blocks should have been
179 * dirtied. After calling it, all blocks which need to be changed must
180 * go through another set of journal_access/journal_dirty calls.
181 *
177 * WARNING: This will not release any semaphores or disk locks taken 182 * WARNING: This will not release any semaphores or disk locks taken
178 * during the transaction, so make sure they were taken *before* 183 * during the transaction, so make sure they were taken *before*
179 * start_trans or we'll have ordering deadlocks. 184 * start_trans or we'll have ordering deadlocks.
@@ -193,11 +198,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
193 198
194 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 199 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
195 200
201#ifdef OCFS2_DEBUG_FS
202 status = 1;
203#else
196 status = journal_extend(handle, nblocks); 204 status = journal_extend(handle, nblocks);
197 if (status < 0) { 205 if (status < 0) {
198 mlog_errno(status); 206 mlog_errno(status);
199 goto bail; 207 goto bail;
200 } 208 }
209#endif
201 210
202 if (status > 0) { 211 if (status > 0) {
203 mlog(0, "journal_extend failed, trying journal_restart\n"); 212 mlog(0, "journal_extend failed, trying journal_restart\n");
@@ -304,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
304 return err; 313 return err;
305} 314}
306 315
307#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 316#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
308 317
309void ocfs2_set_journal_params(struct ocfs2_super *osb) 318void ocfs2_set_journal_params(struct ocfs2_super *osb)
310{ 319{
311 journal_t *journal = osb->journal->j_journal; 320 journal_t *journal = osb->journal->j_journal;
321 unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
322
323 if (osb->osb_commit_interval)
324 commit_interval = osb->osb_commit_interval;
312 325
313 spin_lock(&journal->j_state_lock); 326 spin_lock(&journal->j_state_lock);
314 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 327 journal->j_commit_interval = commit_interval;
315 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 328 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
316 journal->j_flags |= JFS_BARRIER; 329 journal->j_flags |= JFS_BARRIER;
317 else 330 else
@@ -327,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
327 struct ocfs2_dinode *di = NULL; 340 struct ocfs2_dinode *di = NULL;
328 struct buffer_head *bh = NULL; 341 struct buffer_head *bh = NULL;
329 struct ocfs2_super *osb; 342 struct ocfs2_super *osb;
330 int meta_lock = 0; 343 int inode_lock = 0;
331 344
332 mlog_entry_void(); 345 mlog_entry_void();
333 346
@@ -357,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
357 /* Skip recovery waits here - journal inode metadata never 370 /* Skip recovery waits here - journal inode metadata never
358 * changes in a live cluster so it can be considered an 371 * changes in a live cluster so it can be considered an
359 * exception to the rule. */ 372 * exception to the rule. */
360 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 373 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
361 if (status < 0) { 374 if (status < 0) {
362 if (status != -ERESTARTSYS) 375 if (status != -ERESTARTSYS)
363 mlog(ML_ERROR, "Could not get lock on journal!\n"); 376 mlog(ML_ERROR, "Could not get lock on journal!\n");
364 goto done; 377 goto done;
365 } 378 }
366 379
367 meta_lock = 1; 380 inode_lock = 1;
368 di = (struct ocfs2_dinode *)bh->b_data; 381 di = (struct ocfs2_dinode *)bh->b_data;
369 382
370 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 383 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -404,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
404 status = 0; 417 status = 0;
405done: 418done:
406 if (status < 0) { 419 if (status < 0) {
407 if (meta_lock) 420 if (inode_lock)
408 ocfs2_meta_unlock(inode, 1); 421 ocfs2_inode_unlock(inode, 1);
409 if (bh != NULL) 422 if (bh != NULL)
410 brelse(bh); 423 brelse(bh);
411 if (inode) { 424 if (inode) {
@@ -534,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
534 OCFS2_I(inode)->ip_open_count--; 547 OCFS2_I(inode)->ip_open_count--;
535 548
536 /* unlock our journal */ 549 /* unlock our journal */
537 ocfs2_meta_unlock(inode, 1); 550 ocfs2_inode_unlock(inode, 1);
538 551
539 brelse(journal->j_bh); 552 brelse(journal->j_bh);
540 journal->j_bh = NULL; 553 journal->j_bh = NULL;
@@ -873,8 +886,8 @@ restart:
873 ocfs2_super_unlock(osb, 1); 886 ocfs2_super_unlock(osb, 1);
874 887
875 /* We always run recovery on our own orphan dir - the dead 888 /* We always run recovery on our own orphan dir - the dead
876 * node(s) may have voted "no" on an inode delete earlier. A 889 * node(s) may have disallowed a previous inode delete. Re-processing
877 * revote is therefore required. */ 890 * is therefore required. */
878 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 891 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
879 NULL); 892 NULL);
880 893
@@ -963,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
963 } 976 }
964 SET_INODE_JOURNAL(inode); 977 SET_INODE_JOURNAL(inode);
965 978
966 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 979 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
967 if (status < 0) { 980 if (status < 0) {
968 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 981 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
969 if (status != -ERESTARTSYS) 982 if (status != -ERESTARTSYS)
970 mlog(ML_ERROR, "Could not lock journal!\n"); 983 mlog(ML_ERROR, "Could not lock journal!\n");
971 goto done; 984 goto done;
@@ -1037,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1037done: 1050done:
1038 /* drop the lock on this nodes journal */ 1051 /* drop the lock on this nodes journal */
1039 if (got_lock) 1052 if (got_lock)
1040 ocfs2_meta_unlock(inode, 1); 1053 ocfs2_inode_unlock(inode, 1);
1041 1054
1042 if (inode) 1055 if (inode)
1043 iput(inode); 1056 iput(inode);
@@ -1152,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1152 SET_INODE_JOURNAL(inode); 1165 SET_INODE_JOURNAL(inode);
1153 1166
1154 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1167 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1155 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1168 status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
1156 if (status < 0) { 1169 if (status < 0) {
1157 if (status != -EAGAIN) 1170 if (status != -EAGAIN)
1158 mlog_errno(status); 1171 mlog_errno(status);
1159 goto bail; 1172 goto bail;
1160 } 1173 }
1161 1174
1162 ocfs2_meta_unlock(inode, 1); 1175 ocfs2_inode_unlock(inode, 1);
1163bail: 1176bail:
1164 if (inode) 1177 if (inode)
1165 iput(inode); 1178 iput(inode);
@@ -1231,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1231 1244
1232 /* Skip bad inodes so that recovery can continue */ 1245 /* Skip bad inodes so that recovery can continue */
1233 iter = ocfs2_iget(p->osb, ino, 1246 iter = ocfs2_iget(p->osb, ino,
1234 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1247 OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
1235 if (IS_ERR(iter)) 1248 if (IS_ERR(iter))
1236 return 0; 1249 return 0;
1237 1250
@@ -1267,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1267 } 1280 }
1268 1281
1269 mutex_lock(&orphan_dir_inode->i_mutex); 1282 mutex_lock(&orphan_dir_inode->i_mutex);
1270 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1283 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
1271 if (status < 0) { 1284 if (status < 0) {
1272 mlog_errno(status); 1285 mlog_errno(status);
1273 goto out; 1286 goto out;
@@ -1277,12 +1290,13 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1277 ocfs2_orphan_filldir); 1290 ocfs2_orphan_filldir);
1278 if (status) { 1291 if (status) {
1279 mlog_errno(status); 1292 mlog_errno(status);
1280 goto out; 1293 goto out_cluster;
1281 } 1294 }
1282 1295
1283 *head = priv.head; 1296 *head = priv.head;
1284 1297
1285 ocfs2_meta_unlock(orphan_dir_inode, 0); 1298out_cluster:
1299 ocfs2_inode_unlock(orphan_dir_inode, 0);
1286out: 1300out:
1287 mutex_unlock(&orphan_dir_inode->i_mutex); 1301 mutex_unlock(&orphan_dir_inode->i_mutex);
1288 iput(orphan_dir_inode); 1302 iput(orphan_dir_inode);
@@ -1369,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1369 iter = oi->ip_next_orphan; 1383 iter = oi->ip_next_orphan;
1370 1384
1371 spin_lock(&oi->ip_lock); 1385 spin_lock(&oi->ip_lock);
1372 /* Delete voting may have set these on the assumption 1386 /* The remote delete code may have set these on the
1373 * that the other node would wipe them successfully. 1387 * assumption that the other node would wipe them
1374 * If they are still in the node's orphan dir, we need 1388 * successfully. If they are still in the node's
1375 * to reset that state. */ 1389 * orphan dir, we need to reset that state. */
1376 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1390 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1377 1391
1378 /* Set the proper information to get us going into 1392 /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
278/* simple file updates like chmod, etc. */ 278/* simple file updates like chmod, etc. */
279#define OCFS2_INODE_UPDATE_CREDITS 1 279#define OCFS2_INODE_UPDATE_CREDITS 1
280 280
281/* group extend. inode update and last group update. */
282#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
283
284/* group add. inode update and the new group update. */
285#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
286
281/* get one bit out of a suballocator: dinode + group descriptor + 287/* get one bit out of a suballocator: dinode + group descriptor +
282 * prev. group desc. if we relink. */ 288 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 289#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index d272847d5a07..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
77 77
78/*
79 * Determine how large our local alloc window should be, in bits.
80 *
81 * These values (and the behavior in ocfs2_alloc_should_use_local) have
82 * been chosen so that most allocations, including new block groups go
83 * through local alloc.
84 */
85static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
86{ 79{
87 BUG_ON(osb->s_clustersize_bits < 12); 80 BUG_ON(osb->s_clustersize_bits > 20);
88 81
89 return 2048 >> (osb->s_clustersize_bits - 12); 82 /* Size local alloc windows by the megabyte */
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
90} 84}
91 85
92/* 86/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
96int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
97{ 91{
98 int la_bits = ocfs2_local_alloc_window_bits(osb); 92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0;
99 94
100 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 95 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
101 return 0; 96 goto bail;
102 97
103 /* la_bits should be at least twice the size (in clusters) of 98 /* la_bits should be at least twice the size (in clusters) of
104 * a new block group. We want to be sure block group 99 * a new block group. We want to be sure block group
105 * allocations go through the local alloc, so allow an 100 * allocations go through the local alloc, so allow an
106 * allocation to take up to half the bitmap. */ 101 * allocation to take up to half the bitmap. */
107 if (bits > (la_bits / 2)) 102 if (bits > (la_bits / 2))
108 return 0; 103 goto bail;
109 104
110 return 1; 105 ret = 1;
106bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
109 return ret;
111} 110}
112 111
113int ocfs2_load_local_alloc(struct ocfs2_super *osb) 112int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
121 120
122 mlog_entry_void(); 121 mlog_entry_void();
123 122
123 if (ocfs2_mount_local(osb))
124 goto bail;
125
126 if (osb->local_alloc_size == 0)
127 goto bail;
128
129 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
130 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
131 "than max possible %u. Using defaults.\n",
132 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
133 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
134 }
135
124 /* read the alloc off disk */ 136 /* read the alloc off disk */
125 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 137 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
126 osb->slot_num); 138 osb->slot_num);
@@ -181,6 +193,9 @@ bail:
181 if (inode) 193 if (inode)
182 iput(inode); 194 iput(inode);
183 195
196 mlog(0, "Local alloc window bits = %d\n",
197 ocfs2_local_alloc_window_bits(osb));
198
184 mlog_exit(status); 199 mlog_exit(status);
185 return status; 200 return status;
186} 201}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
231 246
232 mutex_lock(&main_bm_inode->i_mutex); 247 mutex_lock(&main_bm_inode->i_mutex);
233 248
234 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 249 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
235 if (status < 0) { 250 if (status < 0) {
236 mlog_errno(status); 251 mlog_errno(status);
237 goto out_mutex; 252 goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
286 if (main_bm_bh) 301 if (main_bm_bh)
287 brelse(main_bm_bh); 302 brelse(main_bm_bh);
288 303
289 ocfs2_meta_unlock(main_bm_inode, 1); 304 ocfs2_inode_unlock(main_bm_inode, 1);
290 305
291out_mutex: 306out_mutex:
292 mutex_unlock(&main_bm_inode->i_mutex); 307 mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
399 414
400 mutex_lock(&main_bm_inode->i_mutex); 415 mutex_lock(&main_bm_inode->i_mutex);
401 416
402 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 417 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
403 if (status < 0) { 418 if (status < 0) {
404 mlog_errno(status); 419 mlog_errno(status);
405 goto out_mutex; 420 goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
424 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
425 440
426out_unlock: 441out_unlock:
427 ocfs2_meta_unlock(main_bm_inode, 1); 442 ocfs2_inode_unlock(main_bm_inode, 1);
428 443
429out_mutex: 444out_mutex:
430 mutex_unlock(&main_bm_inode->i_mutex); 445 mutex_unlock(&main_bm_inode->i_mutex);
@@ -484,6 +499,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
484 499
485 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 500 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
486 501
502#ifdef OCFS2_DEBUG_FS
487 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 503 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
488 ocfs2_local_alloc_count_bits(alloc)) { 504 ocfs2_local_alloc_count_bits(alloc)) {
489 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 505 ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -494,6 +510,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
494 status = -EIO; 510 status = -EIO;
495 goto bail; 511 goto bail;
496 } 512 }
513#endif
497 514
498 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - 515 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
499 le32_to_cpu(alloc->id1.bitmap1.i_used); 516 le32_to_cpu(alloc->id1.bitmap1.i_used);
@@ -519,6 +536,9 @@ bail:
519 iput(local_alloc_inode); 536 iput(local_alloc_inode);
520 } 537 }
521 538
539 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
540 status);
541
522 mlog_exit(status); 542 mlog_exit(status);
523 return status; 543 return status;
524} 544}
@@ -712,9 +732,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
712 void *bitmap; 732 void *bitmap;
713 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 733 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
714 734
715 mlog_entry("total = %u, COUNT = %u, used = %u\n", 735 mlog_entry("total = %u, used = %u\n",
716 le32_to_cpu(alloc->id1.bitmap1.i_total), 736 le32_to_cpu(alloc->id1.bitmap1.i_total),
717 ocfs2_local_alloc_count_bits(alloc),
718 le32_to_cpu(alloc->id1.bitmap1.i_used)); 737 le32_to_cpu(alloc->id1.bitmap1.i_used));
719 738
720 if (!alloc->id1.bitmap1.i_total) { 739 if (!alloc->id1.bitmap1.i_total) {
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * locks.c
5 *
6 * Userspace file locking support
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27
28#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "dlmglue.h"
34#include "file.h"
35#include "locks.h"
36
37static int ocfs2_do_flock(struct file *file, struct inode *inode,
38 int cmd, struct file_lock *fl)
39{
40 int ret = 0, level = 0, trylock = 0;
41 struct ocfs2_file_private *fp = file->private_data;
42 struct ocfs2_lock_res *lockres = &fp->fp_flock;
43
44 if (fl->fl_type == F_WRLCK)
45 level = 1;
46 if (!IS_SETLKW(cmd))
47 trylock = 1;
48
49 mutex_lock(&fp->fp_mutex);
50
51 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
52 lockres->l_level > LKM_NLMODE) {
53 int old_level = 0;
54
55 if (lockres->l_level == LKM_EXMODE)
56 old_level = 1;
57
58 if (level == old_level)
59 goto out;
60
61 /*
62 * Converting an existing lock is not guaranteed to be
63 * atomic, so we can get away with simply unlocking
64 * here and allowing the lock code to try at the new
65 * level.
66 */
67
68 flock_lock_file_wait(file,
69 &(struct file_lock){.fl_type = F_UNLCK});
70
71 ocfs2_file_unlock(file);
72 }
73
74 ret = ocfs2_file_lock(file, level, trylock);
75 if (ret) {
76 if (ret == -EAGAIN && trylock)
77 ret = -EWOULDBLOCK;
78 else
79 mlog_errno(ret);
80 goto out;
81 }
82
83 ret = flock_lock_file_wait(file, fl);
84
85out:
86 mutex_unlock(&fp->fp_mutex);
87
88 return ret;
89}
90
91static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
92{
93 int ret;
94 struct ocfs2_file_private *fp = file->private_data;
95
96 mutex_lock(&fp->fp_mutex);
97 ocfs2_file_unlock(file);
98 ret = flock_lock_file_wait(file, fl);
99 mutex_unlock(&fp->fp_mutex);
100
101 return ret;
102}
103
104/*
105 * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
106 */
107int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
108{
109 struct inode *inode = file->f_mapping->host;
110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
111
112 if (!(fl->fl_flags & FL_FLOCK))
113 return -ENOLCK;
114 if (__mandatory_lock(inode))
115 return -ENOLCK;
116
117 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
118 ocfs2_mount_local(osb))
119 return flock_lock_file_wait(file, fl);
120
121 if (fl->fl_type == F_UNLCK)
122 return ocfs2_do_funlock(file, cmd, fl);
123 else
124 return ocfs2_do_flock(file, inode, cmd, fl);
125}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
1/* -*- mode: c; c-basic-offset: 8; -*- 1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * vote.h 4 * locks.h
5 * 5 *
6 * description here 6 * Function prototypes for Userspace file locking support
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
@@ -23,26 +23,9 @@
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26#ifndef OCFS2_LOCKS_H
27#define OCFS2_LOCKS_H
26 28
27#ifndef VOTE_H 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
28#define VOTE_H
29 30
30int ocfs2_vote_thread(void *arg); 31#endif /* OCFS2_LOCKS_H */
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
45
46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
47 int node_num);
48#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
168 * node. Taking the data lock will also ensure that we don't 168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert. 169 * attempt page truncation as part of a downconvert.
170 */ 170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1); 171 ret = ocfs2_inode_lock(inode, &di_bh, 1);
172 if (ret < 0) { 172 if (ret < 0) {
173 mlog_errno(ret); 173 mlog_errno(ret);
174 goto out; 174 goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
181 */ 181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem); 182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183 183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 184 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191 185
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem); 186 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196 187
197 brelse(di_bh); 188 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1); 189 ocfs2_inode_unlock(inode, 1);
199 190
200out: 191out:
201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 192 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
214{ 205{
215 int ret = 0, lock_level = 0; 206 int ret = 0, lock_level = 0;
216 207
217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 208 ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
218 file->f_vfsmnt, &lock_level); 209 file->f_vfsmnt, &lock_level);
219 if (ret < 0) { 210 if (ret < 0) {
220 mlog_errno(ret); 211 mlog_errno(ret);
221 goto out; 212 goto out;
222 } 213 }
223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 214 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
224out: 215out:
225 vma->vm_ops = &ocfs2_file_vm_ops; 216 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR; 217 vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 729259016c18..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "vote.h"
64 63
65#include "buffer_head_io.h" 64#include "buffer_head_io.h"
66 65
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
116 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 115 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
117 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 116 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
118 117
119 status = ocfs2_meta_lock(dir, NULL, 0); 118 status = ocfs2_inode_lock(dir, NULL, 0);
120 if (status < 0) { 119 if (status < 0) {
121 if (status != -ENOENT) 120 if (status != -ENOENT)
122 mlog_errno(status); 121 mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
129 if (status < 0) 128 if (status < 0)
130 goto bail_add; 129 goto bail_add;
131 130
132 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 131 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
133 if (IS_ERR(inode)) { 132 if (IS_ERR(inode)) {
134 ret = ERR_PTR(-EACCES); 133 ret = ERR_PTR(-EACCES);
135 goto bail_unlock; 134 goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
176 /* Don't drop the cluster lock until *after* the d_add -- 175 /* Don't drop the cluster lock until *after* the d_add --
177 * unlink on another node will message us to remove that 176 * unlink on another node will message us to remove that
178 * dentry under this lock so otherwise we can race this with 177 * dentry under this lock so otherwise we can race this with
179 * the vote thread and have a stale dentry. */ 178 * the downconvert thread and have a stale dentry. */
180 ocfs2_meta_unlock(dir, 0); 179 ocfs2_inode_unlock(dir, 0);
181 180
182bail: 181bail:
183 182
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
209 /* get our super block */ 208 /* get our super block */
210 osb = OCFS2_SB(dir->i_sb); 209 osb = OCFS2_SB(dir->i_sb);
211 210
212 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 211 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
213 if (status < 0) { 212 if (status < 0) {
214 if (status != -ENOENT) 213 if (status != -ENOENT)
215 mlog_errno(status); 214 mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
323 if (handle) 322 if (handle)
324 ocfs2_commit_trans(osb, handle); 323 ocfs2_commit_trans(osb, handle);
325 324
326 ocfs2_meta_unlock(dir, 1); 325 ocfs2_inode_unlock(dir, 1);
327 326
328 if (status == -ENOSPC) 327 if (status == -ENOSPC)
329 mlog(0, "Disk is full\n"); 328 mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
553 if (S_ISDIR(inode->i_mode)) 552 if (S_ISDIR(inode->i_mode))
554 return -EPERM; 553 return -EPERM;
555 554
556 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 555 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
557 if (err < 0) { 556 if (err < 0) {
558 if (err != -ENOENT) 557 if (err != -ENOENT)
559 mlog_errno(err); 558 mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
578 goto out; 577 goto out;
579 } 578 }
580 579
581 err = ocfs2_meta_lock(inode, &fe_bh, 1); 580 err = ocfs2_inode_lock(inode, &fe_bh, 1);
582 if (err < 0) { 581 if (err < 0) {
583 if (err != -ENOENT) 582 if (err != -ENOENT)
584 mlog_errno(err); 583 mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
643out_commit: 642out_commit:
644 ocfs2_commit_trans(osb, handle); 643 ocfs2_commit_trans(osb, handle);
645out_unlock_inode: 644out_unlock_inode:
646 ocfs2_meta_unlock(inode, 1); 645 ocfs2_inode_unlock(inode, 1);
647 646
648out: 647out:
649 ocfs2_meta_unlock(dir, 1); 648 ocfs2_inode_unlock(dir, 1);
650 649
651 if (de_bh) 650 if (de_bh)
652 brelse(de_bh); 651 brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
720 return -EPERM; 719 return -EPERM;
721 } 720 }
722 721
723 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 722 status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
724 if (status < 0) { 723 if (status < 0) {
725 if (status != -ENOENT) 724 if (status != -ENOENT)
726 mlog_errno(status); 725 mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
745 goto leave; 744 goto leave;
746 } 745 }
747 746
748 status = ocfs2_meta_lock(inode, &fe_bh, 1); 747 status = ocfs2_inode_lock(inode, &fe_bh, 1);
749 if (status < 0) { 748 if (status < 0) {
750 if (status != -ENOENT) 749 if (status != -ENOENT)
751 mlog_errno(status); 750 mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
765 764
766 status = ocfs2_remote_dentry_delete(dentry); 765 status = ocfs2_remote_dentry_delete(dentry);
767 if (status < 0) { 766 if (status < 0) {
768 /* This vote should succeed under all normal 767 /* This remote delete should succeed under all normal
769 * circumstances. */ 768 * circumstances. */
770 mlog_errno(status); 769 mlog_errno(status);
771 goto leave; 770 goto leave;
@@ -841,13 +840,13 @@ leave:
841 ocfs2_commit_trans(osb, handle); 840 ocfs2_commit_trans(osb, handle);
842 841
843 if (child_locked) 842 if (child_locked)
844 ocfs2_meta_unlock(inode, 1); 843 ocfs2_inode_unlock(inode, 1);
845 844
846 ocfs2_meta_unlock(dir, 1); 845 ocfs2_inode_unlock(dir, 1);
847 846
848 if (orphan_dir) { 847 if (orphan_dir) {
849 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 848 /* This was locked for us in ocfs2_prepare_orphan_dir() */
850 ocfs2_meta_unlock(orphan_dir, 1); 849 ocfs2_inode_unlock(orphan_dir, 1);
851 mutex_unlock(&orphan_dir->i_mutex); 850 mutex_unlock(&orphan_dir->i_mutex);
852 iput(orphan_dir); 851 iput(orphan_dir);
853 } 852 }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
908 inode1 = tmpinode; 907 inode1 = tmpinode;
909 } 908 }
910 /* lock id2 */ 909 /* lock id2 */
911 status = ocfs2_meta_lock(inode2, bh2, 1); 910 status = ocfs2_inode_lock(inode2, bh2, 1);
912 if (status < 0) { 911 if (status < 0) {
913 if (status != -ENOENT) 912 if (status != -ENOENT)
914 mlog_errno(status); 913 mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
917 } 916 }
918 917
919 /* lock id1 */ 918 /* lock id1 */
920 status = ocfs2_meta_lock(inode1, bh1, 1); 919 status = ocfs2_inode_lock(inode1, bh1, 1);
921 if (status < 0) { 920 if (status < 0) {
922 /* 921 /*
923 * An error return must mean that no cluster locks 922 * An error return must mean that no cluster locks
924 * were held on function exit. 923 * were held on function exit.
925 */ 924 */
926 if (oi1->ip_blkno != oi2->ip_blkno) 925 if (oi1->ip_blkno != oi2->ip_blkno)
927 ocfs2_meta_unlock(inode2, 1); 926 ocfs2_inode_unlock(inode2, 1);
928 927
929 if (status != -ENOENT) 928 if (status != -ENOENT)
930 mlog_errno(status); 929 mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
937 936
938static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) 937static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
939{ 938{
940 ocfs2_meta_unlock(inode1, 1); 939 ocfs2_inode_unlock(inode1, 1);
941 940
942 if (inode1 != inode2) 941 if (inode1 != inode2)
943 ocfs2_meta_unlock(inode2, 1); 942 ocfs2_inode_unlock(inode2, 1);
944} 943}
945 944
946static int ocfs2_rename(struct inode *old_dir, 945static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
1031 1030
1032 /* 1031 /*
1033 * Aside from allowing a meta data update, the locking here 1032 * Aside from allowing a meta data update, the locking here
1034 * also ensures that the vote thread on other nodes won't have 1033 * also ensures that the downconvert thread on other nodes
1035 * to concurrently downconvert the inode and the dentry locks. 1034 * won't have to concurrently downconvert the inode and the
1035 * dentry locks.
1036 */ 1036 */
1037 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1037 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
1038 if (status < 0) { 1038 if (status < 0) {
1039 if (status != -ENOENT) 1039 if (status != -ENOENT)
1040 mlog_errno(status); 1040 mlog_errno(status);
@@ -1105,9 +1105,16 @@ static int ocfs2_rename(struct inode *old_dir,
1105 goto bail; 1105 goto bail;
1106 } 1106 }
1107 1107
1108 if (!new_de && new_inode) 1108 if (!new_de && new_inode) {
1109 mlog(ML_ERROR, "inode %lu does not exist in it's parent " 1109 /*
1110 "directory!", new_inode->i_ino); 1110 * Target was unlinked by another node while we were
1111 * waiting to get to ocfs2_rename(). There isn't
1112 * anything we can do here to help the situation, so
1113 * bubble up the appropriate error.
1114 */
1115 status = -ENOENT;
1116 goto bail;
1117 }
1111 1118
1112 /* In case we need to overwrite an existing file, we blow it 1119 /* In case we need to overwrite an existing file, we blow it
1113 * away first */ 1120 * away first */
@@ -1136,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
1136 goto bail; 1143 goto bail;
1137 } 1144 }
1138 1145
1139 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1146 status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
1140 if (status < 0) { 1147 if (status < 0) {
1141 if (status != -ENOENT) 1148 if (status != -ENOENT)
1142 mlog_errno(status); 1149 mlog_errno(status);
@@ -1348,14 +1355,14 @@ bail:
1348 ocfs2_double_unlock(old_dir, new_dir); 1355 ocfs2_double_unlock(old_dir, new_dir);
1349 1356
1350 if (old_child_locked) 1357 if (old_child_locked)
1351 ocfs2_meta_unlock(old_inode, 1); 1358 ocfs2_inode_unlock(old_inode, 1);
1352 1359
1353 if (new_child_locked) 1360 if (new_child_locked)
1354 ocfs2_meta_unlock(new_inode, 1); 1361 ocfs2_inode_unlock(new_inode, 1);
1355 1362
1356 if (orphan_dir) { 1363 if (orphan_dir) {
1357 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1358 ocfs2_meta_unlock(orphan_dir, 1); 1365 ocfs2_inode_unlock(orphan_dir, 1);
1359 mutex_unlock(&orphan_dir->i_mutex); 1366 mutex_unlock(&orphan_dir->i_mutex);
1360 iput(orphan_dir); 1367 iput(orphan_dir);
1361 } 1368 }
@@ -1523,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
1523 credits = ocfs2_calc_symlink_credits(sb); 1530 credits = ocfs2_calc_symlink_credits(sb);
1524 1531
1525 /* lock the parent directory */ 1532 /* lock the parent directory */
1526 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1533 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
1527 if (status < 0) { 1534 if (status < 0) {
1528 if (status != -ENOENT) 1535 if (status != -ENOENT)
1529 mlog_errno(status); 1536 mlog_errno(status);
@@ -1650,7 +1657,7 @@ bail:
1650 if (handle) 1657 if (handle)
1651 ocfs2_commit_trans(osb, handle); 1658 ocfs2_commit_trans(osb, handle);
1652 1659
1653 ocfs2_meta_unlock(dir, 1); 1660 ocfs2_inode_unlock(dir, 1);
1654 1661
1655 if (new_fe_bh) 1662 if (new_fe_bh)
1656 brelse(new_fe_bh); 1663 brelse(new_fe_bh);
@@ -1728,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1728 1735
1729 mutex_lock(&orphan_dir_inode->i_mutex); 1736 mutex_lock(&orphan_dir_inode->i_mutex);
1730 1737
1731 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1738 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1732 if (status < 0) { 1739 if (status < 0) {
1733 mlog_errno(status); 1740 mlog_errno(status);
1734 goto leave; 1741 goto leave;
@@ -1738,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1738 orphan_dir_bh, name, 1745 orphan_dir_bh, name,
1739 OCFS2_ORPHAN_NAMELEN, de_bh); 1746 OCFS2_ORPHAN_NAMELEN, de_bh);
1740 if (status < 0) { 1747 if (status < 0) {
1741 ocfs2_meta_unlock(orphan_dir_inode, 1); 1748 ocfs2_inode_unlock(orphan_dir_inode, 1);
1742 1749
1743 mlog_errno(status); 1750 mlog_errno(status);
1744 goto leave; 1751 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
101 * about to be 101 * about to be
102 * dropped. */ 102 * dropped. */
103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
104#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
104 105
105struct ocfs2_lock_res_ops; 106struct ocfs2_lock_res_ops;
106 107
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
170 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 171 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
171 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 172 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
172 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 173 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
174 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
173}; 175};
174 176
175#define OCFS2_OSB_SOFT_RO 0x0001 177#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
189 struct ocfs2_slot_info *slot_info; 191 struct ocfs2_slot_info *slot_info;
190 192
191 spinlock_t node_map_lock; 193 spinlock_t node_map_lock;
192 struct ocfs2_node_map mounted_map;
193 struct ocfs2_node_map recovery_map; 194 struct ocfs2_node_map recovery_map;
194 struct ocfs2_node_map umount_map;
195 195
196 u64 root_blkno; 196 u64 root_blkno;
197 u64 system_dir_blkno; 197 u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
231 wait_queue_head_t checkpoint_event; 231 wait_queue_head_t checkpoint_event;
232 atomic_t needs_checkpoint; 232 atomic_t needs_checkpoint;
233 struct ocfs2_journal *journal; 233 struct ocfs2_journal *journal;
234 unsigned long osb_commit_interval;
234 235
236 int local_alloc_size;
235 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
236 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
237 u64 la_last_gd; 239 u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
254 256
255 wait_queue_head_t recovery_event; 257 wait_queue_head_t recovery_event;
256 258
257 spinlock_t vote_task_lock; 259 spinlock_t dc_task_lock;
258 struct task_struct *vote_task; 260 struct task_struct *dc_task;
259 wait_queue_head_t vote_event; 261 wait_queue_head_t dc_event;
260 unsigned long vote_wake_sequence; 262 unsigned long dc_wake_sequence;
261 unsigned long vote_work_sequence; 263 unsigned long dc_work_sequence;
262 264
265 /*
266 * Any thread can add locks to the list, but the downconvert
267 * thread is the only one allowed to remove locks. Any change
268 * to this rule requires updating
269 * ocfs2_downconvert_thread_do_work().
270 */
263 struct list_head blocked_lock_list; 271 struct list_head blocked_lock_list;
264 unsigned long blocked_lock_count; 272 unsigned long blocked_lock_count;
265 273
266 struct list_head vote_list;
267 int vote_count;
268
269 u32 net_key;
270 spinlock_t net_response_lock;
271 unsigned int net_response_ids;
272 struct list_head net_response_list;
273
274 struct o2hb_callback_func osb_hb_up;
275 struct o2hb_callback_func osb_hb_down;
276
277 struct list_head osb_net_handlers;
278
279 wait_queue_head_t osb_mount_event; 274 wait_queue_head_t osb_mount_event;
280 275
281 /* Truncate log info */ 276 /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) 231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) 232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
233 233
234/* Used to pass group descriptor data when online resize is done */
235struct ocfs2_new_group_input {
236 __u64 group; /* Group descriptor's blkno. */
237 __u32 clusters; /* Total number of clusters in this group */
238 __u32 frees; /* Total free clusters in this group */
239 __u16 chain; /* Chain for this group */
240 __u16 reserved1;
241 __u32 reserved2;
242};
243
244#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
245#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
246#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
247
234/* 248/*
235 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 249 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
236 */ 250 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
256/* Journal limits (in bytes) */ 270/* Journal limits (in bytes) */
257#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
258 272
273/*
274 * Default local alloc size (in megabytes)
275 *
276 * The value chosen should be such that most allocations, including new
277 * block groups, use local alloc.
278 */
279#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
280
259struct ocfs2_system_inode_info { 281struct ocfs2_system_inode_info {
260 char *si_name; 282 char *si_name;
261 int si_iflags; 283 int si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK,
48 OCFS2_NUM_LOCK_TYPES 49 OCFS2_NUM_LOCK_TYPES
49}; 50};
50 51
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
73 case OCFS2_LOCK_TYPE_OPEN: 74 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O'; 75 c = 'O';
75 break; 76 break;
77 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F';
79 break;
76 default: 80 default:
77 c = '\0'; 81 c = '\0';
78 } 82 }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
90 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 94 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open", 96 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
93}; 98};
94 99
95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.c
5 *
6 * volume resize.
7 * Inspired by ext3/resize.c.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "dlmglue.h"
37#include "inode.h"
38#include "journal.h"
39#include "super.h"
40#include "sysfile.h"
41#include "uptodate.h"
42
43#include "buffer_head_io.h"
44#include "suballoc.h"
45#include "resize.h"
46
47/*
48 * Check whether there are new backup superblocks exist
49 * in the last group. If there are some, mark them or clear
50 * them in the bitmap.
51 *
52 * Return how many backups we find in the last group.
53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg,
59 int set)
60{
61 int i;
62 u16 backups = 0;
63 u32 cluster;
64 u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
65
66 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
67 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
68 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
69
70 gd_blkno = ocfs2_which_cluster_group(inode, cluster);
71 if (gd_blkno < lgd_blkno)
72 continue;
73 else if (gd_blkno > lgd_blkno)
74 break;
75
76 if (set)
77 ocfs2_set_bit(cluster % cl_cpg,
78 (unsigned long *)gd->bg_bitmap);
79 else
80 ocfs2_clear_bit(cluster % cl_cpg,
81 (unsigned long *)gd->bg_bitmap);
82 backups++;
83 }
84
85 mlog_exit_void();
86 return backups;
87}
88
89static int ocfs2_update_last_group_and_inode(handle_t *handle,
90 struct inode *bm_inode,
91 struct buffer_head *bm_bh,
92 struct buffer_head *group_bh,
93 u32 first_new_cluster,
94 int new_clusters)
95{
96 int ret = 0;
97 struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
98 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
99 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
100 struct ocfs2_chain_rec *cr;
101 struct ocfs2_group_desc *group;
102 u16 chain, num_bits, backups = 0;
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster);
108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) {
112 mlog_errno(ret);
113 goto out;
114 }
115
116 group = (struct ocfs2_group_desc *)group_bh->b_data;
117
118 /* update the group first. */
119 num_bits = new_clusters * cl_bpc;
120 le16_add_cpu(&group->bg_bits, num_bits);
121 le16_add_cpu(&group->bg_free_bits_count, num_bits);
122
123 /*
124 * check whether there are some new backup superblocks exist in
125 * this group and update the group bitmap accordingly.
126 */
127 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
128 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
129 backups = ocfs2_calc_new_backup_super(bm_inode,
130 group,
131 new_clusters,
132 first_new_cluster,
133 cl_cpg, 1);
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 }
136
137 ret = ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142
143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out_rollback;
149 }
150
151 chain = le16_to_cpu(group->bg_chain);
152 cr = (&cl->cl_recs[chain]);
153 le32_add_cpu(&cr->c_total, num_bits);
154 le32_add_cpu(&cr->c_free, num_bits);
155 le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
156 le32_add_cpu(&fe->i_clusters, new_clusters);
157
158 if (backups) {
159 le32_add_cpu(&cr->c_free, -1 * backups);
160 le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
161 }
162
163 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
164 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
165 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
166 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
167 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
168
169 ocfs2_journal_dirty(handle, bm_bh);
170
171out_rollback:
172 if (ret < 0) {
173 ocfs2_calc_new_backup_super(bm_inode,
174 group,
175 new_clusters,
176 first_new_cluster,
177 cl_cpg, 0);
178 le16_add_cpu(&group->bg_free_bits_count, backups);
179 le16_add_cpu(&group->bg_bits, -1 * num_bits);
180 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
181 }
182out:
183 mlog_exit(ret);
184 return ret;
185}
186
187static int update_backups(struct inode * inode, u32 clusters, char *data)
188{
189 int i, ret = 0;
190 u32 cluster;
191 u64 blkno;
192 struct buffer_head *backup = NULL;
193 struct ocfs2_dinode *backup_di = NULL;
194 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
195
196 /* calculate the real backups we need to update. */
197 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
198 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
199 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
200 if (cluster > clusters)
201 break;
202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
204 if (ret < 0) {
205 mlog_errno(ret);
206 break;
207 }
208
209 memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
210
211 backup_di = (struct ocfs2_dinode *)backup->b_data;
212 backup_di->i_blkno = cpu_to_le64(blkno);
213
214 ret = ocfs2_write_super_or_backup(osb, backup);
215 brelse(backup);
216 backup = NULL;
217 if (ret < 0) {
218 mlog_errno(ret);
219 break;
220 }
221 }
222
223 return ret;
224}
225
226static void ocfs2_update_super_and_backups(struct inode *inode,
227 int new_clusters)
228{
229 int ret;
230 u32 clusters = 0;
231 struct buffer_head *super_bh = NULL;
232 struct ocfs2_dinode *super_di = NULL;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234
235 /*
236 * update the superblock last.
237 * It doesn't matter if the write failed.
238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
240 &super_bh, 0, NULL);
241 if (ret < 0) {
242 mlog_errno(ret);
243 goto out;
244 }
245
246 super_di = (struct ocfs2_dinode *)super_bh->b_data;
247 le32_add_cpu(&super_di->i_clusters, new_clusters);
248 clusters = le32_to_cpu(super_di->i_clusters);
249
250 ret = ocfs2_write_super_or_backup(osb, super_bh);
251 if (ret < 0) {
252 mlog_errno(ret);
253 goto out;
254 }
255
256 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
257 ret = update_backups(inode, clusters, super_bh->b_data);
258
259out:
260 brelse(super_bh);
261 if (ret)
262 printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
263 " during fs resize. This condition is not fatal,"
264 " but fsck.ocfs2 should be run to fix it\n",
265 osb->dev_str);
266 return;
267}
268
269/*
270 * Extend the filesystem to the new number of clusters specified. This entry
271 * point is only used to extend the current filesystem to the end of the last
272 * existing group.
273 */
274int ocfs2_group_extend(struct inode * inode, int new_clusters)
275{
276 int ret;
277 handle_t *handle;
278 struct buffer_head *main_bm_bh = NULL;
279 struct buffer_head *group_bh = NULL;
280 struct inode *main_bm_inode = NULL;
281 struct ocfs2_dinode *fe = NULL;
282 struct ocfs2_group_desc *group = NULL;
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 u16 cl_bpc;
285 u32 first_new_cluster;
286 u64 lgd_blkno;
287
288 mlog_entry_void();
289
290 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
291 return -EROFS;
292
293 if (new_clusters < 0)
294 return -EINVAL;
295 else if (new_clusters == 0)
296 return 0;
297
298 main_bm_inode = ocfs2_get_system_file_inode(osb,
299 GLOBAL_BITMAP_SYSTEM_INODE,
300 OCFS2_INVALID_SLOT);
301 if (!main_bm_inode) {
302 ret = -EINVAL;
303 mlog_errno(ret);
304 goto out;
305 }
306
307 mutex_lock(&main_bm_inode->i_mutex);
308
309 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
310 if (ret < 0) {
311 mlog_errno(ret);
312 goto out_mutex;
313 }
314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. "
320 "Force to do offline resize.");
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1);
334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
336 main_bm_inode);
337 if (ret < 0) {
338 mlog_errno(ret);
339 goto out_unlock;
340 }
341
342 group = (struct ocfs2_group_desc *)group_bh->b_data;
343
344 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
345 if (ret) {
346 mlog_errno(ret);
347 goto out_unlock;
348 }
349
350 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
351 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
352 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
353 ret = -EINVAL;
354 goto out_unlock;
355 }
356
357 mlog(0, "extend the last group at %llu, new clusters = %d\n",
358 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
359
360 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
361 if (IS_ERR(handle)) {
362 mlog_errno(PTR_ERR(handle));
363 ret = -EINVAL;
364 goto out_unlock;
365 }
366
367 /* update the last group descriptor and inode. */
368 ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
369 main_bm_bh, group_bh,
370 first_new_cluster,
371 new_clusters);
372 if (ret) {
373 mlog_errno(ret);
374 goto out_commit;
375 }
376
377 ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
378
379out_commit:
380 ocfs2_commit_trans(osb, handle);
381out_unlock:
382 brelse(group_bh);
383 brelse(main_bm_bh);
384
385 ocfs2_inode_unlock(main_bm_inode, 1);
386
387out_mutex:
388 mutex_unlock(&main_bm_inode->i_mutex);
389 iput(main_bm_inode);
390
391out:
392 mlog_exit_void();
393 return ret;
394}
395
396static int ocfs2_check_new_group(struct inode *inode,
397 struct ocfs2_dinode *di,
398 struct ocfs2_new_group_input *input,
399 struct buffer_head *group_bh)
400{
401 int ret;
402 struct ocfs2_group_desc *gd;
403 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
404 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
405 le16_to_cpu(di->id2.i_chain.cl_bpc);
406
407
408 gd = (struct ocfs2_group_desc *)group_bh->b_data;
409
410 ret = -EIO;
411 if (!OCFS2_IS_VALID_GROUP_DESC(gd))
412 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
413 (unsigned long long)le64_to_cpu(gd->bg_blkno));
414 else if (di->i_blkno != gd->bg_parent_dinode)
415 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
416 "pointer (%llu, expected %llu)\n",
417 (unsigned long long)le64_to_cpu(gd->bg_blkno),
418 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
419 (unsigned long long)le64_to_cpu(di->i_blkno));
420 else if (le16_to_cpu(gd->bg_bits) > max_bits)
421 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
422 (unsigned long long)le64_to_cpu(gd->bg_blkno),
423 le16_to_cpu(gd->bg_bits));
424 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
425 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
426 "claims that %u are free\n",
427 (unsigned long long)le64_to_cpu(gd->bg_blkno),
428 le16_to_cpu(gd->bg_bits),
429 le16_to_cpu(gd->bg_free_bits_count));
430 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
431 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
432 "max bitmap bits of %u\n",
433 (unsigned long long)le64_to_cpu(gd->bg_blkno),
434 le16_to_cpu(gd->bg_bits),
435 8 * le16_to_cpu(gd->bg_size));
436 else if (le16_to_cpu(gd->bg_chain) != input->chain)
437 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
438 "while input has %u set.\n",
439 (unsigned long long)le64_to_cpu(gd->bg_blkno),
440 le16_to_cpu(gd->bg_chain), input->chain);
441 else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
442 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
443 "input has %u clusters set\n",
444 (unsigned long long)le64_to_cpu(gd->bg_blkno),
445 le16_to_cpu(gd->bg_bits), input->clusters);
446 else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
447 mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
448 "but it should have %u set\n",
449 (unsigned long long)le64_to_cpu(gd->bg_blkno),
450 le16_to_cpu(gd->bg_bits),
451 input->frees * cl_bpc);
452 else
453 ret = 0;
454
455 return ret;
456}
457
458static int ocfs2_verify_group_and_input(struct inode *inode,
459 struct ocfs2_dinode *di,
460 struct ocfs2_new_group_input *input,
461 struct buffer_head *group_bh)
462{
463 u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
464 u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
465 u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
466 u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
467 u32 total_clusters = le32_to_cpu(di->i_clusters);
468 int ret = -EINVAL;
469
470 if (cluster < total_clusters)
471 mlog(ML_ERROR, "add a group which is in the current volume.\n");
472 else if (input->chain >= cl_count)
473 mlog(ML_ERROR, "input chain exceeds the limit.\n");
474 else if (next_free != cl_count && next_free != input->chain)
475 mlog(ML_ERROR,
476 "the add group should be in chain %u\n", next_free);
477 else if (total_clusters + input->clusters < total_clusters)
478 mlog(ML_ERROR, "add group's clusters overflow.\n");
479 else if (input->clusters > cl_cpg)
480 mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
481 else if (input->frees > input->clusters)
482 mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
483 else if (total_clusters % cl_cpg != 0)
484 mlog(ML_ERROR,
485 "the last group isn't full. Use group extend first.\n");
486 else if (input->group != ocfs2_which_cluster_group(inode, cluster))
487 mlog(ML_ERROR, "group blkno is invalid\n");
488 else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
489 mlog(ML_ERROR, "group descriptor check failed.\n");
490 else
491 ret = 0;
492
493 return ret;
494}
495
496/* Add a new group descriptor to global_bitmap. */
497int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
498{
499 int ret;
500 handle_t *handle;
501 struct buffer_head *main_bm_bh = NULL;
502 struct inode *main_bm_inode = NULL;
503 struct ocfs2_dinode *fe = NULL;
504 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
505 struct buffer_head *group_bh = NULL;
506 struct ocfs2_group_desc *group = NULL;
507 struct ocfs2_chain_list *cl;
508 struct ocfs2_chain_rec *cr;
509 u16 cl_bpc;
510
511 mlog_entry_void();
512
513 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
514 return -EROFS;
515
516 main_bm_inode = ocfs2_get_system_file_inode(osb,
517 GLOBAL_BITMAP_SYSTEM_INODE,
518 OCFS2_INVALID_SLOT);
519 if (!main_bm_inode) {
520 ret = -EINVAL;
521 mlog_errno(ret);
522 goto out;
523 }
524
525 mutex_lock(&main_bm_inode->i_mutex);
526
527 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
528 if (ret < 0) {
529 mlog_errno(ret);
530 goto out_mutex;
531 }
532
533 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
534
535 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
536 ocfs2_group_bitmap_size(osb->sb) * 8) {
537 mlog(ML_ERROR, "The disk is too old and small."
538 " Force to do offline resize.");
539 ret = -EINVAL;
540 goto out_unlock;
541 }
542
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
544 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group);
547 goto out_unlock;
548 }
549
550 ocfs2_set_new_buffer_uptodate(inode, group_bh);
551
552 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
553 if (ret) {
554 mlog_errno(ret);
555 goto out_unlock;
556 }
557
558 mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
559 (unsigned long long)input->group, input->chain, input->clusters);
560
561 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
562 if (IS_ERR(handle)) {
563 mlog_errno(PTR_ERR(handle));
564 ret = -EINVAL;
565 goto out_unlock;
566 }
567
568 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
569 cl = &fe->id2.i_chain;
570 cr = &cl->cl_recs[input->chain];
571
572 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
573 OCFS2_JOURNAL_ACCESS_WRITE);
574 if (ret < 0) {
575 mlog_errno(ret);
576 goto out_commit;
577 }
578
579 group = (struct ocfs2_group_desc *)group_bh->b_data;
580 group->bg_next_group = cr->c_blkno;
581
582 ret = ocfs2_journal_dirty(handle, group_bh);
583 if (ret < 0) {
584 mlog_errno(ret);
585 goto out_commit;
586 }
587
588 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
589 OCFS2_JOURNAL_ACCESS_WRITE);
590 if (ret < 0) {
591 mlog_errno(ret);
592 goto out_commit;
593 }
594
595 if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
596 le16_add_cpu(&cl->cl_next_free_rec, 1);
597 memset(cr, 0, sizeof(struct ocfs2_chain_rec));
598 }
599
600 cr->c_blkno = le64_to_cpu(input->group);
601 le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
602 le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
603
604 le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
605 le32_add_cpu(&fe->id1.bitmap1.i_used,
606 (input->clusters - input->frees) * cl_bpc);
607 le32_add_cpu(&fe->i_clusters, input->clusters);
608
609 ocfs2_journal_dirty(handle, main_bm_bh);
610
611 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
612 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
613 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
614 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
615 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
616
617 ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
618
619out_commit:
620 ocfs2_commit_trans(osb, handle);
621out_unlock:
622 brelse(group_bh);
623 brelse(main_bm_bh);
624
625 ocfs2_inode_unlock(main_bm_inode, 1);
626
627out_mutex:
628 mutex_unlock(&main_bm_inode->i_mutex);
629 iput(main_bm_inode);
630
631out:
632 mlog_exit_void();
633 return ret;
634}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
#ifndef OCFS2_RESIZE_H
#define OCFS2_RESIZE_H

/*
 * Only pointers to these types appear below.  Declaring the tags here keeps
 * the prototypes well-formed whatever order the including file pulls its
 * headers in.
 */
struct inode;
struct ocfs2_new_group_input;

/* Grow the last cluster group of the volume by new_clusters clusters. */
int ocfs2_group_extend(struct inode * inode, int new_clusters);

/* Link the already-written group descriptor described by input into the
 * global bitmap. */
int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);

#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
48 s16 slot_num, 48 s16 slot_num,
49 s16 node_num); 49 s16 node_num);
50 50
51/* Use the slot information we've collected to create a map of mounted
52 * nodes. Should be holding an EX on super block. assumes slot info is
53 * up to date. Note that we call this *after* we find a slot, so our
54 * own node should be set in the map too... */
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
56{
57 int i;
58 struct ocfs2_slot_info *si = osb->slot_info;
59
60 spin_lock(&si->si_lock);
61
62 for (i = 0; i < si->si_size; i++)
63 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
64 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
65 si->si_global_node_nums[i]);
66
67 spin_unlock(&si->si_lock);
68}
69
70/* post the slot information on disk into our slot_info struct. */ 51/* post the slot information on disk into our slot_info struct. */
71void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
72{ 53{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
52void ocfs2_clear_slot(struct ocfs2_slot_info *si, 52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num); 53 s16 slot_num);
54 54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num) 56 int slot_num)
59{ 57{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 u64 bg_blkno, 102 u64 bg_blkno,
103 u16 bg_bit_off); 103 u16 bg_bit_off);
104static inline u64 ocfs2_which_cluster_group(struct inode *inode,
105 u32 cluster);
106static inline void ocfs2_block_to_cluster_group(struct inode *inode, 104static inline void ocfs2_block_to_cluster_group(struct inode *inode,
107 u64 data_blkno, 105 u64 data_blkno,
108 u64 *bg_blkno, 106 u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
114 112
115 if (inode) { 113 if (inode) {
116 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 114 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
117 ocfs2_meta_unlock(inode, 1); 115 ocfs2_inode_unlock(inode, 1);
118 116
119 mutex_unlock(&inode->i_mutex); 117 mutex_unlock(&inode->i_mutex);
120 118
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131} 129}
132 130
133/* somewhat more expensive than our other checks, so use sparingly. */ 131/* somewhat more expensive than our other checks, so use sparingly. */
134static int ocfs2_check_group_descriptor(struct super_block *sb, 132int ocfs2_check_group_descriptor(struct super_block *sb,
135 struct ocfs2_dinode *di, 133 struct ocfs2_dinode *di,
136 struct ocfs2_group_desc *gd) 134 struct ocfs2_group_desc *gd)
137{ 135{
138 unsigned int max_bits; 136 unsigned int max_bits;
139 137
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
412 410
413 mutex_lock(&alloc_inode->i_mutex); 411 mutex_lock(&alloc_inode->i_mutex);
414 412
415 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 413 status = ocfs2_inode_lock(alloc_inode, &bh, 1);
416 if (status < 0) { 414 if (status < 0) {
417 mutex_unlock(&alloc_inode->i_mutex); 415 mutex_unlock(&alloc_inode->i_mutex);
418 iput(alloc_inode); 416 iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1443 1441
1444/* given a cluster offset, calculate which block group it belongs to 1442/* given a cluster offset, calculate which block group it belongs to
1445 * and return that block offset. */ 1443 * and return that block offset. */
1446static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1444u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1447 u32 cluster)
1448{ 1445{
1449 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1450 u32 group_no; 1447 u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1519 if (min_clusters > (osb->bitmap_cpg - 1)) { 1516 if (min_clusters > (osb->bitmap_cpg - 1)) {
1520 /* The only paths asking for contiguousness 1517 /* The only paths asking for contiguousness
1521 * should know about this already. */ 1518 * should know about this already. */
1522 mlog(ML_ERROR, "minimum allocation requested exceeds " 1519 mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1523 "group bitmap size!"); 1520 "group bitmap size %u!\n", min_clusters,
1521 osb->bitmap_cpg);
1524 status = -ENOSPC; 1522 status = -ENOSPC;
1525 goto bail; 1523 goto bail;
1526 } 1524 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
148 struct ocfs2_alloc_context *ac); 148 struct ocfs2_alloc_context *ac);
149 149
150/* given a cluster offset, calculate which block group it belongs to
151 * and return that block offset. */
152u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
153
154/* somewhat more expensive than our other checks, so use sparingly. */
155int ocfs2_check_group_descriptor(struct super_block *sb,
156 struct ocfs2_dinode *di,
157 struct ocfs2_group_desc *gd);
150#endif /* _CHAINALLOC_H_ */ 158#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index be562ac3e89c..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
65#include "sysfile.h" 65#include "sysfile.h"
66#include "uptodate.h" 66#include "uptodate.h"
67#include "ver.h" 67#include "ver.h"
68#include "vote.h"
69 68
70#include "buffer_head_io.h" 69#include "buffer_head_io.h"
71 70
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
84 83
85struct mount_options 84struct mount_options
86{ 85{
86 unsigned long commit_interval;
87 unsigned long mount_opt; 87 unsigned long mount_opt;
88 unsigned int atime_quantum; 88 unsigned int atime_quantum;
89 signed short slot; 89 signed short slot;
90 unsigned int localalloc_opt;
90}; 91};
91 92
92static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
150 Opt_data_writeback, 151 Opt_data_writeback,
151 Opt_atime_quantum, 152 Opt_atime_quantum,
152 Opt_slot, 153 Opt_slot,
154 Opt_commit,
155 Opt_localalloc,
156 Opt_localflocks,
153 Opt_err, 157 Opt_err,
154}; 158};
155 159
@@ -165,6 +169,9 @@ static match_table_t tokens = {
165 {Opt_data_writeback, "data=writeback"}, 169 {Opt_data_writeback, "data=writeback"},
166 {Opt_atime_quantum, "atime_quantum=%u"}, 170 {Opt_atime_quantum, "atime_quantum=%u"},
167 {Opt_slot, "preferred_slot=%u"}, 171 {Opt_slot, "preferred_slot=%u"},
172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"},
168 {Opt_err, NULL} 175 {Opt_err, NULL}
169}; 176};
170 177
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
213 220
214 mlog_entry_void(); 221 mlog_entry_void();
215 222
216 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 223 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
217 if (IS_ERR(new)) { 224 if (IS_ERR(new)) {
218 status = PTR_ERR(new); 225 status = PTR_ERR(new);
219 mlog_errno(status); 226 mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
221 } 228 }
222 osb->root_inode = new; 229 osb->root_inode = new;
223 230
224 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 231 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
225 if (IS_ERR(new)) { 232 if (IS_ERR(new)) {
226 status = PTR_ERR(new); 233 status = PTR_ERR(new);
227 mlog_errno(status); 234 mlog_errno(status);
@@ -438,14 +445,16 @@ unlock_osb:
438 } 445 }
439 446
440 if (!ret) { 447 if (!ret) {
441 if (!ocfs2_is_hard_readonly(osb))
442 ocfs2_set_journal_params(osb);
443
444 /* Only save off the new mount options in case of a successful 448 /* Only save off the new mount options in case of a successful
445 * remount. */ 449 * remount. */
446 osb->s_mount_opt = parsed_options.mount_opt; 450 osb->s_mount_opt = parsed_options.mount_opt;
447 osb->s_atime_quantum = parsed_options.atime_quantum; 451 osb->s_atime_quantum = parsed_options.atime_quantum;
448 osb->preferred_slot = parsed_options.slot; 452 osb->preferred_slot = parsed_options.slot;
453 if (parsed_options.commit_interval)
454 osb->osb_commit_interval = parsed_options.commit_interval;
455
456 if (!ocfs2_is_hard_readonly(osb))
457 ocfs2_set_journal_params(osb);
449 } 458 }
450out: 459out:
451 return ret; 460 return ret;
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
597 osb->s_mount_opt = parsed_options.mount_opt; 606 osb->s_mount_opt = parsed_options.mount_opt;
598 osb->s_atime_quantum = parsed_options.atime_quantum; 607 osb->s_atime_quantum = parsed_options.atime_quantum;
599 osb->preferred_slot = parsed_options.slot; 608 osb->preferred_slot = parsed_options.slot;
609 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt;
600 611
601 sb->s_magic = OCFS2_SUPER_MAGIC; 612 sb->s_magic = OCFS2_SUPER_MAGIC;
602 613
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
747 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 758 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
748 options ? options : "(none)"); 759 options ? options : "(none)");
749 760
761 mopt->commit_interval = 0;
750 mopt->mount_opt = 0; 762 mopt->mount_opt = 0;
751 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
752 mopt->slot = OCFS2_INVALID_SLOT; 764 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
753 766
754 if (!options) { 767 if (!options) {
755 status = 1; 768 status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
816 if (option) 829 if (option)
817 mopt->slot = (s16)option; 830 mopt->slot = (s16)option;
818 break; 831 break;
832 case Opt_commit:
833 option = 0;
834 if (match_int(&args[0], &option)) {
835 status = 0;
836 goto bail;
837 }
838 if (option < 0)
839 return 0;
840 if (option == 0)
841 option = JBD_DEFAULT_MAX_COMMIT_AGE;
842 mopt->commit_interval = HZ * option;
843 break;
844 case Opt_localalloc:
845 option = 0;
846 if (match_int(&args[0], &option)) {
847 status = 0;
848 goto bail;
849 }
850 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
851 mopt->localalloc_opt = option;
852 break;
853 case Opt_localflocks:
854 /*
855 * Changing this during remount could race
856 * flock() requests, or "unbalance" existing
857 * ones (e.g., a lock is taken in one mode but
858 * dropped in the other). If users care enough
859 * to flip locking modes during remount, we
860 * could add a "local" flag to individual
861 * flock structures for proper tracking of
862 * state.
863 */
864 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break;
819 default: 867 default:
820 mlog(ML_ERROR, 868 mlog(ML_ERROR,
821 "Unrecognized mount option \"%s\" " 869 "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 912 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 913 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866 914
915 if (osb->osb_commit_interval)
916 seq_printf(s, ",commit=%u",
917 (unsigned) (osb->osb_commit_interval / HZ));
918
919 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
920 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
921
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,");
924
867 return 0; 925 return 0;
868} 926}
869 927
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
965 goto bail; 1023 goto bail;
966 } 1024 }
967 1025
968 status = ocfs2_meta_lock(inode, &bh, 0); 1026 status = ocfs2_inode_lock(inode, &bh, 0);
969 if (status < 0) { 1027 if (status < 0) {
970 mlog_errno(status); 1028 mlog_errno(status);
971 goto bail; 1029 goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
989 1047
990 brelse(bh); 1048 brelse(bh);
991 1049
992 ocfs2_meta_unlock(inode, 0); 1050 ocfs2_inode_unlock(inode, 0);
993 status = 0; 1051 status = 0;
994bail: 1052bail:
995 if (inode) 1053 if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
1020 oi->ip_clusters = 0; 1078 oi->ip_clusters = 0;
1021 1079
1022 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1080 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1023 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 1081 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1024 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
1025 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1082 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1026 1083
1027 ocfs2_metadata_cache_init(&oi->vfs_inode); 1084 ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
1117 goto leave; 1174 goto leave;
1118 } 1175 }
1119 1176
1120 status = ocfs2_register_hb_callbacks(osb);
1121 if (status < 0) {
1122 mlog_errno(status);
1123 goto leave;
1124 }
1125
1126 status = ocfs2_dlm_init(osb); 1177 status = ocfs2_dlm_init(osb);
1127 if (status < 0) { 1178 if (status < 0) {
1128 mlog_errno(status); 1179 mlog_errno(status);
1129 goto leave; 1180 goto leave;
1130 } 1181 }
1131 1182
1132 /* requires vote_thread to be running. */
1133 status = ocfs2_register_net_handlers(osb);
1134 if (status < 0) {
1135 mlog_errno(status);
1136 goto leave;
1137 }
1138
1139 status = ocfs2_super_lock(osb, 1); 1183 status = ocfs2_super_lock(osb, 1);
1140 if (status < 0) { 1184 if (status < 0) {
1141 mlog_errno(status); 1185 mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1150 goto leave; 1194 goto leave;
1151 } 1195 }
1152 1196
1153 ocfs2_populate_mounted_map(osb);
1154
1155 /* load all node-local system inodes */ 1197 /* load all node-local system inodes */
1156 status = ocfs2_init_local_system_inodes(osb); 1198 status = ocfs2_init_local_system_inodes(osb);
1157 if (status < 0) { 1199 if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1174 if (ocfs2_mount_local(osb)) 1216 if (ocfs2_mount_local(osb))
1175 goto leave; 1217 goto leave;
1176 1218
1177 /* This should be sent *after* we recovered our journal as it
1178 * will cause other nodes to unmark us as needing
1179 * recovery. However, we need to send it *before* dropping the
1180 * super block lock as otherwise their recovery threads might
1181 * try to clean us up while we're live! */
1182 status = ocfs2_request_mount_vote(osb);
1183 if (status < 0)
1184 mlog_errno(status);
1185
1186leave: 1219leave:
1187 if (unlock_super) 1220 if (unlock_super)
1188 ocfs2_super_unlock(osb, 1); 1221 ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1240 mlog_errno(tmp); 1273 mlog_errno(tmp);
1241 return; 1274 return;
1242 } 1275 }
1243
1244 tmp = ocfs2_request_umount_vote(osb);
1245 if (tmp < 0)
1246 mlog_errno(tmp);
1247 } 1276 }
1248 1277
1249 if (osb->slot_num != OCFS2_INVALID_SLOT) 1278 if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1254 1283
1255 ocfs2_release_system_inodes(osb); 1284 ocfs2_release_system_inodes(osb);
1256 1285
1257 if (osb->dlm) { 1286 if (osb->dlm)
1258 ocfs2_unregister_net_handlers(osb);
1259
1260 ocfs2_dlm_shutdown(osb); 1287 ocfs2_dlm_shutdown(osb);
1261 }
1262
1263 ocfs2_clear_hb_callbacks(osb);
1264 1288
1265 debugfs_remove(osb->osb_debug_root); 1289 debugfs_remove(osb->osb_debug_root);
1266 1290
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1315 int i, cbits, bbits; 1339 int i, cbits, bbits;
1316 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1317 struct inode *inode = NULL; 1341 struct inode *inode = NULL;
1318 struct buffer_head *bitmap_bh = NULL;
1319 struct ocfs2_journal *journal; 1342 struct ocfs2_journal *journal;
1320 __le32 uuid_net_key; 1343 __le32 uuid_net_key;
1321 struct ocfs2_super *osb; 1344 struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1344 osb->s_sectsize_bits = blksize_bits(sector_size); 1367 osb->s_sectsize_bits = blksize_bits(sector_size);
1345 BUG_ON(!osb->s_sectsize_bits); 1368 BUG_ON(!osb->s_sectsize_bits);
1346 1369
1347 osb->net_response_ids = 0;
1348 spin_lock_init(&osb->net_response_lock);
1349 INIT_LIST_HEAD(&osb->net_response_list);
1350
1351 INIT_LIST_HEAD(&osb->osb_net_handlers);
1352 init_waitqueue_head(&osb->recovery_event); 1370 init_waitqueue_head(&osb->recovery_event);
1353 spin_lock_init(&osb->vote_task_lock); 1371 spin_lock_init(&osb->dc_task_lock);
1354 init_waitqueue_head(&osb->vote_event); 1372 init_waitqueue_head(&osb->dc_event);
1355 osb->vote_work_sequence = 0; 1373 osb->dc_work_sequence = 0;
1356 osb->vote_wake_sequence = 0; 1374 osb->dc_wake_sequence = 0;
1357 INIT_LIST_HEAD(&osb->blocked_lock_list); 1375 INIT_LIST_HEAD(&osb->blocked_lock_list);
1358 osb->blocked_lock_count = 0; 1376 osb->blocked_lock_count = 0;
1359 INIT_LIST_HEAD(&osb->vote_list);
1360 spin_lock_init(&osb->osb_lock); 1377 spin_lock_init(&osb->osb_lock);
1361 1378
1362 atomic_set(&osb->alloc_stats.moves, 0); 1379 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1496 } 1513 }
1497 1514
1498 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1515 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
1499 osb->net_key = le32_to_cpu(uuid_net_key);
1500 1516
1501 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 1517 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1502 osb->vol_label[63] = '\0'; 1518 osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1539 } 1555 }
1540 1556
1541 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1557 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1542
1543 /* We don't have a cluster lock on the bitmap here because
1544 * we're only interested in static information and the extra
1545 * complexity at mount time isn't worht it. Don't pass the
1546 * inode in to the read function though as we don't want it to
1547 * be put in the cache. */
1548 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1549 NULL);
1550 iput(inode); 1558 iput(inode);
1551 if (status < 0) {
1552 mlog_errno(status);
1553 goto bail;
1554 }
1555 1559
1556 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1560 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
1557 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1558 brelse(bitmap_bh);
1559 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1560 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
1561 1561
1562 status = ocfs2_init_slot_info(osb); 1562 status = ocfs2_init_slot_info(osb);
1563 if (status < 0) { 1563 if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
112 goto bail; 112 goto bail;
113 } 113 }
114 114
115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); 115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
116 if (IS_ERR(inode)) { 116 if (IS_ERR(inode)) {
117 mlog_errno(PTR_ERR(inode)); 117 mlog_errno(PTR_ERR(inode));
118 inode = NULL; 118 inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
29 29
30#include "ver.h" 30#include "ver.h"
31 31
32#define OCFS2_BUILD_VERSION "1.3.3" 32#define OCFS2_BUILD_VERSION "1.5.0"
33 33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35 35
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * description here
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/kthread.h>
30
31#include <cluster/heartbeat.h>
32#include <cluster/nodemanager.h>
33#include <cluster/tcp.h>
34
35#include <dlm/dlmapi.h>
36
37#define MLOG_MASK_PREFIX ML_VOTE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
41
42#include "alloc.h"
43#include "dlmglue.h"
44#include "extent_map.h"
45#include "heartbeat.h"
46#include "inode.h"
47#include "journal.h"
48#include "slot_map.h"
49#include "vote.h"
50
51#include "buffer_head_io.h"
52
53#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
54#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
55struct ocfs2_msg_hdr
56{
57 __be32 h_response_id; /* used to lookup message handle on sending
58 * node. */
59 __be32 h_request;
60 __be64 h_blkno;
61 __be32 h_generation;
62 __be32 h_node_num; /* node sending this particular message. */
63};
64
65struct ocfs2_vote_msg
66{
67 struct ocfs2_msg_hdr v_hdr;
68 __be32 v_reserved1;
69} __attribute__ ((packed));
70
71/* Responses are given these values to maintain backwards
72 * compatibility with older ocfs2 versions */
73#define OCFS2_RESPONSE_OK (0)
74#define OCFS2_RESPONSE_BUSY (-16)
75#define OCFS2_RESPONSE_BAD_MSG (-22)
76
77struct ocfs2_response_msg
78{
79 struct ocfs2_msg_hdr r_hdr;
80 __be32 r_response;
81} __attribute__ ((packed));
82
83struct ocfs2_vote_work {
84 struct list_head w_list;
85 struct ocfs2_vote_msg w_msg;
86};
87
88enum ocfs2_vote_request {
89 OCFS2_VOTE_REQ_INVALID = 0,
90 OCFS2_VOTE_REQ_MOUNT,
91 OCFS2_VOTE_REQ_UMOUNT,
92 OCFS2_VOTE_REQ_LAST
93};
94
95static inline int ocfs2_is_valid_vote_request(int request)
96{
97 return OCFS2_VOTE_REQ_INVALID < request &&
98 request < OCFS2_VOTE_REQ_LAST;
99}
100
101typedef void (*ocfs2_net_response_callback)(void *priv,
102 struct ocfs2_response_msg *resp);
103struct ocfs2_net_response_cb {
104 ocfs2_net_response_callback rc_cb;
105 void *rc_priv;
106};
107
108struct ocfs2_net_wait_ctxt {
109 struct list_head n_list;
110 u32 n_response_id;
111 wait_queue_head_t n_event;
112 struct ocfs2_node_map n_node_map;
113 int n_response; /* an agreggate response. 0 if
114 * all nodes are go, < 0 on any
115 * negative response from any
116 * node or network error. */
117 struct ocfs2_net_response_cb *n_callback;
118};
119
120static void ocfs2_process_mount_request(struct ocfs2_super *osb,
121 unsigned int node_num)
122{
123 mlog(0, "MOUNT vote from node %u\n", node_num);
124 /* The other node only sends us this message when he has an EX
125 * on the superblock, so our recovery threads (if having been
126 * launched) are waiting on it.*/
127 ocfs2_recovery_map_clear(osb, node_num);
128 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
129
130 /* We clear the umount map here because a node may have been
131 * previously mounted, safely unmounted but never stopped
132 * heartbeating - in which case we'd have a stale entry. */
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136static void ocfs2_process_umount_request(struct ocfs2_super *osb,
137 unsigned int node_num)
138{
139 mlog(0, "UMOUNT vote from node %u\n", node_num);
140 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
141 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
142}
143
144static void ocfs2_process_vote(struct ocfs2_super *osb,
145 struct ocfs2_vote_msg *msg)
146{
147 int net_status, vote_response;
148 unsigned int node_num;
149 u64 blkno;
150 enum ocfs2_vote_request request;
151 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
152 struct ocfs2_response_msg response;
153
154 /* decode the network mumbo jumbo into local variables. */
155 request = be32_to_cpu(hdr->h_request);
156 blkno = be64_to_cpu(hdr->h_blkno);
157 node_num = be32_to_cpu(hdr->h_node_num);
158
159 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
160 request, (unsigned long long)blkno, node_num);
161
162 if (!ocfs2_is_valid_vote_request(request)) {
163 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
164 request, node_num);
165 vote_response = OCFS2_RESPONSE_BAD_MSG;
166 goto respond;
167 }
168
169 vote_response = OCFS2_RESPONSE_OK;
170
171 switch (request) {
172 case OCFS2_VOTE_REQ_UMOUNT:
173 ocfs2_process_umount_request(osb, node_num);
174 goto respond;
175 case OCFS2_VOTE_REQ_MOUNT:
176 ocfs2_process_mount_request(osb, node_num);
177 goto respond;
178 default:
179 /* avoids a gcc warning */
180 break;
181 }
182
183respond:
184 /* Response struture is small so we just put it on the stack
185 * and stuff it inline. */
186 memset(&response, 0, sizeof(struct ocfs2_response_msg));
187 response.r_hdr.h_response_id = hdr->h_response_id;
188 response.r_hdr.h_blkno = hdr->h_blkno;
189 response.r_hdr.h_generation = hdr->h_generation;
190 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
191 response.r_response = cpu_to_be32(vote_response);
192
193 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
194 osb->net_key,
195 &response,
196 sizeof(struct ocfs2_response_msg),
197 node_num,
198 NULL);
199 /* We still want to error print for ENOPROTOOPT here. The
200 * sending node shouldn't have unregistered his net handler
201 * without sending an unmount vote 1st */
202 if (net_status < 0
203 && net_status != -ETIMEDOUT
204 && net_status != -ENOTCONN)
205 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
206 node_num, net_status);
207}
208
209static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
210{
211 unsigned long processed;
212 struct ocfs2_lock_res *lockres;
213 struct ocfs2_vote_work *work;
214
215 mlog_entry_void();
216
217 spin_lock(&osb->vote_task_lock);
218 /* grab this early so we know to try again if a state change and
219 * wake happens part-way through our work */
220 osb->vote_work_sequence = osb->vote_wake_sequence;
221
222 processed = osb->blocked_lock_count;
223 while (processed) {
224 BUG_ON(list_empty(&osb->blocked_lock_list));
225
226 lockres = list_entry(osb->blocked_lock_list.next,
227 struct ocfs2_lock_res, l_blocked_list);
228 list_del_init(&lockres->l_blocked_list);
229 osb->blocked_lock_count--;
230 spin_unlock(&osb->vote_task_lock);
231
232 BUG_ON(!processed);
233 processed--;
234
235 ocfs2_process_blocked_lock(osb, lockres);
236
237 spin_lock(&osb->vote_task_lock);
238 }
239
240 while (osb->vote_count) {
241 BUG_ON(list_empty(&osb->vote_list));
242 work = list_entry(osb->vote_list.next,
243 struct ocfs2_vote_work, w_list);
244 list_del(&work->w_list);
245 osb->vote_count--;
246 spin_unlock(&osb->vote_task_lock);
247
248 ocfs2_process_vote(osb, &work->w_msg);
249 kfree(work);
250
251 spin_lock(&osb->vote_task_lock);
252 }
253 spin_unlock(&osb->vote_task_lock);
254
255 mlog_exit_void();
256}
257
258static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
259{
260 int empty = 0;
261
262 spin_lock(&osb->vote_task_lock);
263 if (list_empty(&osb->blocked_lock_list) &&
264 list_empty(&osb->vote_list))
265 empty = 1;
266
267 spin_unlock(&osb->vote_task_lock);
268 return empty;
269}
270
271static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
272{
273 int should_wake = 0;
274
275 spin_lock(&osb->vote_task_lock);
276 if (osb->vote_work_sequence != osb->vote_wake_sequence)
277 should_wake = 1;
278 spin_unlock(&osb->vote_task_lock);
279
280 return should_wake;
281}
282
283int ocfs2_vote_thread(void *arg)
284{
285 int status = 0;
286 struct ocfs2_super *osb = arg;
287
288 /* only quit once we've been asked to stop and there is no more
289 * work available */
290 while (!(kthread_should_stop() &&
291 ocfs2_vote_thread_lists_empty(osb))) {
292
293 wait_event_interruptible(osb->vote_event,
294 ocfs2_vote_thread_should_wake(osb) ||
295 kthread_should_stop());
296
297 mlog(0, "vote_thread: awoken\n");
298
299 ocfs2_vote_thread_do_work(osb);
300 }
301
302 osb->vote_task = NULL;
303 return status;
304}
305
306static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
307{
308 struct ocfs2_net_wait_ctxt *w;
309
310 w = kzalloc(sizeof(*w), GFP_NOFS);
311 if (!w) {
312 mlog_errno(-ENOMEM);
313 goto bail;
314 }
315
316 INIT_LIST_HEAD(&w->n_list);
317 init_waitqueue_head(&w->n_event);
318 ocfs2_node_map_init(&w->n_node_map);
319 w->n_response_id = response_id;
320 w->n_callback = NULL;
321bail:
322 return w;
323}
324
325static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
326{
327 unsigned int ret;
328
329 spin_lock(&osb->net_response_lock);
330 ret = ++osb->net_response_ids;
331 spin_unlock(&osb->net_response_lock);
332
333 return ret;
334}
335
336static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
337 struct ocfs2_net_wait_ctxt *w)
338{
339 spin_lock(&osb->net_response_lock);
340 list_del(&w->n_list);
341 spin_unlock(&osb->net_response_lock);
342}
343
344static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
345 struct ocfs2_net_wait_ctxt *w)
346{
347 spin_lock(&osb->net_response_lock);
348 list_add_tail(&w->n_list,
349 &osb->net_response_list);
350 spin_unlock(&osb->net_response_lock);
351}
352
353static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
354 struct ocfs2_net_wait_ctxt *w,
355 int node_num)
356{
357 assert_spin_locked(&osb->net_response_lock);
358
359 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
360 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
361 wake_up(&w->n_event);
362}
363
364/* Intended to be called from the node down callback, we fake remove
365 * the node from all our response contexts */
366void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
367 int node_num)
368{
369 struct list_head *p;
370 struct ocfs2_net_wait_ctxt *w = NULL;
371
372 spin_lock(&osb->net_response_lock);
373
374 list_for_each(p, &osb->net_response_list) {
375 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
376
377 __ocfs2_mark_node_responded(osb, w, node_num);
378 }
379
380 spin_unlock(&osb->net_response_lock);
381}
382
383static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
384 struct ocfs2_vote_msg *request,
385 unsigned int response_id,
386 int *response,
387 struct ocfs2_net_response_cb *callback)
388{
389 int status, i, remote_err;
390 struct ocfs2_net_wait_ctxt *w = NULL;
391 int dequeued = 0;
392
393 mlog_entry_void();
394
395 w = ocfs2_new_net_wait_ctxt(response_id);
396 if (!w) {
397 status = -ENOMEM;
398 mlog_errno(status);
399 goto bail;
400 }
401 w->n_callback = callback;
402
403 /* we're pretty much ready to go at this point, and this fills
404 * in n_response which we need anyway... */
405 ocfs2_queue_net_wait_ctxt(osb, w);
406
407 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
408
409 while (i != O2NM_INVALID_NODE_NUM) {
410 if (i != osb->node_num) {
411 mlog(0, "trying to send request to node %i\n", i);
412 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
413
414 remote_err = 0;
415 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
416 osb->net_key,
417 request,
418 sizeof(*request),
419 i,
420 &remote_err);
421 if (status == -ETIMEDOUT) {
422 mlog(0, "remote node %d timed out!\n", i);
423 status = -EAGAIN;
424 goto bail;
425 }
426 if (remote_err < 0) {
427 status = remote_err;
428 mlog(0, "remote error %d on node %d!\n",
429 remote_err, i);
430 mlog_errno(status);
431 goto bail;
432 }
433 if (status < 0) {
434 mlog_errno(status);
435 goto bail;
436 }
437 }
438 i++;
439 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
440 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
441 }
442 mlog(0, "done sending, now waiting on responses...\n");
443
444 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
445
446 ocfs2_dequeue_net_wait_ctxt(osb, w);
447 dequeued = 1;
448
449 *response = w->n_response;
450 status = 0;
451bail:
452 if (w) {
453 if (!dequeued)
454 ocfs2_dequeue_net_wait_ctxt(osb, w);
455 kfree(w);
456 }
457
458 mlog_exit(status);
459 return status;
460}
461
462static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
463 u64 blkno,
464 unsigned int generation,
465 enum ocfs2_vote_request type)
466{
467 struct ocfs2_vote_msg *request;
468 struct ocfs2_msg_hdr *hdr;
469
470 BUG_ON(!ocfs2_is_valid_vote_request(type));
471
472 request = kzalloc(sizeof(*request), GFP_NOFS);
473 if (!request) {
474 mlog_errno(-ENOMEM);
475 } else {
476 hdr = &request->v_hdr;
477 hdr->h_node_num = cpu_to_be32(osb->node_num);
478 hdr->h_request = cpu_to_be32(type);
479 hdr->h_blkno = cpu_to_be64(blkno);
480 hdr->h_generation = cpu_to_be32(generation);
481 }
482
483 return request;
484}
485
486/* Complete the buildup of a new vote request and process the
487 * broadcast return value. */
488static int ocfs2_do_request_vote(struct ocfs2_super *osb,
489 struct ocfs2_vote_msg *request,
490 struct ocfs2_net_response_cb *callback)
491{
492 int status, response = -EBUSY;
493 unsigned int response_id;
494 struct ocfs2_msg_hdr *hdr;
495
496 response_id = ocfs2_new_response_id(osb);
497
498 hdr = &request->v_hdr;
499 hdr->h_response_id = cpu_to_be32(response_id);
500
501 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
502 callback);
503 if (status < 0) {
504 mlog_errno(status);
505 goto bail;
506 }
507
508 status = response;
509bail:
510
511 return status;
512}
513
514int ocfs2_request_mount_vote(struct ocfs2_super *osb)
515{
516 int status;
517 struct ocfs2_vote_msg *request = NULL;
518
519 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
520 if (!request) {
521 status = -ENOMEM;
522 goto bail;
523 }
524
525 status = -EAGAIN;
526 while (status == -EAGAIN) {
527 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
528 signal_pending(current)) {
529 status = -ERESTARTSYS;
530 goto bail;
531 }
532
533 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
534 osb->node_num)) {
535 status = 0;
536 goto bail;
537 }
538
539 status = ocfs2_do_request_vote(osb, request, NULL);
540 }
541
542bail:
543 kfree(request);
544 return status;
545}
546
547int ocfs2_request_umount_vote(struct ocfs2_super *osb)
548{
549 int status;
550 struct ocfs2_vote_msg *request = NULL;
551
552 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
553 if (!request) {
554 status = -ENOMEM;
555 goto bail;
556 }
557
558 status = -EAGAIN;
559 while (status == -EAGAIN) {
560 /* Do not check signals on this vote... We really want
561 * this one to go all the way through. */
562
563 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
564 osb->node_num)) {
565 status = 0;
566 goto bail;
567 }
568
569 status = ocfs2_do_request_vote(osb, request, NULL);
570 }
571
572bail:
573 kfree(request);
574 return status;
575}
576
577/* TODO: This should eventually be a hash table! */
578static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
579 u32 response_id)
580{
581 struct list_head *p;
582 struct ocfs2_net_wait_ctxt *w = NULL;
583
584 list_for_each(p, &osb->net_response_list) {
585 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
586 if (response_id == w->n_response_id)
587 break;
588 w = NULL;
589 }
590
591 return w;
592}
593
594/* Translate response codes into local node errno values */
595static inline int ocfs2_translate_response(int response)
596{
597 int ret;
598
599 switch (response) {
600 case OCFS2_RESPONSE_OK:
601 ret = 0;
602 break;
603
604 case OCFS2_RESPONSE_BUSY:
605 ret = -EBUSY;
606 break;
607
608 default:
609 ret = -EINVAL;
610 }
611
612 return ret;
613}
614
615static int ocfs2_handle_response_message(struct o2net_msg *msg,
616 u32 len,
617 void *data, void **ret_data)
618{
619 unsigned int response_id, node_num;
620 int response_status;
621 struct ocfs2_super *osb = data;
622 struct ocfs2_response_msg *resp;
623 struct ocfs2_net_wait_ctxt * w;
624 struct ocfs2_net_response_cb *resp_cb;
625
626 resp = (struct ocfs2_response_msg *) msg->buf;
627
628 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
629 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
630 response_status =
631 ocfs2_translate_response(be32_to_cpu(resp->r_response));
632
633 mlog(0, "received response message:\n");
634 mlog(0, "h_response_id = %u\n", response_id);
635 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
636 mlog(0, "h_blkno = %llu\n",
637 (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
638 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
639 mlog(0, "h_node_num = %u\n", node_num);
640 mlog(0, "r_response = %d\n", response_status);
641
642 spin_lock(&osb->net_response_lock);
643 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
644 if (!w) {
645 mlog(0, "request not found!\n");
646 goto bail;
647 }
648 resp_cb = w->n_callback;
649
650 if (response_status && (!w->n_response)) {
651 /* we only really need one negative response so don't
652 * set it twice. */
653 w->n_response = response_status;
654 }
655
656 if (resp_cb) {
657 spin_unlock(&osb->net_response_lock);
658
659 resp_cb->rc_cb(resp_cb->rc_priv, resp);
660
661 spin_lock(&osb->net_response_lock);
662 }
663
664 __ocfs2_mark_node_responded(osb, w, node_num);
665bail:
666 spin_unlock(&osb->net_response_lock);
667
668 return 0;
669}
670
671static int ocfs2_handle_vote_message(struct o2net_msg *msg,
672 u32 len,
673 void *data, void **ret_data)
674{
675 int status;
676 struct ocfs2_super *osb = data;
677 struct ocfs2_vote_work *work;
678
679 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
680 if (!work) {
681 status = -ENOMEM;
682 mlog_errno(status);
683 goto bail;
684 }
685
686 INIT_LIST_HEAD(&work->w_list);
687 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
688
689 mlog(0, "scheduling vote request:\n");
690 mlog(0, "h_response_id = %u\n",
691 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
692 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
693 mlog(0, "h_blkno = %llu\n",
694 (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
695 mlog(0, "h_generation = %u\n",
696 be32_to_cpu(work->w_msg.v_hdr.h_generation));
697 mlog(0, "h_node_num = %u\n",
698 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
699
700 spin_lock(&osb->vote_task_lock);
701 list_add_tail(&work->w_list, &osb->vote_list);
702 osb->vote_count++;
703 spin_unlock(&osb->vote_task_lock);
704
705 ocfs2_kick_vote_thread(osb);
706
707 status = 0;
708bail:
709 return status;
710}
711
712void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
713{
714 if (!osb->net_key)
715 return;
716
717 o2net_unregister_handler_list(&osb->osb_net_handlers);
718
719 if (!list_empty(&osb->net_response_list))
720 mlog(ML_ERROR, "net response list not empty!\n");
721
722 osb->net_key = 0;
723}
724
725int ocfs2_register_net_handlers(struct ocfs2_super *osb)
726{
727 int status = 0;
728
729 if (ocfs2_mount_local(osb))
730 return 0;
731
732 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
733 osb->net_key,
734 sizeof(struct ocfs2_response_msg),
735 ocfs2_handle_response_message,
736 osb, NULL, &osb->osb_net_handlers);
737 if (status) {
738 mlog_errno(status);
739 goto bail;
740 }
741
742 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
743 osb->net_key,
744 sizeof(struct ocfs2_vote_msg),
745 ocfs2_handle_vote_message,
746 osb, NULL, &osb->osb_net_handlers);
747 if (status) {
748 mlog_errno(status);
749 goto bail;
750 }
751bail:
752 if (status < 0)
753 ocfs2_unregister_net_handlers(osb);
754
755 return status;
756}