aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/acl.c36
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c9
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c532
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/tcp.c24
-rw-r--r--fs/ocfs2/dcache.c33
-rw-r--r--fs/ocfs2/dcache.h1
-rw-r--r--fs/ocfs2/dir.c24
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h30
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c21
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c401
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c49
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c1
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c87
-rw-r--r--fs/ocfs2/inode.c7
-rw-r--r--fs/ocfs2/inode.h12
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c15
-rw-r--r--fs/ocfs2/namei.c305
-rw-r--r--fs/ocfs2/ocfs2.h63
-rw-r--r--fs/ocfs2/ocfs2_fs.h83
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h103
-rw-r--r--fs/ocfs2/refcounttree.c73
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/reservations.c22
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/suballoc.c239
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/ocfs2/super.c170
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c60
-rw-r--r--fs/ocfs2/xattr.c6
46 files changed, 2563 insertions, 421 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e7..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
209 } 209 }
210 210
211 inode->i_mode = new_mode; 211 inode->i_mode = new_mode;
212 inode->i_ctime = CURRENT_TIME;
212 di->i_mode = cpu_to_le16(inode->i_mode); 213 di->i_mode = cpu_to_le16(inode->i_mode);
214 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
215 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
213 216
214 ocfs2_journal_dirty(handle, di_bh); 217 ocfs2_journal_dirty(handle, di_bh);
215 218
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
290 293
291int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask)
292{ 295{
293 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); 296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl;
299 int ret = -EAGAIN;
294 300
295 if (IS_ERR(acl)) 301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret;
303
304 ret = ocfs2_read_inode_block(inode, &di_bh);
305 if (ret < 0) {
306 mlog_errno(ret);
307 return ret;
308 }
309
310 acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
311
312 brelse(di_bh);
313
314 if (IS_ERR(acl)) {
315 mlog_errno(PTR_ERR(acl));
296 return PTR_ERR(acl); 316 return PTR_ERR(acl);
317 }
297 if (acl) { 318 if (acl) {
298 int ret = posix_acl_permission(inode, acl, mask); 319 ret = posix_acl_permission(inode, acl, mask);
299 posix_acl_release(acl); 320 posix_acl_release(acl);
300 return ret; 321 return ret;
301 } 322 }
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
344{ 365{
345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 366 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
346 struct posix_acl *acl = NULL; 367 struct posix_acl *acl = NULL;
347 int ret = 0; 368 int ret = 0, ret2;
348 mode_t mode; 369 mode_t mode;
349 370
350 if (!S_ISLNK(inode->i_mode)) { 371 if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
381 mode = inode->i_mode; 402 mode = inode->i_mode;
382 ret = posix_acl_create_masq(clone, &mode); 403 ret = posix_acl_create_masq(clone, &mode);
383 if (ret >= 0) { 404 if (ret >= 0) {
384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); 405 ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
406 if (ret2) {
407 mlog_errno(ret2);
408 ret = ret2;
409 goto cleanup;
410 }
385 if (ret > 0) { 411 if (ret > 0) {
386 ret = ocfs2_set_acl(handle, inode, 412 ret = ocfs2_set_acl(handle, inode,
387 di_bh, ACL_TYPE_ACCESS, 413 di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d85..592fae5007d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6672 last_page_bytes = PAGE_ALIGN(end); 6672 last_page_bytes = PAGE_ALIGN(end);
6673 index = start >> PAGE_CACHE_SHIFT; 6673 index = start >> PAGE_CACHE_SHIFT;
6674 do { 6674 do {
6675 pages[numpages] = grab_cache_page(mapping, index); 6675 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
6676 if (!pages[numpages]) { 6676 if (!pages[numpages]) {
6677 ret = -ENOMEM; 6677 ret = -ENOMEM;
6678 mlog_errno(ret); 6678 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..5cfeee118158 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
883 * out in so that future reads from that region will get 883 * out in so that future reads from that region will get
884 * zero's. 884 * zero's.
885 */ 885 */
886 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
887 unsigned int w_num_pages; 886 unsigned int w_num_pages;
887 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
888 struct page *w_target_page; 888 struct page *w_target_page;
889 889
890 /* 890 /*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1642 return ret; 1642 return ret;
1643} 1643}
1644 1644
1645int ocfs2_write_begin_nolock(struct address_space *mapping, 1645int ocfs2_write_begin_nolock(struct file *filp,
1646 struct address_space *mapping,
1646 loff_t pos, unsigned len, unsigned flags, 1647 loff_t pos, unsigned len, unsigned flags,
1647 struct page **pagep, void **fsdata, 1648 struct page **pagep, void **fsdata,
1648 struct buffer_head *di_bh, struct page *mmap_page) 1649 struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1692 mlog_errno(ret); 1693 mlog_errno(ret);
1693 goto out; 1694 goto out;
1694 } else if (ret == 1) { 1695 } else if (ret == 1) {
1695 ret = ocfs2_refcount_cow(inode, di_bh, 1696 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1696 wc->w_cpos, wc->w_clen, UINT_MAX); 1697 wc->w_cpos, wc->w_clen, UINT_MAX);
1697 if (ret) { 1698 if (ret) {
1698 mlog_errno(ret); 1699 mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1854 */ 1855 */
1855 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1856 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1856 1857
1857 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1858 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1858 fsdata, di_bh, NULL); 1859 fsdata, di_bh, NULL);
1859 if (ret) { 1860 if (ret) {
1860 mlog_errno(ret); 1861 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..7606f663da6d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
48 loff_t pos, unsigned len, unsigned copied, 48 loff_t pos, unsigned len, unsigned copied,
49 struct page *page, void *fsdata); 49 struct page *page, void *fsdata);
50 50
51int ocfs2_write_begin_nolock(struct address_space *mapping, 51int ocfs2_write_begin_nolock(struct file *filp,
52 struct address_space *mapping,
52 loff_t pos, unsigned len, unsigned flags, 53 loff_t pos, unsigned len, unsigned flags,
53 struct page **pagep, void **fsdata, 54 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 55 struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d12339593..c7ee03c22226 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
439 439
440 ocfs2_blockcheck_inc_failure(stats); 440 ocfs2_blockcheck_inc_failure(stats);
441 mlog(ML_ERROR, 441 mlog(ML_ERROR,
442 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", 442 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
443 (unsigned int)check.bc_crc32e, (unsigned int)crc); 443 (unsigned int)check.bc_crc32e, (unsigned int)crc);
444 444
445 /* Ok, try ECC fixups */ 445 /* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
453 goto out; 453 goto out;
454 } 454 }
455 455
456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", 456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
457 (unsigned int)check.bc_crc32e, (unsigned int)crc); 457 (unsigned int)check.bc_crc32e, (unsigned int)crc);
458 458
459 rc = -EIO; 459 rc = -EIO;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..52c7557f3e25 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
62static LIST_HEAD(o2hb_node_events); 62static LIST_HEAD(o2hb_node_events);
63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
64 64
65/*
66 * In global heartbeat, we maintain a series of region bitmaps.
67 * - o2hb_region_bitmap allows us to limit the region number to max region.
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
72 */
73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
77
78#define O2HB_DB_TYPE_LIVENODES 0
79#define O2HB_DB_TYPE_LIVEREGIONS 1
80#define O2HB_DB_TYPE_QUORUMREGIONS 2
81#define O2HB_DB_TYPE_FAILEDREGIONS 3
82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85struct o2hb_debug_buf {
86 int db_type;
87 int db_size;
88 int db_len;
89 void *db_data;
90};
91
92static struct o2hb_debug_buf *o2hb_db_livenodes;
93static struct o2hb_debug_buf *o2hb_db_liveregions;
94static struct o2hb_debug_buf *o2hb_db_quorumregions;
95static struct o2hb_debug_buf *o2hb_db_failedregions;
96
65#define O2HB_DEBUG_DIR "o2hb" 97#define O2HB_DEBUG_DIR "o2hb"
66#define O2HB_DEBUG_LIVENODES "livenodes" 98#define O2HB_DEBUG_LIVENODES "livenodes"
99#define O2HB_DEBUG_LIVEREGIONS "live_regions"
100#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
104
67static struct dentry *o2hb_debug_dir; 105static struct dentry *o2hb_debug_dir;
68static struct dentry *o2hb_debug_livenodes; 106static struct dentry *o2hb_debug_livenodes;
107static struct dentry *o2hb_debug_liveregions;
108static struct dentry *o2hb_debug_quorumregions;
109static struct dentry *o2hb_debug_failedregions;
69 110
70static LIST_HEAD(o2hb_all_regions); 111static LIST_HEAD(o2hb_all_regions);
71 112
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
77 118
78#define O2HB_DEFAULT_BLOCK_BITS 9 119#define O2HB_DEFAULT_BLOCK_BITS 9
79 120
121enum o2hb_heartbeat_modes {
122 O2HB_HEARTBEAT_LOCAL = 0,
123 O2HB_HEARTBEAT_GLOBAL,
124 O2HB_HEARTBEAT_NUM_MODES,
125};
126
127char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
128 "local", /* O2HB_HEARTBEAT_LOCAL */
129 "global", /* O2HB_HEARTBEAT_GLOBAL */
130};
131
80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
81 134
82/* Only sets a new threshold if there are no active regions. 135/* Only sets a new threshold if there are no active regions.
83 * 136 *
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
94 } 147 }
95} 148}
96 149
150static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
151{
152 int ret = -1;
153
154 if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
155 spin_lock(&o2hb_live_lock);
156 if (list_empty(&o2hb_all_regions)) {
157 o2hb_heartbeat_mode = hb_mode;
158 ret = 0;
159 }
160 spin_unlock(&o2hb_live_lock);
161 }
162
163 return ret;
164}
165
97struct o2hb_node_event { 166struct o2hb_node_event {
98 struct list_head hn_item; 167 struct list_head hn_item;
99 enum o2hb_callback_type hn_event_type; 168 enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
135 struct block_device *hr_bdev; 204 struct block_device *hr_bdev;
136 struct o2hb_disk_slot *hr_slots; 205 struct o2hb_disk_slot *hr_slots;
137 206
207 /* live node map of this region */
208 unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
209 unsigned int hr_region_num;
210
211 struct dentry *hr_debug_dir;
212 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time;
215 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time;
218
138 /* let the person setting up hb wait for it to return until it 219 /* let the person setting up hb wait for it to return until it
139 * has reached a 'steady' state. This will be fixed when we have 220 * has reached a 'steady' state. This will be fixed when we have
140 * a more complete api that doesn't lead to this sort of fragility. */ 221 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
163 int wc_error; 244 int wc_error;
164}; 245};
165 246
247static int o2hb_pop_count(void *map, int count)
248{
249 int i = -1, pop = 0;
250
251 while ((i = find_next_bit(map, count, i + 1)) < count)
252 pop++;
253 return pop;
254}
255
166static void o2hb_write_timeout(struct work_struct *work) 256static void o2hb_write_timeout(struct work_struct *work)
167{ 257{
258 int failed, quorum;
259 unsigned long flags;
168 struct o2hb_region *reg = 260 struct o2hb_region *reg =
169 container_of(work, struct o2hb_region, 261 container_of(work, struct o2hb_region,
170 hr_write_timeout_work.work); 262 hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 264 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
173 "milliseconds\n", reg->hr_dev_name, 265 "milliseconds\n", reg->hr_dev_name,
174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 266 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
267
268 if (o2hb_global_heartbeat_active()) {
269 spin_lock_irqsave(&o2hb_live_lock, flags);
270 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
271 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
272 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
273 O2NM_MAX_REGIONS);
274 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
275 O2NM_MAX_REGIONS);
276 spin_unlock_irqrestore(&o2hb_live_lock, flags);
277
278 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
279 quorum, failed);
280
281 /*
282 * Fence if the number of failed regions >= half the number
283 * of quorum regions
284 */
285 if ((failed << 1) < quorum)
286 return;
287 }
288
175 o2quo_disk_timeout(); 289 o2quo_disk_timeout();
176} 290}
177 291
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 294 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS); 295 O2HB_MAX_WRITE_TIMEOUT_MS);
182 296
297 if (o2hb_global_heartbeat_active()) {
298 spin_lock(&o2hb_live_lock);
299 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
300 spin_unlock(&o2hb_live_lock);
301 }
183 cancel_delayed_work(&reg->hr_write_timeout_work); 302 cancel_delayed_work(&reg->hr_write_timeout_work);
184 reg->hr_last_timeout_start = jiffies; 303 reg->hr_last_timeout_start = jiffies;
185 schedule_delayed_work(&reg->hr_write_timeout_work, 304 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
513{ 632{
514 assert_spin_locked(&o2hb_live_lock); 633 assert_spin_locked(&o2hb_live_lock);
515 634
635 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
636
516 event->hn_event_type = type; 637 event->hn_event_type = type;
517 event->hn_node = node; 638 event->hn_node = node;
518 event->hn_node_num = node_num; 639 event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
554 o2nm_node_put(node); 675 o2nm_node_put(node);
555} 676}
556 677
678static void o2hb_set_quorum_device(struct o2hb_region *reg,
679 struct o2hb_disk_slot *slot)
680{
681 assert_spin_locked(&o2hb_live_lock);
682
683 if (!o2hb_global_heartbeat_active())
684 return;
685
686 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
687 return;
688
689 /*
690 * A region can be added to the quorum only when it sees all
691 * live nodes heartbeat on it. In other words, the region has been
692 * added to all nodes.
693 */
694 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
695 sizeof(o2hb_live_node_bitmap)))
696 return;
697
698 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
699 return;
700
701 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
702 config_item_name(&reg->hr_item));
703
704 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
705}
706
557static int o2hb_check_slot(struct o2hb_region *reg, 707static int o2hb_check_slot(struct o2hb_region *reg,
558 struct o2hb_disk_slot *slot) 708 struct o2hb_disk_slot *slot)
559{ 709{
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
565 u64 cputime; 715 u64 cputime;
566 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 716 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
567 unsigned int slot_dead_ms; 717 unsigned int slot_dead_ms;
718 int tmp;
568 719
569 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 720 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
570 721
571 /* Is this correct? Do we assume that the node doesn't exist 722 /*
572 * if we're not configured for him? */ 723 * If a node is no longer configured but is still in the livemap, we
724 * may need to clear that bit from the livemap.
725 */
573 node = o2nm_get_node_by_num(slot->ds_node_num); 726 node = o2nm_get_node_by_num(slot->ds_node_num);
574 if (!node) 727 if (!node) {
575 return 0; 728 spin_lock(&o2hb_live_lock);
729 tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
730 spin_unlock(&o2hb_live_lock);
731 if (!tmp)
732 return 0;
733 }
576 734
577 if (!o2hb_verify_crc(reg, hb_block)) { 735 if (!o2hb_verify_crc(reg, hb_block)) {
578 /* all paths from here will drop o2hb_live_lock for 736 /* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
639 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", 797 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
640 slot->ds_node_num, (long long)slot->ds_last_generation); 798 slot->ds_node_num, (long long)slot->ds_last_generation);
641 799
800 set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
801
642 /* first on the list generates a callback */ 802 /* first on the list generates a callback */
643 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 803 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
804 mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
805 "bitmap\n", slot->ds_node_num);
644 set_bit(slot->ds_node_num, o2hb_live_node_bitmap); 806 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
645 807
646 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, 808 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
684 mlog(ML_HEARTBEAT, "Node %d left my region\n", 846 mlog(ML_HEARTBEAT, "Node %d left my region\n",
685 slot->ds_node_num); 847 slot->ds_node_num);
686 848
849 clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
850
687 /* last off the live_slot generates a callback */ 851 /* last off the live_slot generates a callback */
688 list_del_init(&slot->ds_live_item); 852 list_del_init(&slot->ds_live_item);
689 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 853 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
854 mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
855 "nodes bitmap\n", slot->ds_node_num);
690 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 856 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
691 857
692 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 858 /* node can be null */
693 slot->ds_node_num); 859 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
860 node, slot->ds_node_num);
694 861
695 changed = 1; 862 changed = 1;
696 } 863 }
@@ -706,11 +873,14 @@ fire_callbacks:
706 slot->ds_equal_samples = 0; 873 slot->ds_equal_samples = 0;
707 } 874 }
708out: 875out:
876 o2hb_set_quorum_device(reg, slot);
877
709 spin_unlock(&o2hb_live_lock); 878 spin_unlock(&o2hb_live_lock);
710 879
711 o2hb_run_event_list(&event); 880 o2hb_run_event_list(&event);
712 881
713 o2nm_node_put(node); 882 if (node)
883 o2nm_node_put(node);
714 return changed; 884 return changed;
715} 885}
716 886
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
737{ 907{
738 int i, ret, highest_node, change = 0; 908 int i, ret, highest_node, change = 0;
739 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 909 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
910 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
740 struct o2hb_bio_wait_ctxt write_wc; 911 struct o2hb_bio_wait_ctxt write_wc;
741 912
742 ret = o2nm_configured_node_map(configured_nodes, 913 ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
746 return ret; 917 return ret;
747 } 918 }
748 919
920 /*
921 * If a node is not configured but is in the livemap, we still need
922 * to read the slot so as to be able to remove it from the livemap.
923 */
924 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
925 i = -1;
926 while ((i = find_next_bit(live_node_bitmap,
927 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
928 set_bit(i, configured_nodes);
929 }
930
749 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 931 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
750 if (highest_node >= O2NM_MAX_NODES) { 932 if (highest_node >= O2NM_MAX_NODES) {
751 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 933 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
917#ifdef CONFIG_DEBUG_FS 1099#ifdef CONFIG_DEBUG_FS
918static int o2hb_debug_open(struct inode *inode, struct file *file) 1100static int o2hb_debug_open(struct inode *inode, struct file *file)
919{ 1101{
1102 struct o2hb_debug_buf *db = inode->i_private;
1103 struct o2hb_region *reg;
920 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1104 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
921 char *buf = NULL; 1105 char *buf = NULL;
922 int i = -1; 1106 int i = -1;
923 int out = 0; 1107 int out = 0;
924 1108
1109 /* max_nodes should be the largest bitmap we pass here */
1110 BUG_ON(sizeof(map) < db->db_size);
1111
925 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1112 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
926 if (!buf) 1113 if (!buf)
927 goto bail; 1114 goto bail;
928 1115
929 o2hb_fill_node_map(map, sizeof(map)); 1116 switch (db->db_type) {
1117 case O2HB_DB_TYPE_LIVENODES:
1118 case O2HB_DB_TYPE_LIVEREGIONS:
1119 case O2HB_DB_TYPE_QUORUMREGIONS:
1120 case O2HB_DB_TYPE_FAILEDREGIONS:
1121 spin_lock(&o2hb_live_lock);
1122 memcpy(map, db->db_data, db->db_size);
1123 spin_unlock(&o2hb_live_lock);
1124 break;
1125
1126 case O2HB_DB_TYPE_REGION_LIVENODES:
1127 spin_lock(&o2hb_live_lock);
1128 reg = (struct o2hb_region *)db->db_data;
1129 memcpy(map, reg->hr_live_node_bitmap, db->db_size);
1130 spin_unlock(&o2hb_live_lock);
1131 break;
1132
1133 case O2HB_DB_TYPE_REGION_NUMBER:
1134 reg = (struct o2hb_region *)db->db_data;
1135 out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
1136 reg->hr_region_num);
1137 goto done;
1138
1139 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1140 reg = (struct o2hb_region *)db->db_data;
1141 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1142 jiffies_to_msecs(jiffies -
1143 reg->hr_last_timeout_start));
1144 goto done;
1145
1146 default:
1147 goto done;
1148 }
930 1149
931 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) 1150 while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
932 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 1151 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
933 out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 1152 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
934 1153
1154done:
935 i_size_write(inode, out); 1155 i_size_write(inode, out);
936 1156
937 file->private_data = buf; 1157 file->private_data = buf;
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
978 1198
979void o2hb_exit(void) 1199void o2hb_exit(void)
980{ 1200{
981 if (o2hb_debug_livenodes) 1201 kfree(o2hb_db_livenodes);
982 debugfs_remove(o2hb_debug_livenodes); 1202 kfree(o2hb_db_liveregions);
983 if (o2hb_debug_dir) 1203 kfree(o2hb_db_quorumregions);
984 debugfs_remove(o2hb_debug_dir); 1204 kfree(o2hb_db_failedregions);
1205 debugfs_remove(o2hb_debug_failedregions);
1206 debugfs_remove(o2hb_debug_quorumregions);
1207 debugfs_remove(o2hb_debug_liveregions);
1208 debugfs_remove(o2hb_debug_livenodes);
1209 debugfs_remove(o2hb_debug_dir);
1210}
1211
1212static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
1213 struct o2hb_debug_buf **db, int db_len,
1214 int type, int size, int len, void *data)
1215{
1216 *db = kmalloc(db_len, GFP_KERNEL);
1217 if (!*db)
1218 return NULL;
1219
1220 (*db)->db_type = type;
1221 (*db)->db_size = size;
1222 (*db)->db_len = len;
1223 (*db)->db_data = data;
1224
1225 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
1226 &o2hb_debug_fops);
1227}
1228
1229static int o2hb_debug_init(void)
1230{
1231 int ret = -ENOMEM;
1232
1233 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1234 if (!o2hb_debug_dir) {
1235 mlog_errno(ret);
1236 goto bail;
1237 }
1238
1239 o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1240 o2hb_debug_dir,
1241 &o2hb_db_livenodes,
1242 sizeof(*o2hb_db_livenodes),
1243 O2HB_DB_TYPE_LIVENODES,
1244 sizeof(o2hb_live_node_bitmap),
1245 O2NM_MAX_NODES,
1246 o2hb_live_node_bitmap);
1247 if (!o2hb_debug_livenodes) {
1248 mlog_errno(ret);
1249 goto bail;
1250 }
1251
1252 o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
1253 o2hb_debug_dir,
1254 &o2hb_db_liveregions,
1255 sizeof(*o2hb_db_liveregions),
1256 O2HB_DB_TYPE_LIVEREGIONS,
1257 sizeof(o2hb_live_region_bitmap),
1258 O2NM_MAX_REGIONS,
1259 o2hb_live_region_bitmap);
1260 if (!o2hb_debug_liveregions) {
1261 mlog_errno(ret);
1262 goto bail;
1263 }
1264
1265 o2hb_debug_quorumregions =
1266 o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
1267 o2hb_debug_dir,
1268 &o2hb_db_quorumregions,
1269 sizeof(*o2hb_db_quorumregions),
1270 O2HB_DB_TYPE_QUORUMREGIONS,
1271 sizeof(o2hb_quorum_region_bitmap),
1272 O2NM_MAX_REGIONS,
1273 o2hb_quorum_region_bitmap);
1274 if (!o2hb_debug_quorumregions) {
1275 mlog_errno(ret);
1276 goto bail;
1277 }
1278
1279 o2hb_debug_failedregions =
1280 o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
1281 o2hb_debug_dir,
1282 &o2hb_db_failedregions,
1283 sizeof(*o2hb_db_failedregions),
1284 O2HB_DB_TYPE_FAILEDREGIONS,
1285 sizeof(o2hb_failed_region_bitmap),
1286 O2NM_MAX_REGIONS,
1287 o2hb_failed_region_bitmap);
1288 if (!o2hb_debug_failedregions) {
1289 mlog_errno(ret);
1290 goto bail;
1291 }
1292
1293 ret = 0;
1294bail:
1295 if (ret)
1296 o2hb_exit();
1297
1298 return ret;
985} 1299}
986 1300
987int o2hb_init(void) 1301int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
997 INIT_LIST_HEAD(&o2hb_node_events); 1311 INIT_LIST_HEAD(&o2hb_node_events);
998 1312
999 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 1313 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
1314 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
1315 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
1316 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1317 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1000 1318
1001 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1319 return o2hb_debug_init();
1002 if (!o2hb_debug_dir) {
1003 mlog_errno(-ENOMEM);
1004 return -ENOMEM;
1005 }
1006
1007 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1008 S_IFREG|S_IRUSR,
1009 o2hb_debug_dir, NULL,
1010 &o2hb_debug_fops);
1011 if (!o2hb_debug_livenodes) {
1012 mlog_errno(-ENOMEM);
1013 debugfs_remove(o2hb_debug_dir);
1014 return -ENOMEM;
1015 }
1016
1017 return 0;
1018} 1320}
1019 1321
1020/* if we're already in a callback then we're already serialized by the sem */ 1322/* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
1078 if (reg->hr_slots) 1380 if (reg->hr_slots)
1079 kfree(reg->hr_slots); 1381 kfree(reg->hr_slots);
1080 1382
1383 kfree(reg->hr_db_regnum);
1384 kfree(reg->hr_db_livenodes);
1385 debugfs_remove(reg->hr_debug_livenodes);
1386 debugfs_remove(reg->hr_debug_regnum);
1387 debugfs_remove(reg->hr_debug_elapsed_time);
1388 debugfs_remove(reg->hr_debug_dir);
1389
1081 spin_lock(&o2hb_live_lock); 1390 spin_lock(&o2hb_live_lock);
1082 list_del(&reg->hr_all_item); 1391 list_del(&reg->hr_all_item);
1083 spin_unlock(&o2hb_live_lock); 1392 spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1441 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1750 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1442 spin_lock(&o2hb_live_lock); 1751 spin_lock(&o2hb_live_lock);
1443 hb_task = reg->hr_task; 1752 hb_task = reg->hr_task;
1753 if (o2hb_global_heartbeat_active())
1754 set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1444 spin_unlock(&o2hb_live_lock); 1755 spin_unlock(&o2hb_live_lock);
1445 1756
1446 if (hb_task) 1757 if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1448 else 1759 else
1449 ret = -EIO; 1760 ret = -EIO;
1450 1761
1762 if (hb_task && o2hb_global_heartbeat_active())
1763 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1764 config_item_name(&reg->hr_item));
1765
1451out: 1766out:
1452 if (filp) 1767 if (filp)
1453 fput(filp); 1768 fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1586 : NULL; 1901 : NULL;
1587} 1902}
1588 1903
1904static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1905{
1906 int ret = -ENOMEM;
1907
1908 reg->hr_debug_dir =
1909 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
1910 if (!reg->hr_debug_dir) {
1911 mlog_errno(ret);
1912 goto bail;
1913 }
1914
1915 reg->hr_debug_livenodes =
1916 o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1917 reg->hr_debug_dir,
1918 &(reg->hr_db_livenodes),
1919 sizeof(*(reg->hr_db_livenodes)),
1920 O2HB_DB_TYPE_REGION_LIVENODES,
1921 sizeof(reg->hr_live_node_bitmap),
1922 O2NM_MAX_NODES, reg);
1923 if (!reg->hr_debug_livenodes) {
1924 mlog_errno(ret);
1925 goto bail;
1926 }
1927
1928 reg->hr_debug_regnum =
1929 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
1930 reg->hr_debug_dir,
1931 &(reg->hr_db_regnum),
1932 sizeof(*(reg->hr_db_regnum)),
1933 O2HB_DB_TYPE_REGION_NUMBER,
1934 0, O2NM_MAX_NODES, reg);
1935 if (!reg->hr_debug_regnum) {
1936 mlog_errno(ret);
1937 goto bail;
1938 }
1939
1940 reg->hr_debug_elapsed_time =
1941 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
1942 reg->hr_debug_dir,
1943 &(reg->hr_db_elapsed_time),
1944 sizeof(*(reg->hr_db_elapsed_time)),
1945 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
1946 0, 0, reg);
1947 if (!reg->hr_debug_elapsed_time) {
1948 mlog_errno(ret);
1949 goto bail;
1950 }
1951
1952 ret = 0;
1953bail:
1954 return ret;
1955}
1956
1589static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 1957static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1590 const char *name) 1958 const char *name)
1591{ 1959{
1592 struct o2hb_region *reg = NULL; 1960 struct o2hb_region *reg = NULL;
1961 int ret;
1593 1962
1594 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 1963 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1595 if (reg == NULL) 1964 if (reg == NULL)
1596 return ERR_PTR(-ENOMEM); 1965 return ERR_PTR(-ENOMEM);
1597 1966
1598 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
1968 return ERR_PTR(-ENAMETOOLONG);
1599 1969
1600 spin_lock(&o2hb_live_lock); 1970 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0;
1972 if (o2hb_global_heartbeat_active()) {
1973 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
1974 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG);
1978 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 }
1601 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 1981 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1602 spin_unlock(&o2hb_live_lock); 1982 spin_unlock(&o2hb_live_lock);
1603 1983
1984 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1985
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) {
1988 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret);
1990 }
1991
1604 return &reg->hr_item; 1992 return &reg->hr_item;
1605} 1993}
1606 1994
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1612 2000
1613 /* stop the thread when the user removes the region dir */ 2001 /* stop the thread when the user removes the region dir */
1614 spin_lock(&o2hb_live_lock); 2002 spin_lock(&o2hb_live_lock);
2003 if (o2hb_global_heartbeat_active()) {
2004 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2005 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2006 }
1615 hb_task = reg->hr_task; 2007 hb_task = reg->hr_task;
1616 reg->hr_task = NULL; 2008 reg->hr_task = NULL;
1617 spin_unlock(&o2hb_live_lock); 2009 spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1628 wake_up(&o2hb_steady_queue); 2020 wake_up(&o2hb_steady_queue);
1629 } 2021 }
1630 2022
2023 if (o2hb_global_heartbeat_active())
2024 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2025 config_item_name(&reg->hr_item));
1631 config_item_put(item); 2026 config_item_put(item);
1632} 2027}
1633 2028
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
1688 return count; 2083 return count;
1689} 2084}
1690 2085
2086static
2087ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
2088 char *page)
2089{
2090 return sprintf(page, "%s\n",
2091 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
2092}
2093
2094static
2095ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2096 const char *page, size_t count)
2097{
2098 unsigned int i;
2099 int ret;
2100 size_t len;
2101
2102 len = (page[count - 1] == '\n') ? count - 1 : count;
2103 if (!len)
2104 return -EINVAL;
2105
2106 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2107 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2108 continue;
2109
2110 ret = o2hb_global_hearbeat_mode_set(i);
2111 if (!ret)
2112 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2113 o2hb_heartbeat_mode_desc[i]);
2114 return count;
2115 }
2116
2117 return -EINVAL;
2118
2119}
2120
1691static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { 2121static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1692 .attr = { .ca_owner = THIS_MODULE, 2122 .attr = { .ca_owner = THIS_MODULE,
1693 .ca_name = "dead_threshold", 2123 .ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
1696 .store = o2hb_heartbeat_group_threshold_store, 2126 .store = o2hb_heartbeat_group_threshold_store,
1697}; 2127};
1698 2128
2129static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2130 .attr = { .ca_owner = THIS_MODULE,
2131 .ca_name = "mode",
2132 .ca_mode = S_IRUGO | S_IWUSR },
2133 .show = o2hb_heartbeat_group_mode_show,
2134 .store = o2hb_heartbeat_group_mode_store,
2135};
2136
1699static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { 2137static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1700 &o2hb_heartbeat_group_attr_threshold.attr, 2138 &o2hb_heartbeat_group_attr_threshold.attr,
2139 &o2hb_heartbeat_group_attr_mode.attr,
1701 NULL, 2140 NULL,
1702}; 2141};
1703 2142
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
1963 spin_unlock(&o2hb_live_lock); 2402 spin_unlock(&o2hb_live_lock);
1964} 2403}
1965EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); 2404EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
2405
2406int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2407{
2408 struct o2hb_region *reg;
2409 int numregs = 0;
2410 char *p;
2411
2412 spin_lock(&o2hb_live_lock);
2413
2414 p = region_uuids;
2415 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2416 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2417 if (numregs < max_regions) {
2418 memcpy(p, config_item_name(&reg->hr_item),
2419 O2HB_MAX_REGION_NAME_LEN);
2420 p += O2HB_MAX_REGION_NAME_LEN;
2421 }
2422 numregs++;
2423 }
2424
2425 spin_unlock(&o2hb_live_lock);
2426
2427 return numregs;
2428}
2429EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
2430
2431int o2hb_global_heartbeat_active(void)
2432{
2433 return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
2434}
2435EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
31 31
32#define O2HB_REGION_TIMEOUT_MS 2000 32#define O2HB_REGION_TIMEOUT_MS 2000
33 33
34#define O2HB_MAX_REGION_NAME_LEN 32
35
34/* number of changes to be seen as live */ 36/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2 37#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 38/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
81int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
82int o2hb_check_local_node_heartbeating(void); 84int o2hb_check_local_node_heartbeating(void);
83void o2hb_stop_all_regions(void); 85void o2hb_stop_all_regions(void);
86int o2hb_get_all_regions(char *region_uuids, u8 numregions);
87int o2hb_global_heartbeat_active(void);
84 88
85#endif /* O2CLUSTER_HEARTBEAT_H */ 89#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..ea2ed9f56c94 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ 122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
123 124
124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 125#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 126#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
712 spin_lock_init(&node->nd_lock); 712 spin_lock_init(&node->nd_lock);
713 713
714 mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
715
714 return &node->nd_item; 716 return &node->nd_item;
715} 717}
716 718
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
744 } 746 }
745 write_unlock(&cluster->cl_nodes_lock); 747 write_unlock(&cluster->cl_nodes_lock);
746 748
749 mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
750 config_item_name(&node->nd_item));
751
747 config_item_put(item); 752 config_item_put(item);
748} 753}
749 754
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
36/* host name, group name, cluster name all 64 bytes */ 36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN 37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38 38
39/*
40 * Maximum number of global heartbeat regions allowed.
41 * **CAUTION** Changing this number will break dlm compatibility.
42 */
43#define O2NM_MAX_REGIONS 32
44
39#endif /* _OCFS2_NODEMANAGER_H */ 45#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78da..9aa426e42123 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
978 size_t caller_veclen, u8 target_node, int *status) 978 size_t caller_veclen, u8 target_node, int *status)
979{ 979{
980 int ret; 980 int ret = 0;
981 struct o2net_msg *msg = NULL; 981 struct o2net_msg *msg = NULL;
982 size_t veclen, caller_bytes = 0; 982 size_t veclen, caller_bytes = 0;
983 struct kvec *vec = NULL; 983 struct kvec *vec = NULL;
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1696{
1697 o2quo_hb_down(node_num); 1697 o2quo_hb_down(node_num);
1698 1698
1699 if (!node)
1700 return;
1701
1699 if (node_num != o2nm_this_node()) 1702 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1703 o2net_disconnect_node(node);
1701 1704
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1712
1710 o2quo_hb_up(node_num); 1713 o2quo_hb_up(node_num);
1711 1714
1715 BUG_ON(!node);
1716
1712 /* ensure an immediate connect attempt */ 1717 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1718 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1719 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1759,6 +1764,7 @@ static int o2net_accept_one(struct socket *sock)
1759 struct sockaddr_in sin; 1764 struct sockaddr_in sin;
1760 struct socket *new_sock = NULL; 1765 struct socket *new_sock = NULL;
1761 struct o2nm_node *node = NULL; 1766 struct o2nm_node *node = NULL;
1767 struct o2nm_node *local_node = NULL;
1762 struct o2net_sock_container *sc = NULL; 1768 struct o2net_sock_container *sc = NULL;
1763 struct o2net_node *nn; 1769 struct o2net_node *nn;
1764 1770
@@ -1796,11 +1802,15 @@ static int o2net_accept_one(struct socket *sock)
1796 goto out; 1802 goto out;
1797 } 1803 }
1798 1804
1799 if (o2nm_this_node() > node->nd_num) { 1805 if (o2nm_this_node() >= node->nd_num) {
1800 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1806 local_node = o2nm_get_node_by_num(o2nm_this_node());
1801 "numbered node '%s' at " "%pI4:%d with num %u\n", 1807 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
1802 node->nd_name, &sin.sin_addr.s_addr, 1808 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
1803 ntohs(sin.sin_port), node->nd_num); 1809 local_node->nd_name, local_node->nd_num,
1810 &(local_node->nd_ipv4_address),
1811 ntohs(local_node->nd_ipv4_port),
1812 node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
1813 ntohs(sin.sin_port));
1804 ret = -EINVAL; 1814 ret = -EINVAL;
1805 goto out; 1815 goto out;
1806 } 1816 }
@@ -1857,6 +1867,8 @@ out:
1857 sock_release(new_sock); 1867 sock_release(new_sock);
1858 if (node) 1868 if (node)
1859 o2nm_node_put(node); 1869 o2nm_node_put(node);
1870 if (local_node)
1871 o2nm_node_put(local_node);
1860 if (sc) 1872 if (sc)
1861 sc_put(sc); 1873 sc_put(sc);
1862 return ret; 1874 return ret;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..edaded48e7e9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
40#include "inode.h" 40#include "inode.h"
41#include "super.h" 41#include "super.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{
45 unsigned long gen =
46 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
47 BUG_ON(dentry->d_inode);
48 dentry->d_fsdata = (void *)gen;
49}
50
43 51
44static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
45 struct nameidata *nd) 53 struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
51 mlog_entry("(0x%p, '%.*s')\n", dentry, 59 mlog_entry("(0x%p, '%.*s')\n", dentry,
52 dentry->d_name.len, dentry->d_name.name); 60 dentry->d_name.len, dentry->d_name.name);
53 61
54 /* Never trust a negative dentry - force a new lookup. */ 62 /* For a negative dentry -
63 * check the generation number of the parent and compare with the
64 * one stored in the inode.
65 */
55 if (inode == NULL) { 66 if (inode == NULL) {
56 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, 67 unsigned long gen = (unsigned long) dentry->d_fsdata;
57 dentry->d_name.name); 68 unsigned long pgen =
58 goto bail; 69 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
70 mlog(0, "negative dentry: %.*s parent gen: %lu "
71 "dentry gen: %lu\n",
72 dentry->d_name.len, dentry->d_name.name, pgen, gen);
73 if (gen != pgen)
74 goto bail;
75 goto valid;
59 } 76 }
60 77
61 BUG_ON(!osb); 78 BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
96 goto bail; 113 goto bail;
97 } 114 }
98 115
116valid:
99 ret = 1; 117 ret = 1;
100 118
101bail: 119bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
227 if (!inode) 245 if (!inode)
228 return 0; 246 return 0;
229 247
248 if (!dentry->d_inode && dentry->d_fsdata) {
249 /* Converting a negative dentry to positive
250 Clear dentry->d_fsdata */
251 dentry->d_fsdata = dl = NULL;
252 }
253
230 if (dl) { 254 if (dl) {
231 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, 255 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
232 " \"%.*s\": old parent: %llu, new: %llu\n", 256 " \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
452 476
453out: 477out:
454 iput(inode); 478 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
455} 480}
456 481
457/* 482/*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
64 struct inode *old_dir, struct inode *new_dir); 64 struct inode *old_dir, struct inode *new_dir);
65 65
66extern spinlock_t dentry_attach_lock; 66extern spinlock_t dentry_attach_lock;
67void ocfs2_dentry_attach_gen(struct dentry *dentry);
67 68
68#endif /* OCFS2_DCACHE_H */ 69#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4a..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3931 goto out_commit; 3931 goto out_commit;
3932 } 3932 }
3933 3933
3934 cpos = split_hash;
3935 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3936 data_ac, meta_ac, new_dx_leaves,
3937 num_dx_leaves);
3938 if (ret) {
3939 mlog_errno(ret);
3940 goto out_commit;
3941 }
3942
3934 for (i = 0; i < num_dx_leaves; i++) { 3943 for (i = 0; i < num_dx_leaves; i++) {
3935 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3944 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3936 orig_dx_leaves[i], 3945 orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3939 mlog_errno(ret); 3948 mlog_errno(ret);
3940 goto out_commit; 3949 goto out_commit;
3941 } 3950 }
3942 }
3943 3951
3944 cpos = split_hash; 3952 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3945 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3953 new_dx_leaves[i],
3946 data_ac, meta_ac, new_dx_leaves, 3954 OCFS2_JOURNAL_ACCESS_WRITE);
3947 num_dx_leaves); 3955 if (ret) {
3948 if (ret) { 3956 mlog_errno(ret);
3949 mlog_errno(ret); 3957 goto out_commit;
3950 goto out_commit; 3958 }
3951 } 3959 }
3952 3960
3953 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3961 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b47..b36d0bf77a5a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG, /* 515 */
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG, /* 516 */
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG, /* 517 */
448 DLM_FINALIZE_RECO_MSG /* 518 */ 448 DLM_FINALIZE_RECO_MSG, /* 518 */
449 DLM_QUERY_REGION, /* 519 */
450 DLM_QUERY_NODEINFO, /* 520 */
449}; 451};
450 452
451struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
727 u8 domain[O2NM_MAX_NAME_LEN]; 729 u8 domain[O2NM_MAX_NAME_LEN];
728}; 730};
729 731
732struct dlm_query_region {
733 u8 qr_node;
734 u8 qr_numregions;
735 u8 qr_namelen;
736 u8 pad1;
737 u8 qr_domain[O2NM_MAX_NAME_LEN];
738 u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
739};
740
741struct dlm_node_info {
742 u8 ni_nodenum;
743 u8 pad1;
744 u16 ni_ipv4_port;
745 u32 ni_ipv4_address;
746};
747
748struct dlm_query_nodeinfo {
749 u8 qn_nodenum;
750 u8 qn_numnodes;
751 u8 qn_namelen;
752 u8 pad1;
753 u8 qn_domain[O2NM_MAX_NAME_LEN];
754 struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
755};
756
730struct dlm_exit_domain 757struct dlm_exit_domain
731{ 758{
732 u8 node_idx; 759 u8 node_idx;
@@ -1030,6 +1057,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
1030 struct dlm_lock_resource *res); 1057 struct dlm_lock_resource *res);
1031void dlm_clean_master_list(struct dlm_ctxt *dlm, 1058void dlm_clean_master_list(struct dlm_ctxt *dlm,
1032 u8 dead_node); 1059 u8 dead_node);
1060void dlm_force_free_mles(struct dlm_ctxt *dlm);
1033int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 1061int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
1034int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 1062int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
1035int __dlm_lockres_unused(struct dlm_lock_resource *res); 1063int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..272ec8631a51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
493 struct hlist_head *bucket; 493 struct hlist_head *bucket;
494 struct hlist_node *list; 494 struct hlist_node *list;
495 int i, out = 0; 495 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 496 unsigned long total = 0, longest = 0, bucket_count = 0;
497 497
498 out += snprintf(db->buf + out, db->len - out, 498 out += snprintf(db->buf + out, db->len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 499 "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 505 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 506 master_hash_node);
507 ++total; 507 ++total;
508 ++bktcnt; 508 ++bucket_count;
509 if (db->len - out < 200) 509 if (db->len - out < 200)
510 continue; 510 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 511 out += dump_mle(mle, db->buf + out, db->len - out);
512 } 512 }
513 longest = max(longest, bktcnt); 513 longest = max(longest, bucket_count);
514 bktcnt = 0; 514 bucket_count = 0;
515 } 515 }
516 spin_unlock(&dlm->master_lock); 516 spin_unlock(&dlm->master_lock);
517 517
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
@@ -775,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
775 782
776 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
777 out += snprintf(db->buf + out, db->len - out, 784 out += snprintf(db->buf + out, db->len - out,
778 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor);
779 788
780 /* Thread Pid: xxx Node: xxx State: xxxxx */ 789 /* Thread Pid: xxx Node: xxx State: xxxxx */
781 out += snprintf(db->buf + out, db->len - out, 790 out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef0..58a93b953735 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -693,6 +699,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
693 699
694 dlm_mark_domain_leaving(dlm); 700 dlm_mark_domain_leaving(dlm);
695 dlm_leave_domain(dlm); 701 dlm_leave_domain(dlm);
702 dlm_force_free_mles(dlm);
696 dlm_complete_dlm_shutdown(dlm); 703 dlm_complete_dlm_shutdown(dlm);
697 } 704 }
698 dlm_put(dlm); 705 dlm_put(dlm);
@@ -920,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
920 return 0; 927 return 0;
921} 928}
922 929
930static int dlm_match_regions(struct dlm_ctxt *dlm,
931 struct dlm_query_region *qr)
932{
933 char *local = NULL, *remote = qr->qr_regions;
934 char *l, *r;
935 int localnr, i, j, foundit;
936 int status = 0;
937
938 if (!o2hb_global_heartbeat_active()) {
939 if (qr->qr_numregions) {
940 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
941 "heartbeat enabled but local node %d does not\n",
942 qr->qr_domain, qr->qr_node, dlm->node_num);
943 status = -EINVAL;
944 }
945 goto bail;
946 }
947
948 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
949 mlog(ML_ERROR, "Domain %s: Local node %d has global "
950 "heartbeat enabled but joining node %d does not\n",
951 qr->qr_domain, dlm->node_num, qr->qr_node);
952 status = -EINVAL;
953 goto bail;
954 }
955
956 r = remote;
957 for (i = 0; i < qr->qr_numregions; ++i) {
958 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
959 r += O2HB_MAX_REGION_NAME_LEN;
960 }
961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
963 if (!local) {
964 status = -ENOMEM;
965 goto bail;
966 }
967
968 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
969
970 /* compare local regions with remote */
971 l = local;
972 for (i = 0; i < localnr; ++i) {
973 foundit = 0;
974 r = remote;
975 for (j = 0; j <= qr->qr_numregions; ++j) {
976 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
977 foundit = 1;
978 break;
979 }
980 r += O2HB_MAX_REGION_NAME_LEN;
981 }
982 if (!foundit) {
983 status = -EINVAL;
984 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
985 "in local node %d but not in joining node %d\n",
986 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
987 dlm->node_num, qr->qr_node);
988 goto bail;
989 }
990 l += O2HB_MAX_REGION_NAME_LEN;
991 }
992
993 /* compare remote with local regions */
994 r = remote;
995 for (i = 0; i < qr->qr_numregions; ++i) {
996 foundit = 0;
997 l = local;
998 for (j = 0; j < localnr; ++j) {
999 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
1000 foundit = 1;
1001 break;
1002 }
1003 l += O2HB_MAX_REGION_NAME_LEN;
1004 }
1005 if (!foundit) {
1006 status = -EINVAL;
1007 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1008 "in joining node %d but not in local node %d\n",
1009 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1010 qr->qr_node, dlm->node_num);
1011 goto bail;
1012 }
1013 r += O2HB_MAX_REGION_NAME_LEN;
1014 }
1015
1016bail:
1017 kfree(local);
1018
1019 return status;
1020}
1021
1022static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1023{
1024 struct dlm_query_region *qr = NULL;
1025 int status, ret = 0, i;
1026 char *p;
1027
1028 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1029 goto bail;
1030
1031 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1032 if (!qr) {
1033 ret = -ENOMEM;
1034 mlog_errno(ret);
1035 goto bail;
1036 }
1037
1038 qr->qr_node = dlm->node_num;
1039 qr->qr_namelen = strlen(dlm->name);
1040 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1041 /* if local hb, the numregions will be zero */
1042 if (o2hb_global_heartbeat_active())
1043 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1044 O2NM_MAX_REGIONS);
1045
1046 p = qr->qr_regions;
1047 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1048 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1049
1050 i = -1;
1051 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1052 i + 1)) < O2NM_MAX_NODES) {
1053 if (i == dlm->node_num)
1054 continue;
1055
1056 mlog(0, "Sending regions to node %d\n", i);
1057
1058 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1059 sizeof(struct dlm_query_region),
1060 i, &status);
1061 if (ret >= 0)
1062 ret = status;
1063 if (ret) {
1064 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1065 ret, i);
1066 break;
1067 }
1068 }
1069
1070bail:
1071 kfree(qr);
1072 return ret;
1073}
1074
1075static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1076 void *data, void **ret_data)
1077{
1078 struct dlm_query_region *qr;
1079 struct dlm_ctxt *dlm = NULL;
1080 int status = 0;
1081 int locked = 0;
1082
1083 qr = (struct dlm_query_region *) msg->buf;
1084
1085 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1086 qr->qr_domain);
1087
1088 status = -EINVAL;
1089
1090 spin_lock(&dlm_domain_lock);
1091 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1092 if (!dlm) {
1093 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1094 "before join domain\n", qr->qr_node, qr->qr_domain);
1095 goto bail;
1096 }
1097
1098 spin_lock(&dlm->spinlock);
1099 locked = 1;
1100 if (dlm->joining_node != qr->qr_node) {
1101 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1102 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1103 dlm->joining_node);
1104 goto bail;
1105 }
1106
1107 /* Support for global heartbeat was added in 1.1 */
1108 if (dlm->dlm_locking_proto.pv_major == 1 &&
1109 dlm->dlm_locking_proto.pv_minor == 0) {
1110 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1111 "but active dlm protocol is %d.%d\n", qr->qr_node,
1112 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1113 dlm->dlm_locking_proto.pv_minor);
1114 goto bail;
1115 }
1116
1117 status = dlm_match_regions(dlm, qr);
1118
1119bail:
1120 if (locked)
1121 spin_unlock(&dlm->spinlock);
1122 spin_unlock(&dlm_domain_lock);
1123
1124 return status;
1125}
1126
1127static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1128{
1129 struct o2nm_node *local;
1130 struct dlm_node_info *remote;
1131 int i, j;
1132 int status = 0;
1133
1134 for (j = 0; j < qn->qn_numnodes; ++j)
1135 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1136 &(qn->qn_nodes[j].ni_ipv4_address),
1137 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1138
1139 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1140 local = o2nm_get_node_by_num(i);
1141 remote = NULL;
1142 for (j = 0; j < qn->qn_numnodes; ++j) {
1143 if (qn->qn_nodes[j].ni_nodenum == i) {
1144 remote = &(qn->qn_nodes[j]);
1145 break;
1146 }
1147 }
1148
1149 if (!local && !remote)
1150 continue;
1151
1152 if ((local && !remote) || (!local && remote))
1153 status = -EINVAL;
1154
1155 if (!status &&
1156 ((remote->ni_nodenum != local->nd_num) ||
1157 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1158 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1159 status = -EINVAL;
1160
1161 if (status) {
1162 if (remote && !local)
1163 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1164 "registered in joining node %d but not in "
1165 "local node %d\n", qn->qn_domain,
1166 remote->ni_nodenum,
1167 &(remote->ni_ipv4_address),
1168 ntohs(remote->ni_ipv4_port),
1169 qn->qn_nodenum, dlm->node_num);
1170 if (local && !remote)
1171 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1172 "registered in local node %d but not in "
1173 "joining node %d\n", qn->qn_domain,
1174 local->nd_num, &(local->nd_ipv4_address),
1175 ntohs(local->nd_ipv4_port),
1176 dlm->node_num, qn->qn_nodenum);
1177 BUG_ON((!local && !remote));
1178 }
1179
1180 if (local)
1181 o2nm_node_put(local);
1182 }
1183
1184 return status;
1185}
1186
1187static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1188{
1189 struct dlm_query_nodeinfo *qn = NULL;
1190 struct o2nm_node *node;
1191 int ret = 0, status, count, i;
1192
1193 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1194 goto bail;
1195
1196 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1197 if (!qn) {
1198 ret = -ENOMEM;
1199 mlog_errno(ret);
1200 goto bail;
1201 }
1202
1203 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1204 node = o2nm_get_node_by_num(i);
1205 if (!node)
1206 continue;
1207 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1208 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1209 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1210 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1211 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1212 ++count;
1213 o2nm_node_put(node);
1214 }
1215
1216 qn->qn_nodenum = dlm->node_num;
1217 qn->qn_numnodes = count;
1218 qn->qn_namelen = strlen(dlm->name);
1219 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1220
1221 i = -1;
1222 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1223 i + 1)) < O2NM_MAX_NODES) {
1224 if (i == dlm->node_num)
1225 continue;
1226
1227 mlog(0, "Sending nodeinfo to node %d\n", i);
1228
1229 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1230 qn, sizeof(struct dlm_query_nodeinfo),
1231 i, &status);
1232 if (ret >= 0)
1233 ret = status;
1234 if (ret) {
1235 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1236 break;
1237 }
1238 }
1239
1240bail:
1241 kfree(qn);
1242 return ret;
1243}
1244
1245static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1246 void *data, void **ret_data)
1247{
1248 struct dlm_query_nodeinfo *qn;
1249 struct dlm_ctxt *dlm = NULL;
1250 int locked = 0, status = -EINVAL;
1251
1252 qn = (struct dlm_query_nodeinfo *) msg->buf;
1253
1254 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1255 qn->qn_domain);
1256
1257 spin_lock(&dlm_domain_lock);
1258 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1259 if (!dlm) {
1260 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1261 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1262 goto bail;
1263 }
1264
1265 spin_lock(&dlm->spinlock);
1266 locked = 1;
1267 if (dlm->joining_node != qn->qn_nodenum) {
1268 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1269 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1270 dlm->joining_node);
1271 goto bail;
1272 }
1273
1274 /* Support for node query was added in 1.1 */
1275 if (dlm->dlm_locking_proto.pv_major == 1 &&
1276 dlm->dlm_locking_proto.pv_minor == 0) {
1277 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1278 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1279 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1280 dlm->dlm_locking_proto.pv_minor);
1281 goto bail;
1282 }
1283
1284 status = dlm_match_nodes(dlm, qn);
1285
1286bail:
1287 if (locked)
1288 spin_unlock(&dlm->spinlock);
1289 spin_unlock(&dlm_domain_lock);
1290
1291 return status;
1292}
1293
923static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1294static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
924 void **ret_data) 1295 void **ret_data)
925{ 1296{
@@ -1240,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1240 set_bit(dlm->node_num, dlm->domain_map); 1611 set_bit(dlm->node_num, dlm->domain_map);
1241 spin_unlock(&dlm->spinlock); 1612 spin_unlock(&dlm->spinlock);
1242 1613
1614 /* Support for global heartbeat and node info was added in 1.1 */
1615 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1616 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1617 if (status) {
1618 mlog_errno(status);
1619 goto bail;
1620 }
1621 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1622 if (status) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626 }
1627
1243 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1628 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1244 1629
1245 /* Joined state *must* be set before the joining node 1630 /* Joined state *must* be set before the joining node
@@ -1806,7 +2191,21 @@ static int dlm_register_net_handlers(void)
1806 sizeof(struct dlm_cancel_join), 2191 sizeof(struct dlm_cancel_join),
1807 dlm_cancel_join_handler, 2192 dlm_cancel_join_handler,
1808 NULL, NULL, &dlm_join_handlers); 2193 NULL, NULL, &dlm_join_handlers);
2194 if (status)
2195 goto bail;
2196
2197 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2198 sizeof(struct dlm_query_region),
2199 dlm_query_region_handler,
2200 NULL, NULL, &dlm_join_handlers);
1809 2201
2202 if (status)
2203 goto bail;
2204
2205 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2206 sizeof(struct dlm_query_nodeinfo),
2207 dlm_query_nodeinfo_handler,
2208 NULL, NULL, &dlm_join_handlers);
1810bail: 2209bail:
1811 if (status < 0) 2210 if (status < 0)
1812 dlm_unregister_net_handlers(); 2211 dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88e..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
511 511
512 atomic_dec(&dlm->res_cur_count); 512 atomic_dec(&dlm->res_cur_count);
513 513
514 dlm_put(dlm);
515
516 if (!hlist_unhashed(&res->hash_node) || 514 if (!hlist_unhashed(&res->hash_node) ||
517 !list_empty(&res->granted) || 515 !list_empty(&res->granted) ||
518 !list_empty(&res->converting) || 516 !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
585 res->migration_pending = 0; 583 res->migration_pending = 0;
586 res->inflight_locks = 0; 584 res->inflight_locks = 0;
587 585
588 /* put in dlm_lockres_release */
589 dlm_grab(dlm);
590 res->dlm = dlm; 586 res->dlm = dlm;
591 587
592 kref_init(&res->refs); 588 kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3050 /* check for pre-existing lock */ 3046 /* check for pre-existing lock */
3051 spin_lock(&dlm->spinlock); 3047 spin_lock(&dlm->spinlock);
3052 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3048 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3053 spin_lock(&dlm->master_lock);
3054
3055 if (res) { 3049 if (res) {
3056 spin_lock(&res->spinlock); 3050 spin_lock(&res->spinlock);
3057 if (res->state & DLM_LOCK_RES_RECOVERING) { 3051 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3069 spin_unlock(&res->spinlock); 3063 spin_unlock(&res->spinlock);
3070 } 3064 }
3071 3065
3066 spin_lock(&dlm->master_lock);
3072 /* ignore status. only nonzero status would BUG. */ 3067 /* ignore status. only nonzero status would BUG. */
3073 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3068 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3074 name, namelen, 3069 name, namelen,
3075 migrate->new_master, 3070 migrate->new_master,
3076 migrate->master); 3071 migrate->master);
3077 3072
3078unlock:
3079 spin_unlock(&dlm->master_lock); 3073 spin_unlock(&dlm->master_lock);
3074unlock:
3080 spin_unlock(&dlm->spinlock); 3075 spin_unlock(&dlm->spinlock);
3081 3076
3082 if (oldmle) { 3077 if (oldmle) {
@@ -3438,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3438 wake_up(&res->wq); 3433 wake_up(&res->wq);
3439 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3440} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around we force free them and wake any processes that are waiting
3448 * on the mles
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36d..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1997 struct list_head *queue; 1997 struct list_head *queue;
1998 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1999 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
2000 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
2001 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
2002 mlog(0, 2004 mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2326 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2327 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2328 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2329 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2330 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2331 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2332 "dropping its ref when it died. " 2334 res->lockname.len,
2333 "continue, dropping the flag.\n", 2335 res->lockname.name);
2334 dlm->name, res->lockname.len, 2336 } else
2335 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2336 2338 res);
2337 /* the wake_up for this will happen when the
2338 * RECOVERING flag is dropped later */
2339 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2340 2339
2341 dlm_move_lockres_to_recovery_list(dlm, res);
2342 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2343 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2344 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe5..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
152 spin_unlock(&dlm->spinlock); 160 spin_unlock(&dlm->spinlock);
153} 161}
154 162
155static int dlm_purge_lockres(struct dlm_ctxt *dlm, 163static void dlm_purge_lockres(struct dlm_ctxt *dlm,
156 struct dlm_lock_resource *res) 164 struct dlm_lock_resource *res)
157{ 165{
158 int master; 166 int master;
159 int ret = 0; 167 int ret = 0;
160 168
161 spin_lock(&res->spinlock); 169 assert_spin_locked(&dlm->spinlock);
162 if (!__dlm_lockres_unused(res)) { 170 assert_spin_locked(&res->spinlock);
163 mlog(0, "%s:%.*s: tried to purge but not unused\n",
164 dlm->name, res->lockname.len, res->lockname.name);
165 __dlm_print_one_lock_resource(res);
166 spin_unlock(&res->spinlock);
167 BUG();
168 }
169
170 if (res->state & DLM_LOCK_RES_MIGRATING) {
171 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
172 "being remastered\n", dlm->name, res->lockname.len,
173 res->lockname.name);
174 /* Re-add the lockres to the end of the purge list */
175 if (!list_empty(&res->purge)) {
176 list_del_init(&res->purge);
177 list_add_tail(&res->purge, &dlm->purge_list);
178 }
179 spin_unlock(&res->spinlock);
180 return 0;
181 }
182 171
183 master = (res->owner == dlm->node_num); 172 master = (res->owner == dlm->node_num);
184 173
185 if (!master)
186 res->state |= DLM_LOCK_RES_DROPPING_REF;
187 spin_unlock(&res->spinlock);
188 174
189 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 res->lockname.name, master); 176 res->lockname.name, master);
191 177
192 if (!master) { 178 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF;
193 /* drop spinlock... retake below */ 180 /* drop spinlock... retake below */
181 spin_unlock(&res->spinlock);
194 spin_unlock(&dlm->spinlock); 182 spin_unlock(&dlm->spinlock);
195 183
196 spin_lock(&res->spinlock); 184 spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
208 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
209 dlm->name, res->lockname.len, res->lockname.name, ret); 197 dlm->name, res->lockname.len, res->lockname.name, ret);
210 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock);
211 } 200 }
212 201
213 spin_lock(&res->spinlock);
214 if (!list_empty(&res->purge)) { 202 if (!list_empty(&res->purge)) {
215 mlog(0, "removing lockres %.*s:%p from purgelist, " 203 mlog(0, "removing lockres %.*s:%p from purgelist, "
216 "master = %d\n", res->lockname.len, res->lockname.name, 204 "master = %d\n", res->lockname.len, res->lockname.name,
217 res, master); 205 res, master);
218 list_del_init(&res->purge); 206 list_del_init(&res->purge);
219 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 207 dlm_lockres_put(res);
221 dlm->purge_count--; 208 dlm->purge_count--;
222 } else 209 }
223 spin_unlock(&res->spinlock); 210
211 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res);
215 BUG();
216 }
224 217
225 __dlm_unhash_lockres(res); 218 __dlm_unhash_lockres(res);
226 219
227 /* lockres is not in the hash now. drop the flag and wake up 220 /* lockres is not in the hash now. drop the flag and wake up
228 * any processes waiting in dlm_get_lock_resource. */ 221 * any processes waiting in dlm_get_lock_resource. */
229 if (!master) { 222 if (!master) {
230 spin_lock(&res->spinlock);
231 res->state &= ~DLM_LOCK_RES_DROPPING_REF; 223 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
232 spin_unlock(&res->spinlock); 224 spin_unlock(&res->spinlock);
233 wake_up(&res->wq); 225 wake_up(&res->wq);
234 } 226 } else
235 return 0; 227 spin_unlock(&res->spinlock);
236} 228}
237 229
238static void dlm_run_purge_list(struct dlm_ctxt *dlm, 230static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
251 lockres = list_entry(dlm->purge_list.next, 243 lockres = list_entry(dlm->purge_list.next,
252 struct dlm_lock_resource, purge); 244 struct dlm_lock_resource, purge);
253 245
254 /* Status of the lockres *might* change so double
255 * check. If the lockres is unused, holding the dlm
256 * spinlock will prevent people from getting and more
257 * refs on it -- there's no need to keep the lockres
258 * spinlock. */
259 spin_lock(&lockres->spinlock); 246 spin_lock(&lockres->spinlock);
260 unused = __dlm_lockres_unused(lockres);
261 spin_unlock(&lockres->spinlock);
262
263 if (!unused)
264 continue;
265 247
266 purge_jiffies = lockres->last_used + 248 purge_jiffies = lockres->last_used +
267 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); 249 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
273 * in tail order, we can stop at the first 255 * in tail order, we can stop at the first
274 * unpurgable resource -- anyone added after 256 * unpurgable resource -- anyone added after
275 * him will have a greater last_used value */ 257 * him will have a greater last_used value */
258 spin_unlock(&lockres->spinlock);
276 break; 259 break;
277 } 260 }
278 261
262 /* Status of the lockres *might* change so double
263 * check. If the lockres is unused, holding the dlm
264 * spinlock will prevent people from getting and more
265 * refs on it. */
266 unused = __dlm_lockres_unused(lockres);
267 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or "
270 "being remastered, used %d, state %d\n",
271 dlm->name, lockres->lockname.len,
272 lockres->lockname.name, !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock);
275 continue;
276 }
277
279 dlm_lockres_get(lockres); 278 dlm_lockres_get(lockres);
280 279
281 /* This may drop and reacquire the dlm spinlock if it 280 dlm_purge_lockres(dlm, lockres);
282 * has to do migration. */
283 if (dlm_purge_lockres(dlm, lockres))
284 BUG();
285 281
286 dlm_lockres_put(lockres); 282 dlm_lockres_put(lockres);
287 283
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..a7ebd9d42dc8 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -612,6 +612,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 612 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
615 .llseek = default_llseek,
615}; 616};
616 617
617static const struct inode_operations dlmfs_dir_inode_operations = { 618static const struct inode_operations dlmfs_dir_inode_operations = {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 81296b4e3646..1ca6867935bb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h>
39 40
40#define MLOG_MASK_PREFIX ML_INODE 41#define MLOG_MASK_PREFIX ML_INODE
41#include <cluster/masklog.h> 42#include <cluster/masklog.h>
@@ -63,12 +64,6 @@
63 64
64#include "buffer_head_io.h" 65#include "buffer_head_io.h"
65 66
66static int ocfs2_sync_inode(struct inode *inode)
67{
68 filemap_fdatawrite(inode->i_mapping);
69 return sync_mapping_buffers(inode->i_mapping);
70}
71
72static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
73{ 68{
74 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -179,19 +174,22 @@ static int ocfs2_sync_file(struct file *file, int datasync)
179{ 174{
180 int err = 0; 175 int err = 0;
181 journal_t *journal; 176 journal_t *journal;
182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185 179
186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
187 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
188 182 file->f_path.dentry->d_name.name);
189 err = ocfs2_sync_inode(dentry->d_inode);
190 if (err)
191 goto bail;
192 183
193 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
185 /*
186 * We still have to flush drive's caches to get data to the
187 * platter
188 */
189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
194 goto bail; 191 goto bail;
192 }
195 193
196 journal = osb->journal->j_journal; 194 journal = osb->journal->j_journal;
197 err = jbd2_journal_force_commit(journal); 195 err = jbd2_journal_force_commit(journal);
@@ -361,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
361 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 359 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
362 goto out; 360 goto out;
363 361
364 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 362 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
365 363
366out: 364out:
367 return status; 365 return status;
@@ -774,7 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
774 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 772 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
775 BUG_ON(abs_from & (inode->i_blkbits - 1)); 773 BUG_ON(abs_from & (inode->i_blkbits - 1));
776 774
777 page = grab_cache_page(mapping, index); 775 page = find_or_create_page(mapping, index, GFP_NOFS);
778 if (!page) { 776 if (!page) {
779 ret = -ENOMEM; 777 ret = -ENOMEM;
780 mlog_errno(ret); 778 mlog_errno(ret);
@@ -904,8 +902,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
904 zero_clusters = last_cpos - zero_cpos; 902 zero_clusters = last_cpos - zero_cpos;
905 903
906 if (needs_cow) { 904 if (needs_cow) {
907 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 905 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
908 UINT_MAX); 906 zero_clusters, UINT_MAX);
909 if (rc) { 907 if (rc) {
910 mlog_errno(rc); 908 mlog_errno(rc);
911 goto out; 909 goto out;
@@ -2053,6 +2051,7 @@ out:
2053} 2051}
2054 2052
2055static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2053static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2054 struct file *file,
2056 loff_t pos, size_t count, 2055 loff_t pos, size_t count,
2057 int *meta_level) 2056 int *meta_level)
2058{ 2057{
@@ -2070,7 +2069,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2070 2069
2071 *meta_level = 1; 2070 *meta_level = 1;
2072 2071
2073 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2072 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2074 if (ret) 2073 if (ret)
2075 mlog_errno(ret); 2074 mlog_errno(ret);
2076out: 2075out:
@@ -2078,7 +2077,7 @@ out:
2078 return ret; 2077 return ret;
2079} 2078}
2080 2079
2081static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2080static int ocfs2_prepare_inode_for_write(struct file *file,
2082 loff_t *ppos, 2081 loff_t *ppos,
2083 size_t count, 2082 size_t count,
2084 int appending, 2083 int appending,
@@ -2086,6 +2085,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2086 int *has_refcount) 2085 int *has_refcount)
2087{ 2086{
2088 int ret = 0, meta_level = 0; 2087 int ret = 0, meta_level = 0;
2088 struct dentry *dentry = file->f_path.dentry;
2089 struct inode *inode = dentry->d_inode; 2089 struct inode *inode = dentry->d_inode;
2090 loff_t saved_pos, end; 2090 loff_t saved_pos, end;
2091 2091
@@ -2141,6 +2141,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2141 meta_level = -1; 2141 meta_level = -1;
2142 2142
2143 ret = ocfs2_prepare_inode_for_refcount(inode, 2143 ret = ocfs2_prepare_inode_for_refcount(inode,
2144 file,
2144 saved_pos, 2145 saved_pos,
2145 count, 2146 count,
2146 &meta_level); 2147 &meta_level);
@@ -2223,6 +2224,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2223 struct file *file = iocb->ki_filp; 2224 struct file *file = iocb->ki_filp;
2224 struct inode *inode = file->f_path.dentry->d_inode; 2225 struct inode *inode = file->f_path.dentry->d_inode;
2225 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2226 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2227 int full_coherency = !(osb->s_mount_opt &
2228 OCFS2_MOUNT_COHERENCY_BUFFERED);
2226 2229
2227 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2230 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2228 (unsigned int)nr_segs, 2231 (unsigned int)nr_segs,
@@ -2246,16 +2249,39 @@ relock:
2246 have_alloc_sem = 1; 2249 have_alloc_sem = 1;
2247 } 2250 }
2248 2251
2249 /* concurrent O_DIRECT writes are allowed */ 2252 /*
2250 rw_level = !direct_io; 2253 * Concurrent O_DIRECT writes are allowed with
2254 * mount_option "coherency=buffered".
2255 */
2256 rw_level = (!direct_io || full_coherency);
2257
2251 ret = ocfs2_rw_lock(inode, rw_level); 2258 ret = ocfs2_rw_lock(inode, rw_level);
2252 if (ret < 0) { 2259 if (ret < 0) {
2253 mlog_errno(ret); 2260 mlog_errno(ret);
2254 goto out_sems; 2261 goto out_sems;
2255 } 2262 }
2256 2263
2264 /*
2265 * O_DIRECT writes with "coherency=full" need to take EX cluster
2266 * inode_lock to guarantee coherency.
2267 */
2268 if (direct_io && full_coherency) {
2269 /*
2270 * We need to take and drop the inode lock to force
2271 * other nodes to drop their caches. Buffered I/O
2272 * already does this in write_begin().
2273 */
2274 ret = ocfs2_inode_lock(inode, NULL, 1);
2275 if (ret < 0) {
2276 mlog_errno(ret);
2277 goto out_sems;
2278 }
2279
2280 ocfs2_inode_unlock(inode, 1);
2281 }
2282
2257 can_do_direct = direct_io; 2283 can_do_direct = direct_io;
2258 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2284 ret = ocfs2_prepare_inode_for_write(file, ppos,
2259 iocb->ki_left, appending, 2285 iocb->ki_left, appending,
2260 &can_do_direct, &has_refcount); 2286 &can_do_direct, &has_refcount);
2261 if (ret < 0) { 2287 if (ret < 0) {
@@ -2303,17 +2329,6 @@ relock:
2303 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2329 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2304 ppos, count, ocount); 2330 ppos, count, ocount);
2305 if (written < 0) { 2331 if (written < 0) {
2306 /*
2307 * direct write may have instantiated a few
2308 * blocks outside i_size. Trim these off again.
2309 * Don't need i_size_read because we hold i_mutex.
2310 *
2311 * XXX(truncate): this looks buggy because ocfs2 did not
2312 * actually implement ->truncate. Take a look at
2313 * the new truncate sequence and update this accordingly
2314 */
2315 if (*ppos + count > inode->i_size)
2316 truncate_setsize(inode, inode->i_size);
2317 ret = written; 2332 ret = written;
2318 goto out_dio; 2333 goto out_dio;
2319 } 2334 }
@@ -2329,7 +2344,7 @@ out_dio:
2329 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2344 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2330 2345
2331 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2346 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2332 ((file->f_flags & O_DIRECT) && has_refcount)) { 2347 ((file->f_flags & O_DIRECT) && !direct_io)) {
2333 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2348 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2334 pos + count - 1); 2349 pos + count - 1);
2335 if (ret < 0) 2350 if (ret < 0)
@@ -2385,7 +2400,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2385{ 2400{
2386 int ret; 2401 int ret;
2387 2402
2388 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2403 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2389 sd->total_len, 0, NULL, NULL); 2404 sd->total_len, 0, NULL, NULL);
2390 if (ret < 0) { 2405 if (ret < 0) {
2391 mlog_errno(ret); 2406 mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0492464916b1..f935fd6600dd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
@@ -488,7 +489,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
488 OCFS2_BH_IGNORE_CACHE); 489 OCFS2_BH_IGNORE_CACHE);
489 } else { 490 } else {
490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 491 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
491 if (!status) 492 /*
493 * If buffer is in jbd, then its checksum may not have been
494 * computed as yet.
495 */
496 if (!status && !buffer_jbd(bh))
492 status = ocfs2_validate_inode_block(osb->sb, bh); 497 status = ocfs2_validate_inode_block(osb->sb, bh);
493 } 498 }
494 if (status < 0) { 499 if (status < 0) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
391 * pointer bp stores the base address of a pointers array,
392 * which collects all addresses of separate request.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
412 * ocfs2_info_handle() recevies a large info aggregation, grab and
413 * validate the request count from header, then break it into small
414 * pieces, later specific handlers can handle them one by one.
415 *
416 * Idea here is to make each separate request small enough to ensure
417 * a better backward&forward compatibility, since a small piece of
418 * request will be less likely to be broken if disk layout get changed.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
439 reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
74 spinlock_t j_lock; 75 /* both fields protected by j_lock*/
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f139..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -74,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
74 /* 75 /*
75 * Another node might have truncated while we were waiting on 76 * Another node might have truncated while we were waiting on
76 * cluster locks. 77 * cluster locks.
78 * We don't check size == 0 before the shift. This is borrowed
79 * from do_generic_file_read.
77 */ 80 */
78 last_index = size >> PAGE_CACHE_SHIFT; 81 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
79 if (page->index > last_index) { 82 if (unlikely(!size || page->index > last_index)) {
80 ret = -EINVAL; 83 ret = -EINVAL;
81 goto out; 84 goto out;
82 } 85 }
@@ -107,9 +110,9 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
107 * because the "write" would invalidate their data. 110 * because the "write" would invalidate their data.
108 */ 111 */
109 if (page->index == last_index) 112 if (page->index == last_index)
110 len = size & ~PAGE_CACHE_MASK; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
111 114
112 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
113 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
114 if (ret) { 117 if (ret) {
115 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -157,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
157 */ 160 */
158 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
159 162
160 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
161 164
162 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
163 166
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f7..e7bde21149ae 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
171 ret = ERR_PTR(status); 171 ret = ERR_PTR(status);
172 goto bail_unlock; 172 goto bail_unlock;
173 } 173 }
174 } 174 } else
175 ocfs2_dentry_attach_gen(dentry);
175 176
176bail_unlock: 177bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 178 /* Don't drop the cluster lock until *after* the d_add --
@@ -472,32 +473,23 @@ leave:
472 return status; 473 return status;
473} 474}
474 475
475static int ocfs2_mknod_locked(struct ocfs2_super *osb, 476static int __ocfs2_mknod_locked(struct inode *dir,
476 struct inode *dir, 477 struct inode *inode,
477 struct inode *inode, 478 dev_t dev,
478 dev_t dev, 479 struct buffer_head **new_fe_bh,
479 struct buffer_head **new_fe_bh, 480 struct buffer_head *parent_fe_bh,
480 struct buffer_head *parent_fe_bh, 481 handle_t *handle,
481 handle_t *handle, 482 struct ocfs2_alloc_context *inode_ac,
482 struct ocfs2_alloc_context *inode_ac) 483 u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
483{ 484{
484 int status = 0; 485 int status = 0;
486 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
485 struct ocfs2_dinode *fe = NULL; 487 struct ocfs2_dinode *fe = NULL;
486 struct ocfs2_extent_list *fel; 488 struct ocfs2_extent_list *fel;
487 u64 suballoc_loc, fe_blkno = 0;
488 u16 suballoc_bit;
489 u16 feat; 489 u16 feat;
490 490
491 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
492 492
493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
496 if (status < 0) {
497 mlog_errno(status);
498 goto leave;
499 }
500
501 /* populate as many fields early on as possible - many of 493 /* populate as many fields early on as possible - many of
502 * these are used by the support functions here and in 494 * these are used by the support functions here and in
503 * callers. */ 495 * callers. */
@@ -591,6 +583,34 @@ leave:
591 return status; 583 return status;
592} 584}
593 585
586static int ocfs2_mknod_locked(struct ocfs2_super *osb,
587 struct inode *dir,
588 struct inode *inode,
589 dev_t dev,
590 struct buffer_head **new_fe_bh,
591 struct buffer_head *parent_fe_bh,
592 handle_t *handle,
593 struct ocfs2_alloc_context *inode_ac)
594{
595 int status = 0;
596 u64 suballoc_loc, fe_blkno = 0;
597 u16 suballoc_bit;
598
599 *new_fe_bh = NULL;
600
601 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
602 inode_ac, &suballoc_loc,
603 &suballoc_bit, &fe_blkno);
604 if (status < 0) {
605 mlog_errno(status);
606 return status;
607 }
608
609 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
610 parent_fe_bh, handle, inode_ac,
611 fe_blkno, suballoc_loc, suballoc_bit);
612}
613
594static int ocfs2_mkdir(struct inode *dir, 614static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry, 615 struct dentry *dentry,
596 int mode) 616 int mode)
@@ -1852,61 +1872,117 @@ bail:
1852 return status; 1872 return status;
1853} 1873}
1854 1874
1855static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1875static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
1856 struct inode **ret_orphan_dir, 1876 struct inode **ret_orphan_dir,
1857 u64 blkno, 1877 struct buffer_head **ret_orphan_dir_bh)
1858 char *name,
1859 struct ocfs2_dir_lookup_result *lookup)
1860{ 1878{
1861 struct inode *orphan_dir_inode; 1879 struct inode *orphan_dir_inode;
1862 struct buffer_head *orphan_dir_bh = NULL; 1880 struct buffer_head *orphan_dir_bh = NULL;
1863 int status = 0; 1881 int ret = 0;
1864
1865 status = ocfs2_blkno_stringify(blkno, name);
1866 if (status < 0) {
1867 mlog_errno(status);
1868 return status;
1869 }
1870 1882
1871 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1883 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1872 ORPHAN_DIR_SYSTEM_INODE, 1884 ORPHAN_DIR_SYSTEM_INODE,
1873 osb->slot_num); 1885 osb->slot_num);
1874 if (!orphan_dir_inode) { 1886 if (!orphan_dir_inode) {
1875 status = -ENOENT; 1887 ret = -ENOENT;
1876 mlog_errno(status); 1888 mlog_errno(ret);
1877 return status; 1889 return ret;
1878 } 1890 }
1879 1891
1880 mutex_lock(&orphan_dir_inode->i_mutex); 1892 mutex_lock(&orphan_dir_inode->i_mutex);
1881 1893
1882 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1894 ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1883 if (status < 0) { 1895 if (ret < 0) {
1884 mlog_errno(status); 1896 mutex_unlock(&orphan_dir_inode->i_mutex);
1885 goto leave; 1897 iput(orphan_dir_inode);
1898
1899 mlog_errno(ret);
1900 return ret;
1886 } 1901 }
1887 1902
1888 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1903 *ret_orphan_dir = orphan_dir_inode;
1889 orphan_dir_bh, name, 1904 *ret_orphan_dir_bh = orphan_dir_bh;
1890 OCFS2_ORPHAN_NAMELEN, lookup);
1891 if (status < 0) {
1892 ocfs2_inode_unlock(orphan_dir_inode, 1);
1893 1905
1894 mlog_errno(status); 1906 return 0;
1895 goto leave; 1907}
1908
1909static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
1910 struct buffer_head *orphan_dir_bh,
1911 u64 blkno,
1912 char *name,
1913 struct ocfs2_dir_lookup_result *lookup)
1914{
1915 int ret;
1916 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
1917
1918 ret = ocfs2_blkno_stringify(blkno, name);
1919 if (ret < 0) {
1920 mlog_errno(ret);
1921 return ret;
1922 }
1923
1924 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1925 orphan_dir_bh, name,
1926 OCFS2_ORPHAN_NAMELEN, lookup);
1927 if (ret < 0) {
1928 mlog_errno(ret);
1929 return ret;
1930 }
1931
1932 return 0;
1933}
1934
1935/**
1936 * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
1937 * insertion of an orphan.
1938 * @osb: ocfs2 file system
1939 * @ret_orphan_dir: Orphan dir inode - returned locked!
1940 * @blkno: Actual block number of the inode to be inserted into orphan dir.
1941 * @lookup: dir lookup result, to be passed back into functions like
1942 * ocfs2_orphan_add
1943 *
1944 * Returns zero on success and the ret_orphan_dir, name and lookup
1945 * fields will be populated.
1946 *
1947 * Returns non-zero on failure.
1948 */
1949static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1950 struct inode **ret_orphan_dir,
1951 u64 blkno,
1952 char *name,
1953 struct ocfs2_dir_lookup_result *lookup)
1954{
1955 struct inode *orphan_dir_inode = NULL;
1956 struct buffer_head *orphan_dir_bh = NULL;
1957 int ret = 0;
1958
1959 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
1960 &orphan_dir_bh);
1961 if (ret < 0) {
1962 mlog_errno(ret);
1963 return ret;
1964 }
1965
1966 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
1967 blkno, name, lookup);
1968 if (ret < 0) {
1969 mlog_errno(ret);
1970 goto out;
1896 } 1971 }
1897 1972
1898 *ret_orphan_dir = orphan_dir_inode; 1973 *ret_orphan_dir = orphan_dir_inode;
1899 1974
1900leave: 1975out:
1901 if (status) { 1976 brelse(orphan_dir_bh);
1977
1978 if (ret) {
1979 ocfs2_inode_unlock(orphan_dir_inode, 1);
1902 mutex_unlock(&orphan_dir_inode->i_mutex); 1980 mutex_unlock(&orphan_dir_inode->i_mutex);
1903 iput(orphan_dir_inode); 1981 iput(orphan_dir_inode);
1904 } 1982 }
1905 1983
1906 brelse(orphan_dir_bh); 1984 mlog_exit(ret);
1907 1985 return ret;
1908 mlog_exit(status);
1909 return status;
1910} 1986}
1911 1987
1912static int ocfs2_orphan_add(struct ocfs2_super *osb, 1988static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2129,99 @@ leave:
2053 return status; 2129 return status;
2054} 2130}
2055 2131
2132/**
2133 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly
2134 * allocated file. This is different from the typical 'add to orphan dir'
2135 * operation in that the inode does not yet exist. This is a problem because
2136 * the orphan dir stringifies the inode block number to come up with it's
2137 * dirent. Obviously if the inode does not yet exist we have a chicken and egg
2138 * problem. This function works around it by calling deeper into the orphan
2139 * and suballoc code than other callers. Use this only by necessity.
2140 * @dir: The directory which this inode will ultimately wind up under - not the
2141 * orphan dir!
2142 * @dir_bh: buffer_head the @dir inode block
2143 * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
2144 * with the string to be used for orphan dirent. Pass back to the orphan dir
2145 * code.
2146 * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
2147 * dir code.
2148 * @ret_di_blkno: block number where the new inode will be allocated.
2149 * @orphan_insert: Dir insert context to be passed back into orphan dir code.
2150 * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
2151 *
2152 * Returns zero on success and the ret_orphan_dir, name and lookup
2153 * fields will be populated.
2154 *
2155 * Returns non-zero on failure.
2156 */
2157static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2158 struct buffer_head *dir_bh,
2159 char *orphan_name,
2160 struct inode **ret_orphan_dir,
2161 u64 *ret_di_blkno,
2162 struct ocfs2_dir_lookup_result *orphan_insert,
2163 struct ocfs2_alloc_context **ret_inode_ac)
2164{
2165 int ret;
2166 u64 di_blkno;
2167 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2168 struct inode *orphan_dir = NULL;
2169 struct buffer_head *orphan_dir_bh = NULL;
2170 struct ocfs2_alloc_context *inode_ac = NULL;
2171
2172 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
2173 if (ret < 0) {
2174 mlog_errno(ret);
2175 return ret;
2176 }
2177
2178 /* reserve an inode spot */
2179 ret = ocfs2_reserve_new_inode(osb, &inode_ac);
2180 if (ret < 0) {
2181 if (ret != -ENOSPC)
2182 mlog_errno(ret);
2183 goto out;
2184 }
2185
2186 ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
2187 &di_blkno);
2188 if (ret) {
2189 mlog_errno(ret);
2190 goto out;
2191 }
2192
2193 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2194 di_blkno, orphan_name, orphan_insert);
2195 if (ret < 0) {
2196 mlog_errno(ret);
2197 goto out;
2198 }
2199
2200out:
2201 if (ret == 0) {
2202 *ret_orphan_dir = orphan_dir;
2203 *ret_di_blkno = di_blkno;
2204 *ret_inode_ac = inode_ac;
2205 /*
2206 * orphan_name and orphan_insert are already up to
2207 * date via prepare_orphan_dir
2208 */
2209 } else {
2210 /* Unroll reserve_new_inode* */
2211 if (inode_ac)
2212 ocfs2_free_alloc_context(inode_ac);
2213
2214 /* Unroll orphan dir locking */
2215 mutex_unlock(&orphan_dir->i_mutex);
2216 ocfs2_inode_unlock(orphan_dir, 1);
2217 iput(orphan_dir);
2218 }
2219
2220 brelse(orphan_dir_bh);
2221
2222 return 0;
2223}
2224
2056int ocfs2_create_inode_in_orphan(struct inode *dir, 2225int ocfs2_create_inode_in_orphan(struct inode *dir,
2057 int mode, 2226 int mode,
2058 struct inode **new_inode) 2227 struct inode **new_inode)
@@ -2068,6 +2237,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2068 struct buffer_head *new_di_bh = NULL; 2237 struct buffer_head *new_di_bh = NULL;
2069 struct ocfs2_alloc_context *inode_ac = NULL; 2238 struct ocfs2_alloc_context *inode_ac = NULL;
2070 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 2239 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2240 u64 uninitialized_var(di_blkno), suballoc_loc;
2241 u16 suballoc_bit;
2071 2242
2072 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2243 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2073 if (status < 0) { 2244 if (status < 0) {
@@ -2076,20 +2247,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2076 return status; 2247 return status;
2077 } 2248 }
2078 2249
2079 /* 2250 status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
2080 * We give the orphan dir the root blkno to fake an orphan name, 2251 orphan_name, &orphan_dir,
2081 * and allocate enough space for our insertion. 2252 &di_blkno, &orphan_insert, &inode_ac);
2082 */
2083 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2084 osb->root_blkno,
2085 orphan_name, &orphan_insert);
2086 if (status < 0) {
2087 mlog_errno(status);
2088 goto leave;
2089 }
2090
2091 /* reserve an inode spot */
2092 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2093 if (status < 0) { 2253 if (status < 0) {
2094 if (status != -ENOSPC) 2254 if (status != -ENOSPC)
2095 mlog_errno(status); 2255 mlog_errno(status);
@@ -2116,17 +2276,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2116 goto leave; 2276 goto leave;
2117 did_quota_inode = 1; 2277 did_quota_inode = 1;
2118 2278
2119 inode->i_nlink = 0; 2279 status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
2120 /* do the real work now. */ 2280 &suballoc_loc,
2121 status = ocfs2_mknod_locked(osb, dir, inode, 2281 &suballoc_bit, di_blkno);
2122 0, &new_di_bh, parent_di_bh, handle,
2123 inode_ac);
2124 if (status < 0) { 2282 if (status < 0) {
2125 mlog_errno(status); 2283 mlog_errno(status);
2126 goto leave; 2284 goto leave;
2127 } 2285 }
2128 2286
2129 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); 2287 inode->i_nlink = 0;
2288 /* do the real work now. */
2289 status = __ocfs2_mknod_locked(dir, inode,
2290 0, &new_di_bh, parent_di_bh, handle,
2291 inode_ac, di_blkno, suballoc_loc,
2292 suballoc_bit);
2130 if (status < 0) { 2293 if (status < 0) {
2131 mlog_errno(status); 2294 mlog_errno(status);
2132 goto leave; 2295 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..d8408217e3bd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 unsigned char l_level;
163
164 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type;
165 166
166 /* used from AST/BAST funcs. */ 167 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 168 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 169 unsigned char l_action;
169 int l_requested; 170 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 171 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 615 return ret;
602} 616}
603 617
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 618static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 619{
606 return (osb->s_feature_incompat & 620 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 621 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
622 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
623}
624
625static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
626{
627 if (ocfs2_clusterinfo_valid(osb) &&
628 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
629 OCFS2_STACK_LABEL_LEN))
630 return 1;
631 return 0;
632}
633
634static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
635{
636 if (ocfs2_clusterinfo_valid(osb) &&
637 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
638 OCFS2_STACK_LABEL_LEN))
639 return 1;
640 return 0;
641}
642
643static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
644{
645 return ocfs2_o2cb_stack(osb) &&
646 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 647}
609 648
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 649static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..c2e4f8222e2f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
174 * Incompat bit to indicate useable clusterinfo with stackflags for all
175 * cluster stacks (userspace adnd o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -235,18 +243,31 @@
235#define OCFS2_HAS_REFCOUNT_FL (0x0010) 243#define OCFS2_HAS_REFCOUNT_FL (0x0010)
236 244
237/* Inode attributes, keep in sync with EXT2 */ 245/* Inode attributes, keep in sync with EXT2 */
238#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 246#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
239#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 247#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
240#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 248#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
241#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 249#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
242#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 250#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
243#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 251#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
244#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 252#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
245#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 253#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
246#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 254/* Reserved for compression usage... */
247 255#define OCFS2_DIRTY_FL FS_DIRTY_FL
248#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 256#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
249#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 257#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
258#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
259/* End compression flags --- maybe not all used */
260#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
261#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
262#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
263#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
264#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
265#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
266#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
267#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
268
269#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
270#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
250 271
251/* 272/*
252 * Extent record flags (e_node.leaf.flags) 273 * Extent record flags (e_node.leaf.flags)
@@ -279,10 +300,13 @@
279#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
280#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
281 302
282/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
283#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
284#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
285 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
286/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
288 312
@@ -292,6 +316,11 @@
292 */ 316 */
293#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
294 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
295struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
296 char *si_name; 325 char *si_name;
297 int si_iflags; 326 int si_iflags;
@@ -309,6 +338,7 @@ enum {
309 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
310 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
311#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
312 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
313 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
314 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +347,12 @@ enum {
317 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
318 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
319 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
320 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
321}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
322 356
323static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
324 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -347,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
347/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
348#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
349#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
350 385
351/* 386/*
352 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -553,9 +588,21 @@ struct ocfs2_slot_map_extended {
553 */ 588 */
554}; 589};
555 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
556struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
557/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
558 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
559/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
560/*18*/ 607/*18*/
561}; 608};
@@ -592,9 +639,9 @@ struct ocfs2_super_block {
592 * group header */ 639 * group header */
593/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
594/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
595/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
596 stack. Only valid 643 userspace or clusterinfo
597 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
598/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
599 for this fs*/ 646 for this fs*/
600 __le16 s_reserved0; 647 __le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
79/* Following definitions dedicated for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
87 * Always try to separate info request into small pieces to
88 * guarantee the backward&forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9c..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2436,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2437 le32_to_cpu(rec.r_clusters)) - cpos; 2438 le32_to_cpu(rec.r_clusters)) - cpos;
2438 /* 2439 /*
2439 * If the refcount rec already exist, cool. We just need
2440 * to check whether there is a split. Otherwise we just need
2441 * to increase the refcount.
2442 * If we will insert one, increases recs_add.
2443 *
2444 * We record all the records which will be inserted to the 2440 * We record all the records which will be inserted to the
2445 * same refcount block, so that we can tell exactly whether 2441 * same refcount block, so that we can tell exactly whether
2446 * we need a new refcount block or not. 2442 * we need a new refcount block or not.
2443 *
2444 * If we will insert a new one, this is easy and only happens
2445 * during adding refcounted flag to the extent, so we don't
2446 * have a chance of spliting. We just need one record.
2447 *
2448 * If the refcount rec already exists, that would be a little
2449 * complicated. we may have to:
2450 * 1) split at the beginning if the start pos isn't aligned.
2451 * we need 1 more record in this case.
2452 * 2) split int the end if the end pos isn't aligned.
2453 * we need 1 more record in this case.
2454 * 3) split in the middle because of file system fragmentation.
2455 * we need 2 more records in this case(we can't detect this
2456 * beforehand, so always think of the worst case).
2447 */ 2457 */
2448 if (rec.r_refcount) { 2458 if (rec.r_refcount) {
2459 recs_add += 2;
2449 /* Check whether we need a split at the beginning. */ 2460 /* Check whether we need a split at the beginning. */
2450 if (cpos == start_cpos && 2461 if (cpos == start_cpos &&
2451 cpos != le64_to_cpu(rec.r_cpos)) 2462 cpos != le64_to_cpu(rec.r_cpos))
@@ -2922,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2922 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2923 struct page *page; 2934 struct page *page;
2924 pgoff_t page_index; 2935 pgoff_t page_index;
2925 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2926 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2927 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2928 2939
2929 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2930 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2931 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2932 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2933 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2934 /* 2948 /*
@@ -2950,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2950 if (map_end & (PAGE_CACHE_SIZE - 1)) 2964 if (map_end & (PAGE_CACHE_SIZE - 1))
2951 to = map_end & (PAGE_CACHE_SIZE - 1); 2965 to = map_end & (PAGE_CACHE_SIZE - 1);
2952 2966
2953 page = grab_cache_page(mapping, page_index); 2967 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2954 2968
2955 /* 2969 /*
2956 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page 2970 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2959,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2959 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2960 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2961 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2962 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2963 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2964 if (ret) { 2986 if (ret) {
@@ -3169,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3169 if (map_end > end) 3191 if (map_end > end)
3170 map_end = end; 3192 map_end = end;
3171 3193
3172 page = grab_cache_page(context->inode->i_mapping, page_index); 3194 page = find_or_create_page(context->inode->i_mapping,
3195 page_index, GFP_NOFS);
3173 BUG_ON(!page); 3196 BUG_ON(!page);
3174 3197
3175 wait_on_page_writeback(page); 3198 wait_on_page_writeback(page);
@@ -3398,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3398 return ret; 3421 return ret;
3399} 3422}
3400 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
3401/* 3446/*
3402 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3403 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3404 * unrefcounted extent. 3449 * unrefcounted extent.
3405 */ 3450 */
3406static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3407 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3408 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3409{ 3455{
@@ -3432,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3432 3478
3433 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3434 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3435 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3436 if (!context) { 3484 if (!context) {
3437 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3453,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3453 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3454 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3455 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3456 3505
3457 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3458 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3481,6 +3530,7 @@ out:
3481 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3482 */ 3531 */
3483int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3484 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3485 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3486{ 3536{
@@ -3500,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3500 num_clusters = write_len; 3550 num_clusters = write_len;
3501 3551
3502 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3503 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3504 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3505 if (ret) { 3555 if (ret) {
3506 mlog_errno(ret); 3556 mlog_errno(ret);
@@ -4190,8 +4240,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4190 goto out; 4240 goto out;
4191 } 4241 }
4192 4242
4193 mutex_lock(&new_inode->i_mutex); 4243 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4194 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4244 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4245 OI_LS_REFLINK_TARGET);
4195 if (ret) { 4246 if (ret) {
4196 mlog_errno(ret); 4247 mlog_errno(ret);
4197 goto out_unlock; 4248 goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b80..3e78db361bc7 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv, 732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen) 733 int *cstart, int *clen)
734{ 734{
735 unsigned int wanted = *clen;
736
737 if (resv == NULL || ocfs2_resmap_disabled(resmap)) 735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
738 return -ENOSPC; 736 return -ENOSPC;
739 737
740 spin_lock(&resv_lock); 738 spin_lock(&resv_lock);
741 739
742 /*
743 * We don't want to over-allocate for temporary
744 * windows. Otherwise, we run the risk of fragmenting the
745 * allocation space.
746 */
747 wanted = ocfs2_resv_window_bits(resmap, resv);
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 if (ocfs2_resv_empty(resv)) { 740 if (ocfs2_resv_empty(resv)) {
752 mlog(0, "empty reservation, find new window\n"); 741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
753 747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
754 /* 752 /*
755 * Try to get a window here. If it works, we must fall 753 * Try to get a window here. If it works, we must fall
756 * through and test the bitmap . This avoids some 754 * through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..252e7c82f929 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353f..5fed60de7630 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set 57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is 58 to 0 when a block group is
59 contiguous. */ 59 contiguous. */
60 u64 sr_bg_stable_blkno; /*
61 * Doesn't change, always
62 * set to target block
63 * group descriptor
64 * block.
65 */
60 u64 sr_blkno; /* The first allocated block */ 66 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */ 67 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */ 68 unsigned int sr_bits; /* How many bits we claimed */
63}; 69};
64 70
71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72{
73 if (res->sr_blkno == 0)
74 return 0;
75
76 if (res->sr_bg_blkno)
77 return res->sr_bg_blkno;
78
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80}
81
65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
138 brelse(ac->ac_bh); 155 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 156 ac->ac_bh = NULL;
140 ac->ac_resv = NULL; 157 ac->ac_resv = NULL;
158 if (ac->ac_find_loc_priv) {
159 kfree(ac->ac_find_loc_priv);
160 ac->ac_find_loc_priv = NULL;
161 }
141} 162}
142 163
143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 164void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, 357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg, 358 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl, 359 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters) 360 u64 p_blkno, unsigned int clusters)
340{ 361{
341 struct ocfs2_extent_list *el = &bg->bg_list; 362 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec; 363 struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
348 rec->e_blkno = cpu_to_le64(p_blkno); 369 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / 370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc)); 371 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters); 372 rec->e_leaf_clusters = cpu_to_le16(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); 373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count, 374 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc)); 375 clusters * le16_to_cpu(cl->cl_bpc));
@@ -1359,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1359 } 1380 }
1360 1381
1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1362 while(num_bits--) 1391 while(num_bits--)
1363 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1364 1393
@@ -1678,6 +1707,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1678 if (!ret) 1707 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res); 1708 ocfs2_bg_discontig_fix_result(ac, gd, res);
1680 1709
1710 /*
1711 * sr_bg_blkno might have been changed by
1712 * ocfs2_bg_discontig_fix_result
1713 */
1714 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1715
1716 if (ac->ac_find_loc_only)
1717 goto out_loc_only;
1718
1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1719 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1682 res->sr_bits, 1720 res->sr_bits,
1683 le16_to_cpu(gd->bg_chain)); 1721 le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1729,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1691 if (ret < 0) 1729 if (ret < 0)
1692 mlog_errno(ret); 1730 mlog_errno(ret);
1693 1731
1732out_loc_only:
1694 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1733 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1695 1734
1696out: 1735out:
@@ -1708,7 +1747,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1708{ 1747{
1709 int status; 1748 int status;
1710 u16 chain; 1749 u16 chain;
1711 u32 tmp_used;
1712 u64 next_group; 1750 u64 next_group;
1713 struct inode *alloc_inode = ac->ac_inode; 1751 struct inode *alloc_inode = ac->ac_inode;
1714 struct buffer_head *group_bh = NULL; 1752 struct buffer_head *group_bh = NULL;
@@ -1770,6 +1808,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1770 if (!status) 1808 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res); 1809 ocfs2_bg_discontig_fix_result(ac, bg, res);
1772 1810
1811 /*
1812 * sr_bg_blkno might have been changed by
1813 * ocfs2_bg_discontig_fix_result
1814 */
1815 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1773 1816
1774 /* 1817 /*
1775 * Keep track of previous block descriptor read. When 1818 * Keep track of previous block descriptor read. When
@@ -1796,22 +1839,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1796 } 1839 }
1797 } 1840 }
1798 1841
1799 /* Ok, claim our bits now: set the info on dinode, chainlist 1842 if (ac->ac_find_loc_only)
1800 * and then the group */ 1843 goto out_loc_only;
1801 status = ocfs2_journal_access_di(handle, 1844
1802 INODE_CACHE(alloc_inode), 1845 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1803 ac->ac_bh, 1846 ac->ac_bh, res->sr_bits,
1804 OCFS2_JOURNAL_ACCESS_WRITE); 1847 chain);
1805 if (status < 0) { 1848 if (status) {
1806 mlog_errno(status); 1849 mlog_errno(status);
1807 goto bail; 1850 goto bail;
1808 } 1851 }
1809 1852
1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1814
1815 status = ocfs2_block_group_set_bits(handle, 1853 status = ocfs2_block_group_set_bits(handle,
1816 alloc_inode, 1854 alloc_inode,
1817 bg, 1855 bg,
@@ -1826,6 +1864,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 1864 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1827 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1865 (unsigned long long)le64_to_cpu(fe->i_blkno));
1828 1866
1867out_loc_only:
1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1868 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1830bail: 1869bail:
1831 brelse(group_bh); 1870 brelse(group_bh);
@@ -1845,6 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1845 int status; 1884 int status;
1846 u16 victim, i; 1885 u16 victim, i;
1847 u16 bits_left = 0; 1886 u16 bits_left = 0;
1887 u64 hint = ac->ac_last_group;
1848 struct ocfs2_chain_list *cl; 1888 struct ocfs2_chain_list *cl;
1849 struct ocfs2_dinode *fe; 1889 struct ocfs2_dinode *fe;
1850 1890
@@ -1872,7 +1912,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1872 goto bail; 1912 goto bail;
1873 } 1913 }
1874 1914
1875 res->sr_bg_blkno = ac->ac_last_group; 1915 res->sr_bg_blkno = hint;
1876 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1877 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1878 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
@@ -1896,8 +1936,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1896 1936
1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1937 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898 res, &bits_left); 1938 res, &bits_left);
1899 if (!status) 1939 if (!status) {
1940 hint = ocfs2_group_from_res(res);
1900 goto set_hint; 1941 goto set_hint;
1942 }
1901 if (status < 0 && status != -ENOSPC) { 1943 if (status < 0 && status != -ENOSPC) {
1902 mlog_errno(status); 1944 mlog_errno(status);
1903 goto bail; 1945 goto bail;
@@ -1920,8 +1962,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1920 ac->ac_chain = i; 1962 ac->ac_chain = i;
1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1963 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1922 res, &bits_left); 1964 res, &bits_left);
1923 if (!status) 1965 if (!status) {
1966 hint = ocfs2_group_from_res(res);
1924 break; 1967 break;
1968 }
1925 if (status < 0 && status != -ENOSPC) { 1969 if (status < 0 && status != -ENOSPC) {
1926 mlog_errno(status); 1970 mlog_errno(status);
1927 goto bail; 1971 goto bail;
@@ -1936,7 +1980,7 @@ set_hint:
1936 if (bits_left < min_bits) 1980 if (bits_left < min_bits)
1937 ac->ac_last_group = 0; 1981 ac->ac_last_group = 0;
1938 else 1982 else
1939 ac->ac_last_group = res->sr_bg_blkno; 1983 ac->ac_last_group = hint;
1940 } 1984 }
1941 1985
1942bail: 1986bail:
@@ -2016,6 +2060,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2060 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2017} 2061}
2018 2062
2063int ocfs2_find_new_inode_loc(struct inode *dir,
2064 struct buffer_head *parent_fe_bh,
2065 struct ocfs2_alloc_context *ac,
2066 u64 *fe_blkno)
2067{
2068 int ret;
2069 handle_t *handle = NULL;
2070 struct ocfs2_suballoc_result *res;
2071
2072 BUG_ON(!ac);
2073 BUG_ON(ac->ac_bits_given != 0);
2074 BUG_ON(ac->ac_bits_wanted != 1);
2075 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2076
2077 res = kzalloc(sizeof(*res), GFP_NOFS);
2078 if (res == NULL) {
2079 ret = -ENOMEM;
2080 mlog_errno(ret);
2081 goto out;
2082 }
2083
2084 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2085
2086 /*
2087 * The handle started here is for chain relink. Alternatively,
2088 * we could just disable relink for these calls.
2089 */
2090 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2091 if (IS_ERR(handle)) {
2092 ret = PTR_ERR(handle);
2093 handle = NULL;
2094 mlog_errno(ret);
2095 goto out;
2096 }
2097
2098 /*
2099 * This will instruct ocfs2_claim_suballoc_bits and
2100 * ocfs2_search_one_group to search but save actual allocation
2101 * for later.
2102 */
2103 ac->ac_find_loc_only = 1;
2104
2105 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2106 if (ret < 0) {
2107 mlog_errno(ret);
2108 goto out;
2109 }
2110
2111 ac->ac_find_loc_priv = res;
2112 *fe_blkno = res->sr_blkno;
2113
2114out:
2115 if (handle)
2116 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2117
2118 if (ret)
2119 kfree(res);
2120
2121 return ret;
2122}
2123
2124int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2125 struct inode *dir,
2126 struct ocfs2_alloc_context *ac,
2127 u64 *suballoc_loc,
2128 u16 *suballoc_bit,
2129 u64 di_blkno)
2130{
2131 int ret;
2132 u16 chain;
2133 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2134 struct buffer_head *bg_bh = NULL;
2135 struct ocfs2_group_desc *bg;
2136 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2137
2138 /*
2139 * Since di_blkno is being passed back in, we check for any
2140 * inconsistencies which may have happened between
2141 * calls. These are code bugs as di_blkno is not expected to
2142 * change once returned from ocfs2_find_new_inode_loc()
2143 */
2144 BUG_ON(res->sr_blkno != di_blkno);
2145
2146 ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2147 res->sr_bg_stable_blkno, &bg_bh);
2148 if (ret) {
2149 mlog_errno(ret);
2150 goto out;
2151 }
2152
2153 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2154 chain = le16_to_cpu(bg->bg_chain);
2155
2156 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2157 ac->ac_bh, res->sr_bits,
2158 chain);
2159 if (ret) {
2160 mlog_errno(ret);
2161 goto out;
2162 }
2163
2164 ret = ocfs2_block_group_set_bits(handle,
2165 ac->ac_inode,
2166 bg,
2167 bg_bh,
2168 res->sr_bit_offset,
2169 res->sr_bits);
2170 if (ret < 0) {
2171 mlog_errno(ret);
2172 goto out;
2173 }
2174
2175 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2176 (unsigned long long)di_blkno);
2177
2178 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179
2180 BUG_ON(res->sr_bits != 1);
2181
2182 *suballoc_loc = res->sr_bg_blkno;
2183 *suballoc_bit = res->sr_bit_offset;
2184 ac->ac_bits_given++;
2185 ocfs2_save_inode_ac_group(dir, ac);
2186
2187out:
2188 brelse(bg_bh);
2189
2190 return ret;
2191}
2192
2019int ocfs2_claim_new_inode(handle_t *handle, 2193int ocfs2_claim_new_inode(handle_t *handle,
2020 struct inode *dir, 2194 struct inode *dir,
2021 struct buffer_head *parent_fe_bh, 2195 struct buffer_head *parent_fe_bh,
@@ -2253,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2253 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2254 } 2428 }
2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2256 2438
2257 if (undo_fn) 2439 if (undo_fn)
2258 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
@@ -2567,7 +2749,8 @@ out:
2567 * suballoc_bit. 2749 * suballoc_bit.
2568 */ 2750 */
2569static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2751static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2570 u16 *suballoc_slot, u16 *suballoc_bit) 2752 u16 *suballoc_slot, u64 *group_blkno,
2753 u16 *suballoc_bit)
2571{ 2754{
2572 int status; 2755 int status;
2573 struct buffer_head *inode_bh = NULL; 2756 struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2787,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2604 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2787 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2605 if (suballoc_bit) 2788 if (suballoc_bit)
2606 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2789 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2790 if (group_blkno)
2791 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2607 2792
2608bail: 2793bail:
2609 brelse(inode_bh); 2794 brelse(inode_bh);
@@ -2621,7 +2806,8 @@ bail:
2621 */ 2806 */
2622static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2807static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2623 struct inode *suballoc, 2808 struct inode *suballoc,
2624 struct buffer_head *alloc_bh, u64 blkno, 2809 struct buffer_head *alloc_bh,
2810 u64 group_blkno, u64 blkno,
2625 u16 bit, int *res) 2811 u16 bit, int *res)
2626{ 2812{
2627 struct ocfs2_dinode *alloc_di; 2813 struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2828,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2642 goto bail; 2828 goto bail;
2643 } 2829 }
2644 2830
2645 if (alloc_di->i_suballoc_loc) 2831 bg_blkno = group_blkno ? group_blkno :
2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); 2832 ocfs2_which_suballoc_group(blkno, bit);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, 2833 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2650 &group_bh); 2834 &group_bh);
2651 if (status < 0) { 2835 if (status < 0) {
@@ -2680,6 +2864,7 @@ bail:
2680int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2864int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2681{ 2865{
2682 int status; 2866 int status;
2867 u64 group_blkno = 0;
2683 u16 suballoc_bit = 0, suballoc_slot = 0; 2868 u16 suballoc_bit = 0, suballoc_slot = 0;
2684 struct inode *inode_alloc_inode; 2869 struct inode *inode_alloc_inode;
2685 struct buffer_head *alloc_bh = NULL; 2870 struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2872,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2687 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2872 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2688 2873
2689 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2874 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2690 &suballoc_bit); 2875 &group_blkno, &suballoc_bit);
2691 if (status < 0) { 2876 if (status < 0) {
2692 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2877 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2693 goto bail; 2878 goto bail;
@@ -2715,7 +2900,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2715 } 2900 }
2716 2901
2717 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2902 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2718 blkno, suballoc_bit, res); 2903 group_blkno, blkno, suballoc_bit, res);
2719 if (status < 0) 2904 if (status < 0)
2720 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2905 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2721 2906
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d9..b8afabfeede4 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
56 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
57 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58 58
59 int ac_find_loc_only; /* hack for reflink operation ordering */
60 struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
61
59 struct ocfs2_alloc_reservation *ac_resv; 62 struct ocfs2_alloc_reservation *ac_resv;
60}; 63};
61 64
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
197 struct ocfs2_alloc_context **meta_ac); 200 struct ocfs2_alloc_context **meta_ac);
198 201
199int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); 202int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
203
204
205
206/*
207 * The following two interfaces are for ocfs2_create_inode_in_orphan().
208 */
209int ocfs2_find_new_inode_loc(struct inode *dir,
210 struct buffer_head *parent_fe_bh,
211 struct ocfs2_alloc_context *ac,
212 u64 *fe_blkno);
213
214int ocfs2_claim_new_inode_at_loc(handle_t *handle,
215 struct inode *dir,
216 struct ocfs2_alloc_context *ac,
217 u64 *suballoc_loc,
218 u16 *suballoc_bit,
219 u64 di_blkno);
220
200#endif /* _CHAINALLOC_H_ */ 221#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..56f0cb395820 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
162 Opt_nointr, 162 Opt_nointr,
163 Opt_hb_none, 163 Opt_hb_none,
164 Opt_hb_local, 164 Opt_hb_local,
165 Opt_hb_global,
165 Opt_data_ordered, 166 Opt_data_ordered,
166 Opt_data_writeback, 167 Opt_data_writeback,
167 Opt_atime_quantum, 168 Opt_atime_quantum,
@@ -177,6 +178,8 @@ enum {
177 Opt_noacl, 178 Opt_noacl,
178 Opt_usrquota, 179 Opt_usrquota,
179 Opt_grpquota, 180 Opt_grpquota,
181 Opt_coherency_buffered,
182 Opt_coherency_full,
180 Opt_resv_level, 183 Opt_resv_level,
181 Opt_dir_resv_level, 184 Opt_dir_resv_level,
182 Opt_err, 185 Opt_err,
@@ -190,6 +193,7 @@ static const match_table_t tokens = {
190 {Opt_nointr, "nointr"}, 193 {Opt_nointr, "nointr"},
191 {Opt_hb_none, OCFS2_HB_NONE}, 194 {Opt_hb_none, OCFS2_HB_NONE},
192 {Opt_hb_local, OCFS2_HB_LOCAL}, 195 {Opt_hb_local, OCFS2_HB_LOCAL},
196 {Opt_hb_global, OCFS2_HB_GLOBAL},
193 {Opt_data_ordered, "data=ordered"}, 197 {Opt_data_ordered, "data=ordered"},
194 {Opt_data_writeback, "data=writeback"}, 198 {Opt_data_writeback, "data=writeback"},
195 {Opt_atime_quantum, "atime_quantum=%u"}, 199 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +209,8 @@ static const match_table_t tokens = {
205 {Opt_noacl, "noacl"}, 209 {Opt_noacl, "noacl"},
206 {Opt_usrquota, "usrquota"}, 210 {Opt_usrquota, "usrquota"},
207 {Opt_grpquota, "grpquota"}, 211 {Opt_grpquota, "grpquota"},
212 {Opt_coherency_buffered, "coherency=buffered"},
213 {Opt_coherency_full, "coherency=full"},
208 {Opt_resv_level, "resv_level=%u"}, 214 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"}, 215 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 {Opt_err, NULL} 216 {Opt_err, NULL}
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
514 520
515 mlog_entry_void(); 521 mlog_entry_void();
516 522
517 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 523 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
518 inode = osb->system_inodes[i]; 524 inode = osb->global_system_inodes[i];
519 if (inode) { 525 if (inode) {
520 iput(inode); 526 iput(inode);
521 osb->system_inodes[i] = NULL; 527 osb->global_system_inodes[i] = NULL;
522 } 528 }
523 } 529 }
524 530
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
534 osb->root_inode = NULL; 540 osb->root_inode = NULL;
535 } 541 }
536 542
543 if (!osb->local_system_inodes)
544 goto out;
545
546 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
547 if (osb->local_system_inodes[i]) {
548 iput(osb->local_system_inodes[i]);
549 osb->local_system_inodes[i] = NULL;
550 }
551 }
552
553 kfree(osb->local_system_inodes);
554 osb->local_system_inodes = NULL;
555
556out:
537 mlog_exit(0); 557 mlog_exit(0);
538} 558}
539 559
@@ -608,8 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
608 int ret = 0; 628 int ret = 0;
609 struct mount_options parsed_options; 629 struct mount_options parsed_options;
610 struct ocfs2_super *osb = OCFS2_SB(sb); 630 struct ocfs2_super *osb = OCFS2_SB(sb);
611 631 u32 tmp;
612 lock_kernel();
613 632
614 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 633 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
615 !ocfs2_check_set_options(sb, &parsed_options)) { 634 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -617,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
617 goto out; 636 goto out;
618 } 637 }
619 638
620 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 639 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
621 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 640 OCFS2_MOUNT_HB_NONE;
641 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
622 ret = -EINVAL; 642 ret = -EINVAL;
623 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 643 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
624 goto out; 644 goto out;
@@ -717,7 +737,6 @@ unlock_osb:
717 MS_POSIXACL : 0); 737 MS_POSIXACL : 0);
718 } 738 }
719out: 739out:
720 unlock_kernel();
721 return ret; 740 return ret;
722} 741}
723 742
@@ -809,23 +828,29 @@ bail:
809 828
810static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 829static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
811{ 830{
812 if (ocfs2_mount_local(osb)) { 831 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
813 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 832
833 if (osb->s_mount_opt & hb_enabled) {
834 if (ocfs2_mount_local(osb)) {
814 mlog(ML_ERROR, "Cannot heartbeat on a locally " 835 mlog(ML_ERROR, "Cannot heartbeat on a locally "
815 "mounted device.\n"); 836 "mounted device.\n");
816 return -EINVAL; 837 return -EINVAL;
817 } 838 }
818 } 839 if (ocfs2_userspace_stack(osb)) {
819
820 if (ocfs2_userspace_stack(osb)) {
821 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
822 mlog(ML_ERROR, "Userspace stack expected, but " 840 mlog(ML_ERROR, "Userspace stack expected, but "
823 "o2cb heartbeat arguments passed to mount\n"); 841 "o2cb heartbeat arguments passed to mount\n");
824 return -EINVAL; 842 return -EINVAL;
825 } 843 }
844 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
845 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
846 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
847 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
848 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
849 return -EINVAL;
850 }
826 } 851 }
827 852
828 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 853 if (!(osb->s_mount_opt & hb_enabled)) {
829 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 854 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
830 !ocfs2_userspace_stack(osb)) { 855 !ocfs2_userspace_stack(osb)) {
831 mlog(ML_ERROR, "Heartbeat has to be started to mount " 856 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1291,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1291{ 1316{
1292 int status; 1317 int status;
1293 char *p; 1318 char *p;
1319 u32 tmp;
1294 1320
1295 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1321 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1296 options ? options : "(none)"); 1322 options ? options : "(none)");
@@ -1322,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1322 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1348 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1323 break; 1349 break;
1324 case Opt_hb_none: 1350 case Opt_hb_none:
1325 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1351 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1352 break;
1353 case Opt_hb_global:
1354 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1326 break; 1355 break;
1327 case Opt_barrier: 1356 case Opt_barrier:
1328 if (match_int(&args[0], &option)) { 1357 if (match_int(&args[0], &option)) {
@@ -1438,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1438 case Opt_grpquota: 1467 case Opt_grpquota:
1439 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1468 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1440 break; 1469 break;
1470 case Opt_coherency_buffered:
1471 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1472 break;
1473 case Opt_coherency_full:
1474 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1475 break;
1441 case Opt_acl: 1476 case Opt_acl:
1442 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1477 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1443 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1478 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1477,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1477 } 1512 }
1478 } 1513 }
1479 1514
1515 /* Ensure only one heartbeat mode */
1516 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1517 OCFS2_MOUNT_HB_NONE);
1518 if (hweight32(tmp) != 1) {
1519 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1520 status = 0;
1521 goto bail;
1522 }
1523
1480 status = 1; 1524 status = 1;
1481 1525
1482bail: 1526bail:
@@ -1490,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1490 unsigned long opts = osb->s_mount_opt; 1534 unsigned long opts = osb->s_mount_opt;
1491 unsigned int local_alloc_megs; 1535 unsigned int local_alloc_megs;
1492 1536
1493 if (opts & OCFS2_MOUNT_HB_LOCAL) 1537 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1494 seq_printf(s, ",_netdev,heartbeat=local"); 1538 seq_printf(s, ",_netdev");
1495 else 1539 if (opts & OCFS2_MOUNT_HB_LOCAL)
1496 seq_printf(s, ",heartbeat=none"); 1540 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1541 else
1542 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1543 } else
1544 seq_printf(s, ",%s", OCFS2_HB_NONE);
1497 1545
1498 if (opts & OCFS2_MOUNT_NOINTR) 1546 if (opts & OCFS2_MOUNT_NOINTR)
1499 seq_printf(s, ",nointr"); 1547 seq_printf(s, ",nointr");
@@ -1536,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1536 if (opts & OCFS2_MOUNT_GRPQUOTA) 1584 if (opts & OCFS2_MOUNT_GRPQUOTA)
1537 seq_printf(s, ",grpquota"); 1585 seq_printf(s, ",grpquota");
1538 1586
1587 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1588 seq_printf(s, ",coherency=buffered");
1589 else
1590 seq_printf(s, ",coherency=full");
1591
1539 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1592 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1540 seq_printf(s, ",nouser_xattr"); 1593 seq_printf(s, ",nouser_xattr");
1541 else 1594 else
@@ -1640,13 +1693,9 @@ static void ocfs2_put_super(struct super_block *sb)
1640{ 1693{
1641 mlog_entry("(0x%p)\n", sb); 1694 mlog_entry("(0x%p)\n", sb);
1642 1695
1643 lock_kernel();
1644
1645 ocfs2_sync_blockdev(sb); 1696 ocfs2_sync_blockdev(sb);
1646 ocfs2_dismount_volume(sb, 0); 1697 ocfs2_dismount_volume(sb, 0);
1647 1698
1648 unlock_kernel();
1649
1650 mlog_exit_void(); 1699 mlog_exit_void();
1651} 1700}
1652 1701
@@ -1990,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1990 return 0; 2039 return 0;
1991} 2040}
1992 2041
2042/* Make sure entire volume is addressable by our journal. Requires
2043 osb_clusters_at_boot to be valid and for the journal to have been
2044 initialized by ocfs2_journal_init(). */
2045static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2046{
2047 int status = 0;
2048 u64 max_block =
2049 ocfs2_clusters_to_blocks(osb->sb,
2050 osb->osb_clusters_at_boot) - 1;
2051
2052 /* 32-bit block number is always OK. */
2053 if (max_block <= (u32)~0ULL)
2054 goto out;
2055
2056 /* Volume is "huge", so see if our journal is new enough to
2057 support it. */
2058 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2059 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2060 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2061 JBD2_FEATURE_INCOMPAT_64BIT))) {
2062 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2063 "Enable the 'block64' journal option with tunefs.ocfs2");
2064 status = -EFBIG;
2065 goto out;
2066 }
2067
2068 out:
2069 return status;
2070}
2071
1993static int ocfs2_initialize_super(struct super_block *sb, 2072static int ocfs2_initialize_super(struct super_block *sb,
1994 struct buffer_head *bh, 2073 struct buffer_head *bh,
1995 int sector_size, 2074 int sector_size,
@@ -2002,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2002 struct ocfs2_journal *journal; 2081 struct ocfs2_journal *journal;
2003 __le32 uuid_net_key; 2082 __le32 uuid_net_key;
2004 struct ocfs2_super *osb; 2083 struct ocfs2_super *osb;
2084 u64 total_blocks;
2005 2085
2006 mlog_entry_void(); 2086 mlog_entry_void();
2007 2087
@@ -2060,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2060 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2140 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2061 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2141 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2062 2142
2143 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2144 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2145 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2146 osb->max_slots);
2147 status = -EINVAL;
2148 goto bail;
2149 }
2150 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2151
2063 ocfs2_orphan_scan_init(osb); 2152 ocfs2_orphan_scan_init(osb);
2064 2153
2065 status = ocfs2_recovery_init(osb); 2154 status = ocfs2_recovery_init(osb);
@@ -2098,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2098 goto bail; 2187 goto bail;
2099 } 2188 }
2100 2189
2101 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2102 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2103 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2104 osb->max_slots);
2105 status = -EINVAL;
2106 goto bail;
2107 }
2108 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2109
2110 osb->slot_recovery_generations = 2190 osb->slot_recovery_generations =
2111 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2191 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2112 GFP_KERNEL); 2192 GFP_KERNEL);
@@ -2149,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2149 goto bail; 2229 goto bail;
2150 } 2230 }
2151 2231
2152 if (ocfs2_userspace_stack(osb)) { 2232 if (ocfs2_clusterinfo_valid(osb)) {
2233 osb->osb_stackflags =
2234 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2153 memcpy(osb->osb_cluster_stack, 2235 memcpy(osb->osb_cluster_stack,
2154 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2236 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2155 OCFS2_STACK_LABEL_LEN); 2237 OCFS2_STACK_LABEL_LEN);
@@ -2214,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2214 goto bail; 2296 goto bail;
2215 } 2297 }
2216 2298
2217 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2299 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2218 > (u32)~0UL) { 2300 le32_to_cpu(di->i_clusters));
2219 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2301
2220 "what jbd can address in 32 bits.\n"); 2302 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2221 status = -EINVAL; 2303 total_blocks);
2304 if (status) {
2305 mlog(ML_ERROR, "Volume too large "
2306 "to mount safely on this system");
2307 status = -EFBIG;
2222 goto bail; 2308 goto bail;
2223 } 2309 }
2224 2310
@@ -2380,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2380 goto finally; 2466 goto finally;
2381 } 2467 }
2382 2468
2469 /* Now that journal has been initialized, check to make sure
2470 entire volume is addressable. */
2471 status = ocfs2_journal_addressable(osb);
2472 if (status)
2473 goto finally;
2474
2383 /* If the journal was unmounted cleanly then we don't want to 2475 /* If the journal was unmounted cleanly then we don't want to
2384 * recover anything. Otherwise, journal_load will do that 2476 * recover anything. Otherwise, journal_load will do that
2385 * dirty work for us :) */ 2477 * dirty work for us :) */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
128 } 128 }
129 129
130 /* Fast symlinks can't be large */ 130 /* Fast symlinks can't be large */
131 len = strlen(target); 131 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
132 link = kzalloc(len + 1, GFP_NOFS); 132 link = kzalloc(len + 1, GFP_NOFS);
133 if (!link) { 133 if (!link) {
134 status = -ENOMEM; 134 status = -ENOMEM;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
80 * return NULL here so that ocfs2_get_sytem_file_inodes
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f61801..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
1286 xis.inode_bh = xbs.inode_bh = di_bh; 1286 xis.inode_bh = xbs.inode_bh = di_bh;
1287 di = (struct ocfs2_dinode *)di_bh->b_data; 1287 di = (struct ocfs2_dinode *)di_bh->b_data;
1288 1288
1289 down_read(&oi->ip_xattr_sem);
1290 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, 1289 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
1291 buffer_size, &xis); 1290 buffer_size, &xis);
1292 if (ret == -ENODATA && di->i_xattr_loc) 1291 if (ret == -ENODATA && di->i_xattr_loc)
1293 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1292 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
1294 buffer_size, &xbs); 1293 buffer_size, &xbs);
1295 up_read(&oi->ip_xattr_sem);
1296 1294
1297 return ret; 1295 return ret;
1298} 1296}
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
1316 mlog_errno(ret); 1314 mlog_errno(ret);
1317 return ret; 1315 return ret;
1318 } 1316 }
1317 down_read(&OCFS2_I(inode)->ip_xattr_sem);
1319 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, 1318 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1320 name, buffer, buffer_size); 1319 name, buffer, buffer_size);
1320 up_read(&OCFS2_I(inode)->ip_xattr_sem);
1321 1321
1322 ocfs2_inode_unlock(inode, 0); 1322 ocfs2_inode_unlock(inode, 0);
1323 1323
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);